diff mbox series

[3/4] net/virtio: add failover support

Message ID 20190517125820.2885-4-jfreimann@redhat.com
State New
Headers show
Series add failover feature for assigned network devices | expand

Commit Message

Jens Freimann May 17, 2019, 12:58 p.m. UTC
This patch adds support to handle failover device pairs of a virtio-net
device and a vfio-pci device, where the virtio-net acts as the standby
device and the vfio-pci device as the primary.

The general idea is that we have a pair of devices, a vfio-pci and a
emulated (virtio-net) device. Before migration the vfio device is
unplugged and data flows to the emulated device, on the target side
another vfio-pci device is plugged in to take over the data-path. In the
guest the net_failover module will pair net devices with the same MAC
address.

To achieve this we need:

1. Provide a callback function for the should_be_hidden DeviceListener.
   It is called when the primary device is plugged in. Evaluate the QOpt
   passed in to check if it is the matching primary device. It returns
   two values:
     - one to signal if the device to be added is the matching
       primary device
     - another one to signal to qdev if it should actually
       continue with adding the device or skip it.

   In the latter case it stores the device options in the VirtioNet
   struct and the device is added once the VIRTIO_NET_F_STANDBY feature is
   negotiated during virtio feature negotiation.

2. Register a callback for migration status notifier. When called it
   will unplug its primary device before the migration happens.

Signed-off-by: Jens Freimann <jfreimann@redhat.com>
---
 hw/net/virtio-net.c            | 117 +++++++++++++++++++++++++++++++++
 include/hw/virtio/virtio-net.h |  12 ++++
 2 files changed, 129 insertions(+)

Comments

Dr. David Alan Gilbert May 21, 2019, 9:45 a.m. UTC | #1
* Jens Freimann (jfreimann@redhat.com) wrote:
> This patch adds support to handle failover device pairs of a virtio-net
> device and a vfio-pci device, where the virtio-net acts as the standby
> device and the vfio-pci device as the primary.
> 
> The general idea is that we have a pair of devices, a vfio-pci and a
> emulated (virtio-net) device. Before migration the vfio device is
> unplugged and data flows to the emulated device, on the target side
> another vfio-pci device is plugged in to take over the data-path. In the
> guest the net_failover module will pair net devices with the same MAC
> address.
> 
> To achieve this we need:
> 
> 1. Provide a callback function for the should_be_hidden DeviceListener.
>    It is called when the primary device is plugged in. Evaluate the QOpt
>    passed in to check if it is the matching primary device. It returns
>    two values:
>      - one to signal if the device to be added is the matching
>        primary device
>      - another one to signal to qdev if it should actually
>        continue with adding the device or skip it.
> 
>    In the latter case it stores the device options in the VirtioNet
>    struct and the device is added once the VIRTIO_NET_F_STANDBY feature is
>    negotiated during virtio feature negotiation.
> 
> 2. Register a callback for migration status notifier. When called it
>    will unplug its primary device before the migration happens.
> 
> Signed-off-by: Jens Freimann <jfreimann@redhat.com>
> ---
>  hw/net/virtio-net.c            | 117 +++++++++++++++++++++++++++++++++
>  include/hw/virtio/virtio-net.h |  12 ++++
>  2 files changed, 129 insertions(+)
> 
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index ffe0872fff..120eccbb98 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -12,6 +12,7 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "qemu/atomic.h"
>  #include "qemu/iov.h"
>  #include "hw/virtio/virtio.h"
>  #include "net/net.h"
> @@ -19,6 +20,10 @@
>  #include "net/tap.h"
>  #include "qemu/error-report.h"
>  #include "qemu/timer.h"
> +#include "qemu/option.h"
> +#include "qemu/option_int.h"
> +#include "qemu/config-file.h"
> +#include "qapi/qmp/qdict.h"
>  #include "hw/virtio/virtio-net.h"
>  #include "net/vhost_net.h"
>  #include "net/announce.h"
> @@ -29,6 +34,8 @@
>  #include "migration/misc.h"
>  #include "standard-headers/linux/ethtool.h"
>  #include "trace.h"
> +#include "monitor/qdev.h"
> +#include "hw/pci/pci.h"
>  
>  #define VIRTIO_NET_VM_VERSION    11
>  
> @@ -364,6 +371,9 @@ static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
>      }
>  }
>  
> +
> +static void virtio_net_primary_plug_timer(void *opaque);
> +
>  static void virtio_net_set_link_status(NetClientState *nc)
>  {
>      VirtIONet *n = qemu_get_nic_opaque(nc);
> @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
>      } else {
>          memset(n->vlans, 0xff, MAX_VLAN >> 3);
>      }
> +
> +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> +        atomic_set(&n->primary_should_be_hidden, false);
> +        if (n->primary_device_timer)
> +            timer_mod(n->primary_device_timer,
> +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> +                4000);
> +    }

What's this magic timer constant and why?

>  }
>  
>  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
>      n->netclient_type = g_strdup(type);
>  }
>  
> +static void virtio_net_primary_plug_timer(void *opaque)
> +{
> +    VirtIONet *n = opaque;
> +    Error *err = NULL;
> +
> +    if (n->primary_device_dict)
> +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> +            n->primary_device_dict, &err);
> +    if (n->primary_device_opts) {
> +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> +        return;
> +    }
> +    if (!n->primary_device_dict && err) {
> +        if (n->primary_device_timer) {
> +            timer_mod(n->primary_device_timer,
> +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> +                100);

same here.


> +        }
> +    }
> +}
> +
> +static void virtio_net_handle_migration_primary(VirtIONet *n,
> +                                                MigrationState *s)
> +{
> +    Error *err = NULL;
> +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> +
> +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> +            n->primary_device_id);
> +    if (!n->primary_dev) {
> +        error_setg(&err, "virtio_net: couldn't find primary device");

There's something broken with the error handling in this function - the
'err' never goes anywhere - I don't think it ever gets printed or
reported or stops the migration.

> +    }
> +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> +        qdev_unplug(n->primary_dev, &err);

Not knowing unplug well; can you just explain - is that device hard
unplugged and it's gone by the time this function returns or is it still
hanging around for some indeterminate time?

> +        if (!err) {
> +            atomic_set(&n->primary_should_be_hidden, true);
> +            n->primary_dev = NULL;
> +        }
> +    } else if (migration_has_failed(s)) {
> +        if (should_be_hidden && !n->primary_dev) {
> +            /* We already unplugged the device let's plugged it back */
> +            n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> +        }
> +    }
> +}
> +
> +static void migration_state_notifier(Notifier *notifier, void *data)
> +{
> +    MigrationState *s = data;
> +    VirtIONet *n = container_of(notifier, VirtIONet, migration_state);
> +    virtio_net_handle_migration_primary(n, s);
> +}
> +
> +static void virtio_net_primary_should_be_hidden(DeviceListener *listener,
> +            QemuOpts *device_opts, bool *match_found, bool *res)
> +{
> +    VirtIONet *n = container_of(listener, VirtIONet, primary_listener);
> +
> +    if (device_opts) {
> +        n->primary_device_dict = qemu_opts_to_qdict(device_opts,
> +                n->primary_device_dict);
> +    }
> +    g_free(n->standby_id);
> +    n->standby_id = g_strdup(qdict_get_try_str(n->primary_device_dict,
> +                             "standby"));
> +    if (n->standby_id) {
> +        *match_found = true;
> +    }
> +    /* primary_should_be_hidden is set during feature negotiation */
> +    if (atomic_read(&n->primary_should_be_hidden) && *match_found) {
> +        *res = true;
> +    } else if (*match_found)  {
> +        n->primary_device_dict = qemu_opts_to_qdict(device_opts,
> +                n->primary_device_dict);
> +        *res = false;
> +    }
> +    g_free(n->primary_device_id);
> +    n->primary_device_id = g_strdup(device_opts->id);
> +}
> +
>  static void virtio_net_device_realize(DeviceState *dev, Error **errp)
>  {
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> @@ -2656,6 +2755,18 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
>          n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
>      }
>  
> +    if (n->failover) {
> +        n->primary_listener.should_be_hidden =
> +            virtio_net_primary_should_be_hidden;
> +        atomic_set(&n->primary_should_be_hidden, true);
> +        device_listener_register(&n->primary_listener);
> +        n->migration_state.notify = migration_state_notifier;
> +        add_migration_state_change_notifier(&n->migration_state);
> +        n->host_features |= (1ULL << VIRTIO_NET_F_STANDBY);
> +        n->primary_device_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
> +                                     virtio_net_primary_plug_timer, n);
> +    }
> +
>      virtio_net_set_config_size(n, n->host_features);
>      virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size);
>  
> @@ -2778,6 +2889,11 @@ static void virtio_net_device_unrealize(DeviceState *dev, Error **errp)
>      g_free(n->mac_table.macs);
>      g_free(n->vlans);
>  
> +    g_free(n->primary_device_id);
> +    g_free(n->standby_id);
> +    qobject_unref(n->primary_device_dict);
> +    n->primary_device_dict = NULL;
> +
>      max_queues = n->multiqueue ? n->max_queues : 1;
>      for (i = 0; i < max_queues; i++) {
>          virtio_net_del_queue(n, i);
> @@ -2885,6 +3001,7 @@ static Property virtio_net_properties[] = {
>                       true),
>      DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
>      DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
> +    DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
> index b96f0c643f..c2bb6ada44 100644
> --- a/include/hw/virtio/virtio-net.h
> +++ b/include/hw/virtio/virtio-net.h
> @@ -18,6 +18,7 @@
>  #include "standard-headers/linux/virtio_net.h"
>  #include "hw/virtio/virtio.h"
>  #include "net/announce.h"
> +#include "qemu/option_int.h"
>  
>  #define TYPE_VIRTIO_NET "virtio-net-device"
>  #define VIRTIO_NET(obj) \
> @@ -43,6 +44,7 @@ typedef struct virtio_net_conf
>      int32_t speed;
>      char *duplex_str;
>      uint8_t duplex;
> +    char *primary_id_str;
>  } virtio_net_conf;
>  
>  /* Coalesced packets type & status */
> @@ -185,6 +187,16 @@ struct VirtIONet {
>      AnnounceTimer announce_timer;
>      bool needs_vnet_hdr_swap;
>      bool mtu_bypass_backend;
> +    QemuOpts *primary_device_opts;
> +    QDict *primary_device_dict;
> +    DeviceState *primary_dev;
> +    char *primary_device_id;
> +    char *standby_id;
> +    bool primary_should_be_hidden;
> +    bool failover;
> +    DeviceListener primary_listener;
> +    QEMUTimer *primary_device_timer;
> +    Notifier migration_state;
>  };
>  
>  void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> -- 
> 2.21.0
> 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Jens Freimann May 30, 2019, 2:56 p.m. UTC | #2
Hi David,

sorry for the delayed reply.

On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
>On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
>> * Jens Freimann (jfreimann@redhat.com) wrote:
>> > +static void virtio_net_primary_plug_timer(void *opaque);
>> > +
>> >  static void virtio_net_set_link_status(NetClientState *nc)
>> >  {
>> >      VirtIONet *n = qemu_get_nic_opaque(nc);
>> > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
>> >      } else {
>> >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
>> >      }
>> > +
>> > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
>> > +        atomic_set(&n->primary_should_be_hidden, false);
>> > +        if (n->primary_device_timer)
>> > +            timer_mod(n->primary_device_timer,
>> > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
>> > +                4000);
>> > +    }
>>
>> What's this magic timer constant and why?

To be honest it's a leftover from previous versions (before I took
over) of the patches and I'm not sure why the timer is there.
I removed it and so far see no reason to keep it.  

>>
>> >  }
>> >
>> >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
>> > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
>> >      n->netclient_type = g_strdup(type);
>> >  }
>> >
>> > +static void virtio_net_primary_plug_timer(void *opaque)
>> > +{
>> > +    VirtIONet *n = opaque;
>> > +    Error *err = NULL;
>> > +
>> > +    if (n->primary_device_dict)
>> > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
>> > +            n->primary_device_dict, &err);
>> > +    if (n->primary_device_opts) {
>> > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
>> > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
>> > +        return;
>> > +    }
>> > +    if (!n->primary_device_dict && err) {
>> > +        if (n->primary_device_timer) {
>> > +            timer_mod(n->primary_device_timer,
>> > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
>> > +                100);
>>
>> same here.

see above

>>
>>
>> > +        }
>> > +    }
>> > +}
>> > +
>> > +static void virtio_net_handle_migration_primary(VirtIONet *n,
>> > +                                                MigrationState *s)
>> > +{
>> > +    Error *err = NULL;
>> > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
>> > +
>> > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
>> > +            n->primary_device_id);
>> > +    if (!n->primary_dev) {
>> > +        error_setg(&err, "virtio_net: couldn't find primary device");
>>
>> There's something broken with the error handling in this function - the
>> 'err' never goes anywhere - I don't think it ever gets printed or
>> reported or stops the migration.

yes, I'll fix it.

>> > +    }
>> > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
>> > +        qdev_unplug(n->primary_dev, &err);
>>
>> Not knowing unplug well; can you just explain - is that device hard
>> unplugged and it's gone by the time this function returns or is it still
>> hanging around for some indeterminate time?

QEMU will trigger an unplug request via the PCIe attention button, in which case
there could be a delay by the guest operating system. We could give it some
amount of time and, if nothing happens, try surprise removal or handle the
error otherwise.


regards,
Jens
Michael S. Tsirkin May 30, 2019, 5:46 p.m. UTC | #3
On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> Hi David,
> 
> sorry for the  delayed reply.
> 
> On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > +
> > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > >  {
> > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > >      } else {
> > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > >      }
> > > > +
> > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > +        if (n->primary_device_timer)
> > > > +            timer_mod(n->primary_device_timer,
> > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > +                4000);
> > > > +    }
> > > 
> > > What's this magic timer constant and why?
> 
> To be honest it's a leftover from previous versions (before I took
> over) of the patches and I'm not sure why the timer is there.
> I removed it and so far see no reason to keep it.
> 
> > > 
> > > >  }
> > > >
> > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > >      n->netclient_type = g_strdup(type);
> > > >  }
> > > >
> > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > +{
> > > > +    VirtIONet *n = opaque;
> > > > +    Error *err = NULL;
> > > > +
> > > > +    if (n->primary_device_dict)
> > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > +            n->primary_device_dict, &err);
> > > > +    if (n->primary_device_opts) {
> > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > +        return;
> > > > +    }
> > > > +    if (!n->primary_device_dict && err) {
> > > > +        if (n->primary_device_timer) {
> > > > +            timer_mod(n->primary_device_timer,
> > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > +                100);
> > > 
> > > same here.
> 
> see above
> 
> > > 
> > > 
> > > > +        }
> > > > +    }
> > > > +}
> > > > +
> > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > +                                                MigrationState *s)
> > > > +{
> > > > +    Error *err = NULL;
> > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > +
> > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > +            n->primary_device_id);
> > > > +    if (!n->primary_dev) {
> > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > 
> > > There's something broken with the error handling in this function - the
> > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > reported or stops the migration.
> 
> yes, I'll fix it.
> 
> > > > +    }
> > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > +        qdev_unplug(n->primary_dev, &err);
> > > 
> > > Not knowing unplug well; can you just explain - is that device hard
> > > unplugged and it's gone by the time this function returns or is it still
> > > hanging around for some indeterminate time?
> 
> Qemu will trigger an unplug request via pcie attention button in which case
> there could be a delay by the guest operating system. We could give it some
> amount of time and if nothing happens try surpise removal or handle the
> error otherwise.
> 
> 
> regards,
> Jens

That's a subject for another day. Let's get the basic thing
working.
Dr. David Alan Gilbert May 30, 2019, 6 p.m. UTC | #4
* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > Hi David,
> > 
> > sorry for the  delayed reply.
> > 
> > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > > +
> > > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > > >  {
> > > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > > >      } else {
> > > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > > >      }
> > > > > +
> > > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > > +        if (n->primary_device_timer)
> > > > > +            timer_mod(n->primary_device_timer,
> > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > +                4000);
> > > > > +    }
> > > > 
> > > > What's this magic timer constant and why?
> > 
> > To be honest it's a leftover from previous versions (before I took
> > over) of the patches and I'm not sure why the timer is there.
> > I removed it and so far see no reason to keep it.
> > 
> > > > 
> > > > >  }
> > > > >
> > > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > > >      n->netclient_type = g_strdup(type);
> > > > >  }
> > > > >
> > > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > > +{
> > > > > +    VirtIONet *n = opaque;
> > > > > +    Error *err = NULL;
> > > > > +
> > > > > +    if (n->primary_device_dict)
> > > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > > +            n->primary_device_dict, &err);
> > > > > +    if (n->primary_device_opts) {
> > > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > > +        return;
> > > > > +    }
> > > > > +    if (!n->primary_device_dict && err) {
> > > > > +        if (n->primary_device_timer) {
> > > > > +            timer_mod(n->primary_device_timer,
> > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > +                100);
> > > > 
> > > > same here.
> > 
> > see above
> > 
> > > > 
> > > > 
> > > > > +        }
> > > > > +    }
> > > > > +}
> > > > > +
> > > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > > +                                                MigrationState *s)
> > > > > +{
> > > > > +    Error *err = NULL;
> > > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > > +
> > > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > > +            n->primary_device_id);
> > > > > +    if (!n->primary_dev) {
> > > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > > 
> > > > There's something broken with the error handling in this function - the
> > > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > > reported or stops the migration.
> > 
> > yes, I'll fix it.
> > 
> > > > > +    }
> > > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > > +        qdev_unplug(n->primary_dev, &err);
> > > > 
> > > > Not knowing unplug well; can you just explain - is that device hard
> > > > unplugged and it's gone by the time this function returns or is it still
> > > > hanging around for some indeterminate time?
> > 
> > Qemu will trigger an unplug request via pcie attention button in which case
> > there could be a delay by the guest operating system. We could give it some
> > amount of time and if nothing happens try surpise removal or handle the
> > error otherwise.
> > 
> > 
> > regards,
> > Jens
> 
> That's a subject for another day. Let's get the basic thing
> working.

Well no, we need to know this thing isn't going to hang in the migration
setup phase, or if it does how we recover.  This patch series is very
odd precisely because it's trying to do the unplug itself in the
migration phase rather than let the management layer do it - so unless
it's nailed down how to make sure that's really really bullet proof
then we've got to go back and ask the question about whether we should
really fix it so it can be done by the management layer.

Dave

> -- 
> MST
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Michael S. Tsirkin May 30, 2019, 6:09 p.m. UTC | #5
On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> * Michael S. Tsirkin (mst@redhat.com) wrote:
> > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > Hi David,
> > > 
> > > sorry for the  delayed reply.
> > > 
> > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > > > +
> > > > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > > > >  {
> > > > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > > > >      } else {
> > > > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > > > >      }
> > > > > > +
> > > > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > > > +        if (n->primary_device_timer)
> > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > +                4000);
> > > > > > +    }
> > > > > 
> > > > > What's this magic timer constant and why?
> > > 
> > > To be honest it's a leftover from previous versions (before I took
> > > over) of the patches and I'm not sure why the timer is there.
> > > I removed it and so far see no reason to keep it.
> > > 
> > > > > 
> > > > > >  }
> > > > > >
> > > > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > > > >      n->netclient_type = g_strdup(type);
> > > > > >  }
> > > > > >
> > > > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > > > +{
> > > > > > +    VirtIONet *n = opaque;
> > > > > > +    Error *err = NULL;
> > > > > > +
> > > > > > +    if (n->primary_device_dict)
> > > > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > > > +            n->primary_device_dict, &err);
> > > > > > +    if (n->primary_device_opts) {
> > > > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > > > +        return;
> > > > > > +    }
> > > > > > +    if (!n->primary_device_dict && err) {
> > > > > > +        if (n->primary_device_timer) {
> > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > +                100);
> > > > > 
> > > > > same here.
> > > 
> > > see above
> > > 
> > > > > 
> > > > > 
> > > > > > +        }
> > > > > > +    }
> > > > > > +}
> > > > > > +
> > > > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > > > +                                                MigrationState *s)
> > > > > > +{
> > > > > > +    Error *err = NULL;
> > > > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > > > +
> > > > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > > > +            n->primary_device_id);
> > > > > > +    if (!n->primary_dev) {
> > > > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > > > 
> > > > > There's something broken with the error handling in this function - the
> > > > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > > > reported or stops the migration.
> > > 
> > > yes, I'll fix it.
> > > 
> > > > > > +    }
> > > > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > > > +        qdev_unplug(n->primary_dev, &err);
> > > > > 
> > > > > Not knowing unplug well; can you just explain - is that device hard
> > > > > unplugged and it's gone by the time this function returns or is it still
> > > > > hanging around for some indeterminate time?
> > > 
> > > Qemu will trigger an unplug request via pcie attention button in which case
> > > there could be a delay by the guest operating system. We could give it some
> > > amount of time and if nothing happens try surpise removal or handle the
> > > error otherwise.
> > > 
> > > 
> > > regards,
> > > Jens
> > 
> > That's a subject for another day. Let's get the basic thing
> > working.
> 
> Well no, we need to know this thing isn't going to hang in the migration
> setup phase, or if it does how we recover.


This thing is *supposed* to be stuck in migration startup phase
if guest is malicious.

If migration does not progress management needs
a way to detect this and cancel.

Some more documentation about how this is supposed to happen
would be helpful.

>  This patch series is very
> odd precisely because it's trying to do the unplug itself in the
> migration phase rather than let the management layer do it - so unless
> it's nailed down how to make sure that's really really bullet proof
> then we've got to go back and ask the question about whether we should
> really fix it so it can be done by the management layer.
> 
> Dave

management already said they can't because files get closed and
resources freed on unplug and so they might not be able to re-add device
on migration failure. We do it in migration because that is
where failures can happen and we can recover.

> > -- 
> > MST
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Eduardo Habkost May 30, 2019, 6:17 p.m. UTC | #6
On Thu, Dec 06, 2018 at 10:01:46AM +0000, Daniel P. Berrangé wrote:
> Users absolutely *do* care why migration is not finishing. A migration that
> does not finish is a major problem for mgmt apps in many case of the use
> cases for migration. Especially important when evacuating VMs from a host
> in order to do a software upgrade or replace faulty hardware. As mentioned
> previously, they will also often serialize migrations to prevent eh network
> being overutilized, so a migration that runs indefinitely will stall
> evacuation of additional VMs too.  Predictable execution of migration and
> clear error reporting/handling are critical features. IMHO this is the key
> reason VFIO unplug/plug needs to be done explicitly by the mgmt app, so it
> can be in control over when each part of the process takes place.

On Fri, Apr 05, 2019 at 09:56:29AM +0100, Dr. David Alan Gilbert wrote:
> Why not just let this happen at the libvirt level; then you do the
> hotunplug etc before you actually tell qemu anything about starting a
> migration?

On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> Well no, we need to know this thing isn't going to hang in the migration
> setup phase, or if it does how we recover.  This patch series is very
> odd precisely because it's trying to do the unplug itself in the
> migration phase rather than let the management layer do it - so unless
> it's nailed down how to make sure that's really really bullet proof
> then we've got to go back and ask the question about whether we should
> really fix it so it can be done by the management layer.
> 

I have the impression we are running in circles here.
Eduardo Habkost May 30, 2019, 6:22 p.m. UTC | #7
On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
> On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > Hi David,
> > > > 
> > > > sorry for the  delayed reply.
> > > > 
> > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > > > > +
> > > > > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > > > > >  {
> > > > > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > > > > >      } else {
> > > > > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > > > > >      }
> > > > > > > +
> > > > > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > > > > +        if (n->primary_device_timer)
> > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > +                4000);
> > > > > > > +    }
> > > > > > 
> > > > > > What's this magic timer constant and why?
> > > > 
> > > > To be honest it's a leftover from previous versions (before I took
> > > > over) of the patches and I'm not sure why the timer is there.
> > > > I removed it and so far see no reason to keep it.
> > > > 
> > > > > > 
> > > > > > >  }
> > > > > > >
> > > > > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > > > > >      n->netclient_type = g_strdup(type);
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > > > > +{
> > > > > > > +    VirtIONet *n = opaque;
> > > > > > > +    Error *err = NULL;
> > > > > > > +
> > > > > > > +    if (n->primary_device_dict)
> > > > > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > > > > +            n->primary_device_dict, &err);
> > > > > > > +    if (n->primary_device_opts) {
> > > > > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > > > > +        return;
> > > > > > > +    }
> > > > > > > +    if (!n->primary_device_dict && err) {
> > > > > > > +        if (n->primary_device_timer) {
> > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > +                100);
> > > > > > 
> > > > > > same here.
> > > > 
> > > > see above
> > > > 
> > > > > > 
> > > > > > 
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > > > > +                                                MigrationState *s)
> > > > > > > +{
> > > > > > > +    Error *err = NULL;
> > > > > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > > > > +
> > > > > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > > > > +            n->primary_device_id);
> > > > > > > +    if (!n->primary_dev) {
> > > > > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > > > > 
> > > > > > There's something broken with the error handling in this function - the
> > > > > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > > > > reported or stops the migration.
> > > > 
> > > > yes, I'll fix it.
> > > > 
> > > > > > > +    }
> > > > > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > > > > +        qdev_unplug(n->primary_dev, &err);
> > > > > > 
> > > > > > Not knowing unplug well; can you just explain - is that device hard
> > > > > > unplugged and it's gone by the time this function returns or is it still
> > > > > > hanging around for some indeterminate time?
> > > > 
> > > > Qemu will trigger an unplug request via pcie attention button in which case
> > > > there could be a delay by the guest operating system. We could give it some
> > > > amount of time and if nothing happens try surprise removal or handle the
> > > > error otherwise.
> > > > 
> > > > 
> > > > regards,
> > > > Jens
> > > 
> > > That's a subject for another day. Let's get the basic thing
> > > working.
> > 
> > Well no, we need to know this thing isn't going to hang in the migration
> > setup phase, or if it does how we recover.
> 
> 
> This thing is *supposed* to be stuck in migration startup phase
> if guest is malicious.
> 
> If migration does not progress management needs
> a way to detect this and cancel.
> 
> Some more documentation about how this is supposed to happen
> would be helpful.

Do we have confirmation from libvirt developers that this would
be a reasonable API for them?


> >  This patch series is very
> > odd precisely because it's trying to do the unplug itself in the
> > migration phase rather than let the management layer do it - so unless
> > it's nailed down how to make sure that's really really bullet proof
> > then we've got to go back and ask the question about whether we should
> > really fix it so it can be done by the management layer.
> > 
> > Dave
> 
> management already said they can't because files get closed and
> resources freed on unplug and so they might not be able to re-add device
> on migration failure. We do it in migration because that is
> where failures can happen and we can recover.

We are capable of providing an API to libvirt where files won't
get closed when a device is unplugged, if necessary.

This might become necessary if libvirt or management software
developers tell us the interface we are providing is not going to
work for them.
Dr. David Alan Gilbert May 30, 2019, 7:08 p.m. UTC | #8
* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > Hi David,
> > > > 
> > > > sorry for the  delayed reply.
> > > > 
> > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > > > > +
> > > > > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > > > > >  {
> > > > > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > > > > >      } else {
> > > > > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > > > > >      }
> > > > > > > +
> > > > > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > > > > +        if (n->primary_device_timer)
> > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > +                4000);
> > > > > > > +    }
> > > > > > 
> > > > > > What's this magic timer constant and why?
> > > > 
> > > > To be honest it's a leftover from previous versions (before I took
> > > > over) of the patches and I'm not sure why the timer is there.
> > > > I removed it and so far see no reason to keep it.
> > > > 
> > > > > > 
> > > > > > >  }
> > > > > > >
> > > > > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > > > > >      n->netclient_type = g_strdup(type);
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > > > > +{
> > > > > > > +    VirtIONet *n = opaque;
> > > > > > > +    Error *err = NULL;
> > > > > > > +
> > > > > > > +    if (n->primary_device_dict)
> > > > > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > > > > +            n->primary_device_dict, &err);
> > > > > > > +    if (n->primary_device_opts) {
> > > > > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > > > > +        return;
> > > > > > > +    }
> > > > > > > +    if (!n->primary_device_dict && err) {
> > > > > > > +        if (n->primary_device_timer) {
> > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > +                100);
> > > > > > 
> > > > > > same here.
> > > > 
> > > > see above
> > > > 
> > > > > > 
> > > > > > 
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > > > > +                                                MigrationState *s)
> > > > > > > +{
> > > > > > > +    Error *err = NULL;
> > > > > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > > > > +
> > > > > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > > > > +            n->primary_device_id);
> > > > > > > +    if (!n->primary_dev) {
> > > > > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > > > > 
> > > > > > There's something broken with the error handling in this function - the
> > > > > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > > > > reported or stops the migration.
> > > > 
> > > > yes, I'll fix it.
> > > > 
> > > > > > > +    }
> > > > > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > > > > +        qdev_unplug(n->primary_dev, &err);
> > > > > > 
> > > > > > Not knowing unplug well; can you just explain - is that device hard
> > > > > > unplugged and it's gone by the time this function returns or is it still
> > > > > > hanging around for some indeterminate time?
> > > > 
> > > > Qemu will trigger an unplug request via pcie attention button in which case
> > > > there could be a delay by the guest operating system. We could give it some
> > > > > amount of time and if nothing happens try surprise removal or handle the
> > > > error otherwise.
> > > > 
> > > > 
> > > > regards,
> > > > Jens
> > > 
> > > That's a subject for another day. Let's get the basic thing
> > > working.
> > 
> > Well no, we need to know this thing isn't going to hang in the migration
> > setup phase, or if it does how we recover.
> 
> 
> This thing is *supposed* to be stuck in migration startup phase
> if guest is malicious.
> 
> If migration does not progress management needs
> a way to detect this and cancel.
> 
> Some more documentation about how this is supposed to happen
> would be helpful.

I want to see that first; because I want to convinced it's just a
documentation problem and that we actually really have a method of
recovering.

> >  This patch series is very
> > odd precisely because it's trying to do the unplug itself in the
> > migration phase rather than let the management layer do it - so unless
> > it's nailed down how to make sure that's really really bullet proof
> > then we've got to go back and ask the question about whether we should
> > really fix it so it can be done by the management layer.
> > 
> > Dave
> 
> management already said they can't because files get closed and
> resources freed on unplug and so they might not be able to re-add device
> on migration failure. We do it in migration because that is
> where failures can happen and we can recover.

I find this explanation confusing - I can kind of see where it's coming
from, but we've got a pretty clear separation between a NIC and the
netdev that backs it; those files and resources should be associated
with the netdev and not the NIC.  So does hot-removing the NIC really
clean up the netdev?  (I guess maybe this is a difference in vfio
which is the problem)

Dave

> > > -- 
> > > MST
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert May 30, 2019, 7:09 p.m. UTC | #9
* Jens Freimann (jfreimann@redhat.com) wrote:
> Hi David,
> 
> sorry for the  delayed reply.
> 
> On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > +
> > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > >  {
> > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > >      } else {
> > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > >      }
> > > > +
> > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > +        if (n->primary_device_timer)
> > > > +            timer_mod(n->primary_device_timer,
> > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > +                4000);
> > > > +    }
> > > 
> > > What's this magic timer constant and why?
> 
> To be honest it's a leftover from previous versions (before I took
> over) of the patches and I'm not sure why the timer is there.
> I removed it and so far see no reason to keep it.
> 
> > > 
> > > >  }
> > > >
> > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > >      n->netclient_type = g_strdup(type);
> > > >  }
> > > >
> > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > +{
> > > > +    VirtIONet *n = opaque;
> > > > +    Error *err = NULL;
> > > > +
> > > > +    if (n->primary_device_dict)
> > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > +            n->primary_device_dict, &err);
> > > > +    if (n->primary_device_opts) {
> > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > +        return;
> > > > +    }
> > > > +    if (!n->primary_device_dict && err) {
> > > > +        if (n->primary_device_timer) {
> > > > +            timer_mod(n->primary_device_timer,
> > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > +                100);
> > > 
> > > same here.
> 
> see above
> 
> > > 
> > > 
> > > > +        }
> > > > +    }
> > > > +}
> > > > +
> > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > +                                                MigrationState *s)
> > > > +{
> > > > +    Error *err = NULL;
> > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > +
> > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > +            n->primary_device_id);
> > > > +    if (!n->primary_dev) {
> > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > 
> > > There's something broken with the error handling in this function - the
> > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > reported or stops the migration.
> 
> yes, I'll fix it.
> 
> > > > +    }
> > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > +        qdev_unplug(n->primary_dev, &err);
> > > 
> > > Not knowing unplug well; can you just explain - is that device hard
> > > unplugged and it's gone by the time this function returns or is it still
> > > hanging around for some indeterminate time?
> 
> Qemu will trigger an unplug request via pcie attention button in which case
> there could be a delay by the guest operating system. We could give it some
> amount of time and if nothing happens try surprise removal or handle the
> error otherwise.

OK, can you show how one of those is going to work and try it out.
This setup is weird enough I just want to make sure it works.

Dave

> 
> regards,
> Jens
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Michael S. Tsirkin May 30, 2019, 7:21 p.m. UTC | #10
On Thu, May 30, 2019 at 08:08:23PM +0100, Dr. David Alan Gilbert wrote:
> * Michael S. Tsirkin (mst@redhat.com) wrote:
> > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > Hi David,
> > > > > 
> > > > > sorry for the  delayed reply.
> > > > > 
> > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > > > > > +
> > > > > > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > > > > > >  {
> > > > > > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > > > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > > > > > >      } else {
> > > > > > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > > > > > >      }
> > > > > > > > +
> > > > > > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > > > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > > > > > +        if (n->primary_device_timer)
> > > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > > +                4000);
> > > > > > > > +    }
> > > > > > > 
> > > > > > > What's this magic timer constant and why?
> > > > > 
> > > > > To be honest it's a leftover from previous versions (before I took
> > > > > over) of the patches and I'm not sure why the timer is there.
> > > > > I removed it and so far see no reason to keep it.
> > > > > 
> > > > > > > 
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > > > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > > > > > >      n->netclient_type = g_strdup(type);
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > > > > > +{
> > > > > > > > +    VirtIONet *n = opaque;
> > > > > > > > +    Error *err = NULL;
> > > > > > > > +
> > > > > > > > +    if (n->primary_device_dict)
> > > > > > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > > > > > +            n->primary_device_dict, &err);
> > > > > > > > +    if (n->primary_device_opts) {
> > > > > > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > > > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > > > > > +        return;
> > > > > > > > +    }
> > > > > > > > +    if (!n->primary_device_dict && err) {
> > > > > > > > +        if (n->primary_device_timer) {
> > > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > > +                100);
> > > > > > > 
> > > > > > > same here.
> > > > > 
> > > > > see above
> > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > > +        }
> > > > > > > > +    }
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > > > > > +                                                MigrationState *s)
> > > > > > > > +{
> > > > > > > > +    Error *err = NULL;
> > > > > > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > > > > > +
> > > > > > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > > > > > +            n->primary_device_id);
> > > > > > > > +    if (!n->primary_dev) {
> > > > > > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > > > > > 
> > > > > > > There's something broken with the error handling in this function - the
> > > > > > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > > > > > reported or stops the migration.
> > > > > 
> > > > > yes, I'll fix it.
> > > > > 
> > > > > > > > +    }
> > > > > > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > > > > > +        qdev_unplug(n->primary_dev, &err);
> > > > > > > 
> > > > > > > Not knowing unplug well; can you just explain - is that device hard
> > > > > > > unplugged and it's gone by the time this function returns or is it still
> > > > > > > hanging around for some indeterminate time?
> > > > > 
> > > > > Qemu will trigger an unplug request via pcie attention button in which case
> > > > > there could be a delay by the guest operating system. We could give it some
> > > > > amount of time and if nothing happens try surprise removal or handle the
> > > > > error otherwise.
> > > > > 
> > > > > 
> > > > > regards,
> > > > > Jens
> > > > 
> > > > That's a subject for another day. Let's get the basic thing
> > > > working.
> > > 
> > > Well no, we need to know this thing isn't going to hang in the migration
> > > setup phase, or if it does how we recover.
> > 
> > 
> > This thing is *supposed* to be stuck in migration startup phase
> > if guest is malicious.
> > 
> > If migration does not progress management needs
> > a way to detect this and cancel.
> > 
> > Some more documentation about how this is supposed to happen
> > would be helpful.
> 
> I want to see that first; because I want to convinced it's just a
> documentation problem and that we actually really have a method of
> recovering.
> 
> > >  This patch series is very
> > > odd precisely because it's trying to do the unplug itself in the
> > > migration phase rather than let the management layer do it - so unless
> > > it's nailed down how to make sure that's really really bullet proof
> > > then we've got to go back and ask the question about whether we should
> > > really fix it so it can be done by the management layer.
> > > 
> > > Dave
> > 
> > management already said they can't because files get closed and
> > resources freed on unplug and so they might not be able to re-add device
> > on migration failure. We do it in migration because that is
> > where failures can happen and we can recover.
> 
> I find this explanation confusing - I can kind of see where it's coming
> from, but we've got a pretty clear separation between a NIC and the
> netdev that backs it; those files and resources should be associated
> with the netdev and not the NIC.  So does hot-removing the NIC really
> clean up the netdev?  (I guess maybe this is a different in vfio
> which is the problem)
> 
> Dave

what we are removing is the VFIO device.
Nothing to do with nic or netdev.

> > > > -- 
> > > > MST
> > > --
> > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Michael S. Tsirkin May 30, 2019, 11:06 p.m. UTC | #11
On Thu, May 30, 2019 at 03:22:10PM -0300, Eduardo Habkost wrote:
> On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
> > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > Hi David,
> > > > > 
> > > > > sorry for the  delayed reply.
> > > > > 
> > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > > > > > +
> > > > > > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > > > > > >  {
> > > > > > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > > > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > > > > > >      } else {
> > > > > > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > > > > > >      }
> > > > > > > > +
> > > > > > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > > > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > > > > > +        if (n->primary_device_timer)
> > > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > > +                4000);
> > > > > > > > +    }
> > > > > > > 
> > > > > > > What's this magic timer constant and why?
> > > > > 
> > > > > To be honest it's a leftover from previous versions (before I took
> > > > > over) of the patches and I'm not sure why the timer is there.
> > > > > I removed it and so far see no reason to keep it.
> > > > > 
> > > > > > > 
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > > > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > > > > > >      n->netclient_type = g_strdup(type);
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > > > > > +{
> > > > > > > > +    VirtIONet *n = opaque;
> > > > > > > > +    Error *err = NULL;
> > > > > > > > +
> > > > > > > > +    if (n->primary_device_dict)
> > > > > > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > > > > > +            n->primary_device_dict, &err);
> > > > > > > > +    if (n->primary_device_opts) {
> > > > > > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > > > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > > > > > +        return;
> > > > > > > > +    }
> > > > > > > > +    if (!n->primary_device_dict && err) {
> > > > > > > > +        if (n->primary_device_timer) {
> > > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > > +                100);
> > > > > > > 
> > > > > > > same here.
> > > > > 
> > > > > see above
> > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > > +        }
> > > > > > > > +    }
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > > > > > +                                                MigrationState *s)
> > > > > > > > +{
> > > > > > > > +    Error *err = NULL;
> > > > > > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > > > > > +
> > > > > > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > > > > > +            n->primary_device_id);
> > > > > > > > +    if (!n->primary_dev) {
> > > > > > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > > > > > 
> > > > > > > There's something broken with the error handling in this function - the
> > > > > > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > > > > > reported or stops the migration.
> > > > > 
> > > > > yes, I'll fix it.
> > > > > 
> > > > > > > > +    }
> > > > > > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > > > > > +        qdev_unplug(n->primary_dev, &err);
> > > > > > > 
> > > > > > > Not knowing unplug well; can you just explain - is that device hard
> > > > > > > unplugged and it's gone by the time this function returns or is it still
> > > > > > > hanging around for some indeterminate time?
> > > > > 
> > > > > Qemu will trigger an unplug request via pcie attention button in which case
> > > > > there could be a delay by the guest operating system. We could give it some
> > > > > amount of time and if nothing happens try surprise removal or handle the
> > > > > error otherwise.
> > > > > 
> > > > > 
> > > > > regards,
> > > > > Jens
> > > > 
> > > > That's a subject for another day. Let's get the basic thing
> > > > working.
> > > 
> > > Well no, we need to know this thing isn't going to hang in the migration
> > > setup phase, or if it does how we recover.
> > 
> > 
> > This thing is *supposed* to be stuck in migration startup phase
> > if guest is malicious.
> > 
> > If migration does not progress management needs
> > a way to detect this and cancel.
> > 
> > Some more documentation about how this is supposed to happen
> > would be helpful.
> 
> Do we have confirmation from libvirt developers that this would
> be a reasonable API for them?
> 
> 
> > >  This patch series is very
> > > odd precisely because it's trying to do the unplug itself in the
> > > migration phase rather than let the management layer do it - so unless
> > > it's nailed down how to make sure that's really really bullet proof
> > > then we've got to go back and ask the question about whether we should
> > > really fix it so it can be done by the management layer.
> > > 
> > > Dave
> > 
> > management already said they can't because files get closed and
> > resources freed on unplug and so they might not be able to re-add device
> > on migration failure. We do it in migration because that is
> > where failures can happen and we can recover.
> 
> We are capable of providing an API to libvirt where files won't
> get closed when a device is unplugged, if necessary.
> 
> This might become necessary if libvirt or management software
> developers tell us the interface we are providing is not going to
> work for them.
> 
> -- 
> Eduardo

Yes. It's just lots of extremely low level interfaces
and all rather pointless.

And down the road extensions like surprise removal support will make it
all cleaner and more transparent. Floating things up to libvirt means
all these low level details will require more and more hacks.
Dr. David Alan Gilbert May 31, 2019, 8:23 a.m. UTC | #12
* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Thu, May 30, 2019 at 08:08:23PM +0100, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > > > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > > Hi David,
> > > > > > 
> > > > > > sorry for the  delayed reply.
> > > > > > 
> > > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > > > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > > > > > > +
> > > > > > > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > > > > > > >  {
> > > > > > > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > > > > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > > > > > > >      } else {
> > > > > > > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > > > > > > >      }
> > > > > > > > > +
> > > > > > > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > > > > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > > > > > > +        if (n->primary_device_timer)
> > > > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > > > +                4000);
> > > > > > > > > +    }
> > > > > > > > 
> > > > > > > > What's this magic timer constant and why?
> > > > > > 
> > > > > > To be honest it's a leftover from previous versions (before I took
> > > > > > over) of the patches and I'm not sure why the timer is there.
> > > > > > I removed it and so far see no reason to keep it.
> > > > > > 
> > > > > > > > 
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > > > > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > > > > > > >      n->netclient_type = g_strdup(type);
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > > > > > > +{
> > > > > > > > > +    VirtIONet *n = opaque;
> > > > > > > > > +    Error *err = NULL;
> > > > > > > > > +
> > > > > > > > > +    if (n->primary_device_dict)
> > > > > > > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > > > > > > +            n->primary_device_dict, &err);
> > > > > > > > > +    if (n->primary_device_opts) {
> > > > > > > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > > > > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > > > > > > +        return;
> > > > > > > > > +    }
> > > > > > > > > +    if (!n->primary_device_dict && err) {
> > > > > > > > > +        if (n->primary_device_timer) {
> > > > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > > > +                100);
> > > > > > > > 
> > > > > > > > same here.
> > > > > > 
> > > > > > see above
> > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > +        }
> > > > > > > > > +    }
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > > > > > > +                                                MigrationState *s)
> > > > > > > > > +{
> > > > > > > > > +    Error *err = NULL;
> > > > > > > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > > > > > > +
> > > > > > > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > > > > > > +            n->primary_device_id);
> > > > > > > > > +    if (!n->primary_dev) {
> > > > > > > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > > > > > > 
> > > > > > > > There's something broken with the error handling in this function - the
> > > > > > > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > > > > > > reported or stops the migration.
> > > > > > 
> > > > > > yes, I'll fix it.
> > > > > > 
> > > > > > > > > +    }
> > > > > > > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > > > > > > +        qdev_unplug(n->primary_dev, &err);
> > > > > > > > 
> > > > > > > > Not knowing unplug well; can you just explain - is that device hard
> > > > > > > > unplugged and it's gone by the time this function returns or is it still
> > > > > > > > hanging around for some indeterminate time?
> > > > > > 
> > > > > > Qemu will trigger an unplug request via pcie attention button in which case
> > > > > > there could be a delay by the guest operating system. We could give it some
> > > > > > amount of time and if nothing happens try surprise removal or handle the
> > > > > > error otherwise.
> > > > > > 
> > > > > > 
> > > > > > regards,
> > > > > > Jens
> > > > > 
> > > > > That's a subject for another day. Let's get the basic thing
> > > > > working.
> > > > 
> > > > Well no, we need to know this thing isn't going to hang in the migration
> > > > setup phase, or if it does how we recover.
> > > 
> > > 
> > > This thing is *supposed* to be stuck in migration startup phase
> > > if guest is malicious.
> > > 
> > > If migration does not progress management needs
> > > a way to detect this and cancel.
> > > 
> > > Some more documentation about how this is supposed to happen
> > > would be helpful.
> > 
> > I want to see that first; because I want to convinced it's just a
> > documentation problem and that we actually really have a method of
> > recovering.
> > 
> > > >  This patch series is very
> > > > odd precisely because it's trying to do the unplug itself in the
> > > > migration phase rather than let the management layer do it - so unless
> > > > it's nailed down how to make sure that's really really bullet proof
> > > > then we've got to go back and ask the question about whether we should
> > > > really fix it so it can be done by the management layer.
> > > > 
> > > > Dave
> > > 
> > > management already said they can't because files get closed and
> > > resources freed on unplug and so they might not be able to re-add device
> > > on migration failure. We do it in migration because that is
> > > where failures can happen and we can recover.
> > 
> > I find this explanation confusing - I can kind of see where it's coming
> > from, but we've got a pretty clear separation between a NIC and the
> > netdev that backs it; those files and resources should be associated
> > with the netdev and not the NIC.  So does hot-removing the NIC really
> > clean up the netdev?  (I guess maybe this is a different in vfio
> > which is the problem)
> > 
> > Dave
> 
> what we are removing is the VFIO device.
> Nothing to do with nic or netdev.

OK, but at the same time why can't we hold open the VFIOs devices
resources in a comparable way - i.e. don't really let qemu let go of
it even when the guest has unplugged it?

Dave

> > > > > -- 
> > > > > MST
> > > > --
> > > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Eduardo Habkost May 31, 2019, 5:01 p.m. UTC | #13
On Thu, May 30, 2019 at 07:06:29PM -0400, Michael S. Tsirkin wrote:
> On Thu, May 30, 2019 at 03:22:10PM -0300, Eduardo Habkost wrote:
> > On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
> > > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > > > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > > Hi David,
> > > > > > 
> > > > > > sorry for the  delayed reply.
> > > > > > 
> > > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > > > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > > > > > > +
> > > > > > > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > > > > > > >  {
> > > > > > > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > > > > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > > > > > > >      } else {
> > > > > > > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > > > > > > >      }
> > > > > > > > > +
> > > > > > > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > > > > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > > > > > > +        if (n->primary_device_timer)
> > > > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > > > +                4000);
> > > > > > > > > +    }
> > > > > > > > 
> > > > > > > > What's this magic timer constant and why?
> > > > > > 
> > > > > > To be honest it's a leftover from previous versions (before I took
> > > > > > over) of the patches and I'm not sure why the timer is there.
> > > > > > I removed it and so far see no reason to keep it.
> > > > > > 
> > > > > > > > 
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > > > > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > > > > > > >      n->netclient_type = g_strdup(type);
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > > > > > > +{
> > > > > > > > > +    VirtIONet *n = opaque;
> > > > > > > > > +    Error *err = NULL;
> > > > > > > > > +
> > > > > > > > > +    if (n->primary_device_dict)
> > > > > > > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > > > > > > +            n->primary_device_dict, &err);
> > > > > > > > > +    if (n->primary_device_opts) {
> > > > > > > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > > > > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > > > > > > +        return;
> > > > > > > > > +    }
> > > > > > > > > +    if (!n->primary_device_dict && err) {
> > > > > > > > > +        if (n->primary_device_timer) {
> > > > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > > > +                100);
> > > > > > > > 
> > > > > > > > same here.
> > > > > > 
> > > > > > see above
> > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > +        }
> > > > > > > > > +    }
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > > > > > > +                                                MigrationState *s)
> > > > > > > > > +{
> > > > > > > > > +    Error *err = NULL;
> > > > > > > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > > > > > > +
> > > > > > > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > > > > > > +            n->primary_device_id);
> > > > > > > > > +    if (!n->primary_dev) {
> > > > > > > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > > > > > > 
> > > > > > > > There's something broken with the error handling in this function - the
> > > > > > > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > > > > > > reported or stops the migration.
> > > > > > 
> > > > > > yes, I'll fix it.
> > > > > > 
> > > > > > > > > +    }
> > > > > > > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > > > > > > +        qdev_unplug(n->primary_dev, &err);
> > > > > > > > 
> > > > > > > > Not knowing unplug well; can you just explain - is that device hard
> > > > > > > > unplugged and it's gone by the time this function returns or is it still
> > > > > > > > hanging around for some indeterminate time?
> > > > > > 
> > > > > > Qemu will trigger an unplug request via pcie attention button in which case
> > > > > > there could be a delay by the guest operating system. We could give it some
> > > > > > amount of time and if nothing happens try surprise removal or handle the
> > > > > > error otherwise.
> > > > > > 
> > > > > > 
> > > > > > regards,
> > > > > > Jens
> > > > > 
> > > > > That's a subject for another day. Let's get the basic thing
> > > > > working.
> > > > 
> > > > Well no, we need to know this thing isn't going to hang in the migration
> > > > setup phase, or if it does how we recover.
> > > 
> > > 
> > > This thing is *supposed* to be stuck in migration startup phase
> > > if guest is malicious.
> > > 
> > > If migration does not progress management needs
> > > a way to detect this and cancel.
> > > 
> > > Some more documentation about how this is supposed to happen
> > > would be helpful.
> > 
> > Do we have confirmation from libvirt developers that this would
> > be a reasonable API for them?
> > 
> > 
> > > >  This patch series is very
> > > > odd precisely because it's trying to do the unplug itself in the
> > > > migration phase rather than let the management layer do it - so unless
> > > > it's nailed down how to make sure that's really really bullet proof
> > > > then we've got to go back and ask the question about whether we should
> > > > really fix it so it can be done by the management layer.
> > > > 
> > > > Dave
> > > 
> > > management already said they can't because files get closed and
> > > resources freed on unplug and so they might not be able to re-add device
> > > on migration failure. We do it in migration because that is
> > > where failures can happen and we can recover.
> > 
> > We are capable of providing an API to libvirt where files won't
> > get closed when a device is unplugged, if necessary.
> > 
> > This might become necessary if libvirt or management software
> > developers tell us the interface we are providing is not going to
> > work for them.
> > 
> > -- 
> > Eduardo
> 
> Yes. It's just lots of extremely low level interfaces
> and all rather pointless.
> 
> And down the road extensions like surprise removal support will make it
> all cleaner and more transparent. Floating things up to libvirt means
> all these low level details will require more and more hacks.

Why do you call it pointless?  If we want this to work before
surprise removal is implemented, we need to provide an API that
works for management software.  Don't we want to make this work
without surprise removal too?
Michael S. Tsirkin May 31, 2019, 6:04 p.m. UTC | #14
On Fri, May 31, 2019 at 02:01:54PM -0300, Eduardo Habkost wrote:
> > Yes. It's just lots of extremely low level interfaces
> > and all rather pointless.
> > 
> > And down the road extensions like surprise removal support will make it
> > all cleaner and more transparent. Floating things up to libvirt means
> > all these low level details will require more and more hacks.
> 
> Why do you call it pointless?

We'd need APIs to manipulate device visibility to guest, hotplug
controller state and separately manipulate the resources allocated. This
is low level stuff that users really have no idea what to do about.
Exposing such a level of detail to management is imho pointless.
We are better off with a high level API, see below.

> If we want this to work before
> surprise removal is implemented, we need to provide an API that
> works for management software.
>  Don't we want to make this work
> without surprise removal too?

This patchset adds an optional, off by default support for
migrating guests with an assigned network device.
If enabled this requires guest to allow migration.

Of course this can be viewed as a security problem since it allows guest
to block migration. We can't detect a malicious guest reliably imho.
What we can do is report to management when guest allows migration.
Policy such as what to do when this does not happen for a while and
what timeout to set would be up to management.

The API in question would be a high level one, something
along the lines of a single "guest allowed migration" event.
Eduardo Habkost May 31, 2019, 6:42 p.m. UTC | #15
On Fri, May 31, 2019 at 02:04:49PM -0400, Michael S. Tsirkin wrote:
> On Fri, May 31, 2019 at 02:01:54PM -0300, Eduardo Habkost wrote:
> > > Yes. It's just lots of extremely low level interfaces
> > > and all rather pointless.
> > > 
> > > And down the road extensions like surprise removal support will make it
> > > all cleaner and more transparent. Floating things up to libvirt means
> > > all these low level details will require more and more hacks.
> > 
> > Why do you call it pointless?
> 
> We'd need APIs to manipulate device visibility to guest, hotplug
> controller state and separately manipulate the resources allocated. This
> is low level stuff that users really have no idea what to do about.
> Exposing such a level of detail to management is imho pointless.
> We are better off with a high level API, see below.

I don't disagree it's low level.  I just disagree it's pointless.
The goal here is to provide an API that management software can
use.

> 
> > If we want this to work before
> > surprise removal is implemented, we need to provide an API that
> > works for management software.
> >  Don't we want to make this work
> > without surprise removal too?
> 
> This patchset adds an optional, off by default support for
> migrating guests with an assigned network device.
> If enabled this requires guest to allow migration.
> 
> Of course this can be viewed as a security problem since it allows guest
> to block migration. We can't detect a malicious guest reliably imho.
> What we can do is report to management when guest allows migration.
> Policy such what to do when this does not happen for a while and
> what timeout to set would be up to management.
> 
> The API in question would be a high level one, something
> along the lines of a single "guest allowed migration" event.

If you want to hide the low level details behind a higher level
API, that's OK.  I just want to be sure we have really listened
to management software developers to confirm the API we're
providing will work for them.

This will probably require documenting the new interface in more
detail (as you have already mentioned in this thread).
Dr. David Alan Gilbert May 31, 2019, 6:45 p.m. UTC | #16
* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Fri, May 31, 2019 at 02:01:54PM -0300, Eduardo Habkost wrote:
> > > Yes. It's just lots of extremely low level interfaces
> > > and all rather pointless.
> > > 
> > > And down the road extensions like surprise removal support will make it
> > > all cleaner and more transparent. Floating things up to libvirt means
> > > all these low level details will require more and more hacks.
> > 
> > Why do you call it pointless?
> 
> We'd need APIs to manipulate device visibility to guest, hotplug
> controller state and separately manipulate the resources allocated. This
> is low level stuff that users really have no idea what to do about.
> Exposing such a level of detail to management is imho pointless.
> We are better off with a high level API, see below.

so I don't know much about vfio; but it strikes me that
you wouldn't need that low level detail if we just reworked vfio
to look more like all our other devices; something like:

  -vfiodev  host=02:00.0,id=gpu
  -device vfio-pci,dev=gpu

The 'vfiodev' would own the resources; so to do this trick, the
management layer would:
   hotunplug the vfio-pci
   migrate

if anything went wrong it would
   hotplug the vfio-pci back in

you wouldn't have free'd up any resources because they belonged
to the vfiodev.


> > If we want this to work before
> > surprise removal is implemented, we need to provide an API that
> > works for management software.
> >  Don't we want to make this work
> > without surprise removal too?
> 
> This patchset adds an optional, off by default support for
> migrating guests with an assigned network device.
> If enabled this requires guest to allow migration.
> 
> Of course this can be viewed as a security problem since it allows guest
> to block migration. We can't detect a malicious guest reliably imho.
> What we can do is report to management when guest allows migration.
> > Policy such as what to do when this does not happen for a while and
> what timeout to set would be up to management.
> 
> The API in question would be a high level one, something
> along the lines of a single "guest allowed migration" event.

This is all fairly normal problems with hot unplugging - that's
already dealt with at higher levels for normal hot unplugging.

The question here is to try to avoid duplicating that fairly
painful process in qemu.

Dave

> 
> -- 
> MST
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Alex Williamson May 31, 2019, 8:29 p.m. UTC | #17
On Fri, 31 May 2019 19:45:13 +0100
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:

> * Michael S. Tsirkin (mst@redhat.com) wrote:
> > On Fri, May 31, 2019 at 02:01:54PM -0300, Eduardo Habkost wrote:  
> > > > Yes. It's just lots of extremely low level interfaces
> > > > and all rather pointless.
> > > > 
> > > > And down the road extensions like surprise removal support will make it
> > > > all cleaner and more transparent. Floating things up to libvirt means
> > > > all these low level details will require more and more hacks.  
> > > 
> > > Why do you call it pointless?  
> > 
> > We'd need APIs to manipulate device visibility to guest, hotplug
> > controller state and separately manipulate the resources allocated. This
> > is low level stuff that users really have no idea what to do about.
> > Exposing such a level of detail to management is imho pointless.
> > We are better off with a high level API, see below.  
> 
> so I don't know much about vfio; but it strikes me that
> you wouldn't need that low level detail if we just reworked vfio
> to look more like all our other devices;

I don't understand what this means, I thought vfio-pci followed a very
standard device model.

> something like:
> 
>   -vfiodev  host=02:00.0,id=gpu
>   -device vfio-pci,dev=gpu
>
> The 'vfiodev' would own the resources; so to do this trick, the
> management layer would:
>    hotunplug the vfio-pci
>    migrate
> 
> if anything went wrong it would
>    hotplug the vfio-pci back in
> 
> you wouldn't have free'd up any resources because they belonged
> to the vfiodev.

So you're looking more for some sort of frontend-backend separation, we
hot-unplug the frontend device that's exposed to the guest while the
backend device that holds the host resources is still attached.  I
would have hardly guessed that's "like all our other devices".  I was
under the impression (from previous discussions mostly) that the device
removal would be caught before actually allowing the device to finalize
and exit, such that with a failed migration, re-adding the device would
be deterministic since the device is never released back to the host.
I expected that could be done within QEMU, but I guess that's what
we're getting into here is how management tools specify that eject w/o
release semantic.  I don't know what this frontend/backend rework would
look like for vfio-pci, but it seems non-trivial for this one use case
and I don't see that it adds any value outside of this use case,
perhaps quite the opposite, it's an overly complicated interface for
the majority of use cases so we either move to a more complicated
interface or maintain both.  Poor choices either way.  Thanks,

Alex
Michael S. Tsirkin May 31, 2019, 8:43 p.m. UTC | #18
On Fri, May 31, 2019 at 07:45:13PM +0100, Dr. David Alan Gilbert wrote:
> * Michael S. Tsirkin (mst@redhat.com) wrote:
> > On Fri, May 31, 2019 at 02:01:54PM -0300, Eduardo Habkost wrote:
> > > > Yes. It's just lots of extremely low level interfaces
> > > > and all rather pointless.
> > > > 
> > > > And down the road extensions like surprise removal support will make it
> > > > all cleaner and more transparent. Floating things up to libvirt means
> > > > all these low level details will require more and more hacks.
> > > 
> > > Why do you call it pointless?
> > 
> > We'd need APIs to manipulate device visibility to guest, hotplug
> > controller state and separately manipulate the resources allocated. This
> > is low level stuff that users really have no idea what to do about.
> > Exposing such a level of detail to management is imho pointless.
> > We are better off with a high level API, see below.
> 
> so I don't know much about vfio; but it strikes me that
> you wouldn't need that low level detail if we just reworked vfio
> to look more like all our other devices; something like:
> 
>   -vfiodev  host=02:00.0,id=gpu
>   -device vfio-pci,dev=gpu
> 
> The 'vfiodev' would own the resources; so to do this trick, the
> management layer would:
>    hotunplug the vfio-pci
>    migrate
> 
> if anything went wrong it would
> >    hotplug the vfio-pci back in
> 
> you wouldn't have free'd up any resources because they belonged
> to the vfiodev.


IIUC that doesn't really work with passthrough
unless guests support surprise removal.


> > > If we want this to work before
> > > surprise removal is implemented, we need to provide an API that
> > > works for management software.
> > >  Don't we want to make this work
> > > without surprise removal too?
> > 
> > This patchset adds an optional, off by default support for
> > migrating guests with an assigned network device.
> > If enabled this requires guest to allow migration.
> > 
> > Of course this can be viewed as a security problem since it allows guest
> > to block migration. We can't detect a malicious guest reliably imho.
> > What we can do is report to management when guest allows migration.
> > Policy such as what to do when this does not happen for a while and
> > what timeout to set would be up to management.
> > 
> > The API in question would be a high level one, something
> > along the lines of a single "guest allowed migration" event.
> 
> This is all fairly normal problems with hot unplugging - that's
> already dealt with at higher levels for normal hot unplugging.
> 
> The question here is to try to avoid duplicating that fairly
> painful process in qemu.
> 
> Dave
> > 
> > -- 
> > MST
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Eduardo Habkost May 31, 2019, 9:03 p.m. UTC | #19
On Fri, May 31, 2019 at 04:43:44PM -0400, Michael S. Tsirkin wrote:
> On Fri, May 31, 2019 at 07:45:13PM +0100, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > On Fri, May 31, 2019 at 02:01:54PM -0300, Eduardo Habkost wrote:
> > > > > Yes. It's just lots of extremely low level interfaces
> > > > > and all rather pointless.
> > > > > 
> > > > > And down the road extensions like surprise removal support will make it
> > > > > all cleaner and more transparent. Floating things up to libvirt means
> > > > > all these low level details will require more and more hacks.
> > > > 
> > > > Why do you call it pointless?
> > > 
> > > We'd need APIs to manipulate device visibility to guest, hotplug
> > > controller state and separately manipulate the resources allocated. This
> > > is low level stuff that users really have no idea what to do about.
> > > Exposing such a level of detail to management is imho pointless.
> > > We are better off with a high level API, see below.
> > 
> > so I don't know much about vfio; but it strikes me that
> > you wouldn't need that low level detail if we just reworked vfio
> > to look more like all our other devices; something like:
> > 
> >   -vfiodev  host=02:00.0,id=gpu
> >   -device vfio-pci,dev=gpu
> > 
> > The 'vfiodev' would own the resources; so to do this trick, the
> > management layer would:
> >    hotunplug the vfio-pci
> >    migrate
> > 
> > if anything went wrong it would
> >    hotplug the vfio-pci back in
> > 
> > you wouldn't have free'd up any resources because they belonged
> > to the vfiodev.
> 
> 
> IIUC that doesn't really work with passthrough
> unless guests support surprise removal.

Why?  For the guest, this is indistinguishable from the unplug
request implemented by this series.
Michael S. Tsirkin May 31, 2019, 9:05 p.m. UTC | #20
On Fri, May 31, 2019 at 02:29:33PM -0600, Alex Williamson wrote:
> I don't know what this frontend/backend rework would
> look like for vfio-pci, but it seems non-trivial for this one use case
> and I don't see that it adds any value outside of this use case,
> perhaps quite the opposite, it's an overly complicated interface for
> the majority of use cases so we either move to a more complicated
> interface or maintain both.  Poor choices either way.

Well put Alex this is what I meant when I said it's a useless
interface. I meant it only has a single use.
Eduardo Habkost May 31, 2019, 9:47 p.m. UTC | #21
On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > * Jens Freimann (jfreimann@redhat.com) wrote:
[...]
> > > > +    }
> > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > +        qdev_unplug(n->primary_dev, &err);
> > > 
> > > Not knowing unplug well; can you just explain - is that device hard
> > > unplugged and it's gone by the time this function returns or is it still
> > > hanging around for some indeterminate time?
> 
> Qemu will trigger an unplug request via pcie attention button in which case
> there could be a delay by the guest operating system. We could give it some
> amount of time and if nothing happens try surprise removal or handle the
> error otherwise.

I'm missing something here:

Isn't the whole point of the new device-hiding infrastructure to
prevent QEMU from closing the VFIO until migration ended
successfully?

What exactly is preventing QEMU from closing the host VFIO device
after the guest OS has handled the unplug request?
Eduardo Habkost May 31, 2019, 9:59 p.m. UTC | #22
On Fri, May 31, 2019 at 05:05:26PM -0400, Michael S. Tsirkin wrote:
> On Fri, May 31, 2019 at 02:29:33PM -0600, Alex Williamson wrote:
> > I don't know what this frontend/backend rework would
> > look like for vfio-pci, but it seems non-trivial for this one use case
> > and I don't see that it adds any value outside of this use case,
> > perhaps quite the opposite, it's an overly complicated interface for
> > the majority of use cases so we either move to a more complicated
> > interface or maintain both.  Poor choices either way.
> 
> Well put Alex this is what I meant when I said it's a useless
> interface. I meant it only has a single use.

I might agree if the code needed to hide the VFIO device from the
guest while keeping resources open (so it can be re-added if
migration fails) is demonstrably simpler than the code that would
be necessary to separate the device backend from the frontend.

But I couldn't find the code that does that in this series.  Is
this already implemented?

All I see is a qdev_unplug() call (which will close host
resources) and a qdev_device_add() call (which will reopen the
host device).
Dr. David Alan Gilbert June 3, 2019, 8:06 a.m. UTC | #23
* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Fri, May 31, 2019 at 07:45:13PM +0100, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > On Fri, May 31, 2019 at 02:01:54PM -0300, Eduardo Habkost wrote:
> > > > > Yes. It's just lots of extremely low level interfaces
> > > > > and all rather pointless.
> > > > > 
> > > > > And down the road extensions like surprise removal support will make it
> > > > > all cleaner and more transparent. Floating things up to libvirt means
> > > > > all these low level details will require more and more hacks.
> > > > 
> > > > Why do you call it pointless?
> > > 
> > > We'd need APIs to manipulate device visibility to guest, hotplug
> > > controller state and separately manipulate the resources allocated. This
> > > is low level stuff that users really have no idea what to do about.
> > > Exposing such a level of detail to management is imho pointless.
> > > We are better off with a high level API, see below.
> > 
> > so I don't know much about vfio; but it strikes me that
> > you wouldn't need that low level detail if we just reworked vfio
> > to look more like all our other devices; something like:
> > 
> >   -vfiodev  host=02:00.0,id=gpu
> >   -device vfio-pci,dev=gpu
> > 
> > The 'vfiodev' would own the resources; so to do this trick, the
> > management layer would:
> >    hotunplug the vfio-pci
> >    migrate
> > 
> > if anything went wrong it would
> >    hotplug the vfio-pci backin
> > 
> > you wouldn't have free'd up any resources because they belonged
> > to the vfiodev.
> 
> 
> IIUC that doesn't really work with passthrough
> unless guests support surprise removal.

Why? The view from the guest here is just like what this series
has added without the special hack.

Dave

> 
> > > > If we want this to work before
> > > > surprise removal is implemented, we need to provide an API that
> > > > works for management software.
> > > >  Don't we want to make this work
> > > > without surprise removal too?
> > > 
> > > This patchset adds an optional, off by default support for
> > > migrating guests with an assigned network device.
> > > If enabled this requires guest to allow migration.
> > > 
> > > Of course this can be viewed as a security problem since it allows guest
> > > to block migration. We can't detect a malicious guest reliably imho.
> > > What we can do is report to management when guest allows migration.
> > > Policy such what to do when this does not happen for a while and
> > > what timeout to set would be up to management.
> > > 
> > > The API in question would be a high level one, something
> > > along the lines of a single "guest allowed migration" event.
> > 
> > This is all fairly normal problems with hot unplugging - that's
> > already dealt with at higher levels for normal hot unplugging.
> > 
> > The question here is to try to avoid duplicating that fairly
> > painful process in qemu.
> > 
> > Dave
> > > 
> > > -- 
> > > MST
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Jens Freimann June 3, 2019, 8:24 a.m. UTC | #24
On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
>On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
>> On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
>> > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
>> > > * Jens Freimann (jfreimann@redhat.com) wrote:
>[...]
>> > > > +    }
>> > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
>> > > > +        qdev_unplug(n->primary_dev, &err);
>> > >
>> > > Not knowing unplug well; can you just explain - is that device hard
>> > > unplugged and it's gone by the time this function returns or is it still
>> > > hanging around for some indeterminate time?
>>
>> Qemu will trigger an unplug request via pcie attention button in which case
>> there could be a delay by the guest operating system. We could give it some
amount of time and if nothing happens try surprise removal or handle the
>> error otherwise.
>
>I'm missing something here:
>
>Isn't the whole point of the new device-hiding infrastructure to
>prevent QEMU from closing the VFIO until migration ended
>successfully?

No. The point of hiding it is to only add the VFIO (that is configured
with the same MAC as the virtio-net device) until the
VIRTIO_NET_F_STANDBY feature is negotiated. We don't want to expose two
devices with the same MAC to guests who can't handle it.

>What exactly is preventing QEMU from closing the host VFIO device
>after the guest OS has handled the unplug request?

We qdev_unplug() the VFIO device and want the virtio-net standby device to
take over. If something goes wrong with unplug or
migration in general we have to qdev_plug() the device back.

This series does not try to implement new functionality to close a
device without freeing the resources.

From the discussion in this thread I understand that is what libvirt
needs though. Something that will trigger the unplug from the
guest but not free the devices resources in the host system (which is
what qdev_unplug() does). Correct? 

Why is it bad to fully re-create the device in case of a failed migration?


regards,
Jens 
 


>-- 
>Eduardo
>
Dr. David Alan Gilbert June 3, 2019, 8:59 a.m. UTC | #25
* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Fri, 31 May 2019 19:45:13 +0100
> "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> 
> > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > On Fri, May 31, 2019 at 02:01:54PM -0300, Eduardo Habkost wrote:  
> > > > > Yes. It's just lots of extremely low level interfaces
> > > > > and all rather pointless.
> > > > > 
> > > > > And down the road extensions like surprise removal support will make it
> > > > > all cleaner and more transparent. Floating things up to libvirt means
> > > > > all these low level details will require more and more hacks.  
> > > > 
> > > > Why do you call it pointless?  
> > > 
> > > We'd need APIs to manipulate device visibility to guest, hotplug
> > > controller state and separately manipulate the resources allocated. This
> > > is low level stuff that users really have no idea what to do about.
> > > Exposing such a level of detail to management is imho pointless.
> > > We are better off with a high level API, see below.  
> > 
> > so I don't know much about vfio; but to me it strikes me that
> > you wouldn't need that low level detail if we just reworked vfio
> > to look more like all our other devices;
> 
> I don't understand what this means, I thought vfio-pci followed a very
> standard device model.
> 
> > something like:
> > 
> >   -vfiodev  host=02:00.0,id=gpu
> >   -device vfio-pci,dev=gpu
> >
> > The 'vfiodev' would own the resources; so to do this trick, the
> > management layer would:
> >    hotunplug the vfio-pci
> >    migrate
> > 
> > if anything went wrong it would
> >    hotplug the vfio-pci backin
> > 
> > you wouldn't have free'd up any resources because they belonged
> > to the vfiodev.
> 
> So you're looking more for some sort of frontend-backend separation, we
> hot-unplug the frontend device that's exposed to the guest while the
> backend device that holds the host resources is still attached.  I
> would have hardly guessed that's "like all our other devices".


Well, we have netdev's and NICs that connect them to the guest,
        and blockdev's and guest devices that expose them to the guest

> I was
> under the impression (from previous discussions mostly) that the device
> removal would be caught before actually allowing the device to finalize
> and exit, such that with a failed migration, re-adding the device would
> be deterministic since the device is never released back to the host.
> I expected that could be done within QEMU, but I guess that's what
> we're getting into here is how management tools specify that eject w/o
> release semantic.

My worry here is that this is all being done behind the back of the
management tools in this series.
The management tools already deal with hot-unplugging and problems with
it;  here we're duplicating that set of problems and trying to stuff
them into the start of migration.

> I don't know what this frontend/backend rework would
> look like for vfio-pci, but it seems non-trivial for this one use case
> and I don't see that it adds any value outside of this use case,
> perhaps quite the opposite, it's an overly complicated interface for
> the majority of use cases so we either move to a more complicated
> interface or maintain both.  Poor choices either way.  Thanks,

Yep, tricky.

Dave

> Alex
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Jens Freimann June 3, 2019, 9:26 a.m. UTC | #26
On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
>On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
>>On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
>>>On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
>>>> On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
>>>> > * Jens Freimann (jfreimann@redhat.com) wrote:
>>What exactly is preventing QEMU from closing the host VFIO device
>>after the guest OS has handled the unplug request?
>
>We qdev_unplug() the VFIO device and want the virtio-net standby device to
>take over. If something goes wrong with unplug or
>migration in general we have to qdev_plug() the device back.

I meant qdev_device_add, not qdev_plug.

regards,
Jens
Laine Stump June 3, 2019, 6:10 p.m. UTC | #27
On 6/3/19 4:24 AM, Jens Freimann wrote:
> On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
>> On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
>>> On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
>>> > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert 
>>> wrote:
>>> > > * Jens Freimann (jfreimann@redhat.com) wrote:
>> [...]
>>> > > > +    }
>>> > > > +    if (migration_in_setup(s) && !should_be_hidden && 
>>> n->primary_dev) {
>>> > > > +        qdev_unplug(n->primary_dev, &err);
>>> > >
>>> > > Not knowing unplug well; can you just explain - is that device hard
>>> > > unplugged and it's gone by the time this function returns or is 
>>> it still
>>> > > hanging around for some indeterminate time?
>>>
>>> Qemu will trigger an unplug request via pcie attention button in 
>>> which case
>>> there could be a delay by the guest operating system. We could give 
>>> it some
>>> amount of time and if nothing happens try surprise removal or handle the
>>> error otherwise.
>>
>> I'm missing something here:
>>
>> Isn't the whole point of the new device-hiding infrastructure to
>> prevent QEMU from closing the VFIO until migration ended
>> successfully?
> 
> No. The point of hiding it is to only add the VFIO (that is configured
> with the same MAC as the virtio-net device) until the
> VIRTIO_NET_F_STANDBY feature is negotiated. We don't want to expose two
> devices with the same MAC to guests who can't handle it.
> 
>> What exactly is preventing QEMU from closing the host VFIO device
>> after the guest OS has handled the unplug request?
> 
> We qdev_unplug() the VFIO device and want the virtio-net standby device to
> take over. If something goes wrong with unplug or
> migration in general we have to qdev_plug() the device back.
> 
> This series does not try to implement new functionality to close a
> device without freeing the resources.
> 
>  From the discussion in this thread I understand that is what libvirt
> needs though. Something that will trigger the unplug from the
> guest but not free the devices resources in the host system (which is
> what qdev_unplug() does). Correct?
> Why is it bad to fully re-create the device in case of a failed migration?

I think the concern is that if the device was fully released by qemu 
during migration, it might have already been given to some other/new 
guest during the time that migration is trying to complete. If migration 
then fails, you may be unable to restore the guest to the previous state.
Alex Williamson June 3, 2019, 6:46 p.m. UTC | #28
On Mon, 3 Jun 2019 14:10:52 -0400
Laine Stump <laine@redhat.com> wrote:

> On 6/3/19 4:24 AM, Jens Freimann wrote:
> > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:  
> >> On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:  
> >>> On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:  
> >>> > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert   
> >>> wrote:  
> >>> > > * Jens Freimann (jfreimann@redhat.com) wrote:  
> >> [...]  
> >>> > > > +    }
> >>> > > > +    if (migration_in_setup(s) && !should_be_hidden &&   
> >>> n->primary_dev) {  
> >>> > > > +        qdev_unplug(n->primary_dev, &err);  
> >>> > >
> >>> > > Not knowing unplug well; can you just explain - is that device hard
> >>> > > unplugged and it's gone by the time this function returns or is   
> >>> it still  
> >>> > > hanging around for some indeterminate time?  
> >>>
> >>> Qemu will trigger an unplug request via pcie attention button in 
> >>> which case
> >>> there could be a delay by the guest operating system. We could give 
> >>> it some
> >>> amount of time and if nothing happens try surpise removal or handle the
> >>> error otherwise.  
> >>
> >> I'm missing something here:
> >>
> >> Isn't the whole point of the new device-hiding infrastructure to
> >> prevent QEMU from closing the VFIO until migration ended
> >> successfully?  
> > 
> > No. The point of hiding it is to only add the VFIO (that is configured
> > with the same MAC as the virtio-net device) until the
> > > VIRTIO_NET_F_STANDBY feature is negotiated. We don't want to expose two
> > > devices with the same MAC to guests who can't handle it.
> >   
> >> What exactly is preventing QEMU from closing the host VFIO device
> >> after the guest OS has handled the unplug request?  
> > 
> > We qdev_unplug() the VFIO device and want the virtio-net standby device to
> > take over. If something goes wrong with unplug or
> > migration in general we have to qdev_plug() the device back.
> > 
> > This series does not try to implement new functionality to close a
> > device without freeing the resources.
> > 
> >  From the discussion in this thread I understand that is what libvirt
> > needs though. Something that will trigger the unplug from the
> > guest but not free the devices resources in the host system (which is
> > what qdev_unplug() does). Correct?
> > Why is it bad to fully re-create the device in case of a failed migration?  
> 
> I think the concern is that if the device was fully released by qemu 
> during migration, it might have already been given to some other/new 
> guest during the time that migration is trying to complete. If migration 
> then fails, you may be unable to restore the guest to the previous state.

Yep, plus I think the memory pinning and IOMMU resources could be a
variable as well.  Essentially, there's no guaranteed reservation to
the device or any of the additional resources that the device implies
once it's released, so we want to keep as much of that on hot-standby
as we can in case the migration fails.  Unfortunately even just
unmapping the BARs for a guest-only hot-unplug unmaps those regions
from the IOMMU, but aside from catastrophic resource issues on the
host, we can essentially guarantee being able to remap those.  Thanks,

Alex
Eduardo Habkost June 3, 2019, 7:36 p.m. UTC | #29
On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
> On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
> > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > [...]
> > > > > > +    }
> > > > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > > > +        qdev_unplug(n->primary_dev, &err);
> > > > >
> > > > > Not knowing unplug well; can you just explain - is that device hard
> > > > > unplugged and it's gone by the time this function returns or is it still
> > > > > hanging around for some indeterminate time?
> > > 
> > > Qemu will trigger an unplug request via pcie attention button in which case
> > > there could be a delay by the guest operating system. We could give it some
> > > amount of time and if nothing happens try surprise removal or handle the
> > > error otherwise.
> > 
> > I'm missing something here:
> > 
> > Isn't the whole point of the new device-hiding infrastructure to
> > prevent QEMU from closing the VFIO until migration ended
> > successfully?
> 
> No. The point of hiding it is to only add the VFIO (that is configured
> with the same MAC as the virtio-net device) until the
> VIRTIO_NET_F_STANDBY feature is negotiated. We don't want to expose two
> devices with the same MAC to guests who can't handle it.
> 
> > What exactly is preventing QEMU from closing the host VFIO device
> > after the guest OS has handled the unplug request?
> 
> We qdev_unplug() the VFIO device and want the virtio-net standby device to
> take over. If something goes wrong with unplug or
> migration in general we have to qdev_plug() the device back.
> 
> This series does not try to implement new functionality to close a
> device without freeing the resources.
> 
> From the discussion in this thread I understand that is what libvirt
> needs though. Something that will trigger the unplug from the
> guest but not free the devices resources in the host system (which is
> what qdev_unplug() does). Correct?

This is what I understand we need, but this is not what
qdev_unplug() does.

> 
> Why is it bad to fully re-create the device in case of a failed migration?

Bad or not, I thought the whole point of doing it inside QEMU was
to do something libvirt wouldn't be able to do (namely,
unplugging the device while not freeing resources).  If we are
doing something that management software is already capable of
doing, what's the point?

Quoting a previous message from this thread:

On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
| > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
| > >  This patch series is very
| > > odd precisely because it's trying to do the unplug itself in the
| > > migration phase rather than let the management layer do it - so unless
| > > it's nailed down how to make sure that's really really bullet proof
| > > then we've got to go back and ask the question about whether we should
| > > really fix it so it can be done by the management layer.
| > > 
| > > Dave
| > 
| > management already said they can't because files get closed and
| > resources freed on unplug and so they might not be able to re-add device
| > on migration failure. We do it in migration because that is
| > where failures can happen and we can recover.
Jens Freimann June 4, 2019, 1:43 p.m. UTC | #30
On Mon, Jun 03, 2019 at 04:36:48PM -0300, Eduardo Habkost wrote:
>On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
>> On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
>> > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
>> > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
>> > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
>> > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
>> Why is it bad to fully re-create the device in case of a failed migration?
>
>Bad or not, I thought the whole point of doing it inside QEMU was
>to do something libvirt wouldn't be able to do (namely,
>unplugging the device while not freeing resources).  If we are
>doing something that management software is already capable of
>doing, what's the point?

Even though management software seems to be capable of it, a failover
implementation has never happened. As Michael says network failover is
a mechanism (there's no good reason not to use a PT device if it is
available), not a policy. We are now trying to implement it in a
simple way, contained within QEMU. 

>Quoting a previous message from this thread:
>
>On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
>| > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
>| > >  This patch series is very
>| > > odd precisely because it's trying to do the unplug itself in the
>| > > migration phase rather than let the management layer do it - so unless
>| > > it's nailed down how to make sure that's really really bullet proof
>| > > then we've got to go back and ask the question about whether we should
>| > > really fix it so it can be done by the management layer.
>| > >
>| > > Dave
>| >
>| > management already said they can't because files get closed and
>| > resources freed on unplug and so they might not be able to re-add device
>| > on migration failure. We do it in migration because that is
>| > where failures can happen and we can recover.

This is something that I can work on as well, but it doesn't have to
be part of this patch set in my opinion. Let's say migration fails and we can't
re-plug the primary device. We can still use the standby (virtio-net)
device which would only mean slower networking. How likely is it that
the primary device is grabbed by another VM between unplugging and
migration failure anyway? 

regards,
Jens 


>
>-- 
>Eduardo
Eduardo Habkost June 4, 2019, 2:09 p.m. UTC | #31
On Tue, Jun 04, 2019 at 03:43:21PM +0200, Jens Freimann wrote:
> On Mon, Jun 03, 2019 at 04:36:48PM -0300, Eduardo Habkost wrote:
> > On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
> > > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
> > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > Why is it bad to fully re-create the device in case of a failed migration?
> > 
> > Bad or not, I thought the whole point of doing it inside QEMU was
> > to do something libvirt wouldn't be able to do (namely,
> > unplugging the device while not freeing resources).  If we are
> > doing something that management software is already capable of
> > doing, what's the point?
> 
> Even though management software seems to be capable of it, a failover
> implementation has never happened. As Michael says network failover is
> a mechanism (there's no good reason not to use a PT device if it is
> available), not a policy. We are now trying to implement it in a
> simple way, contained within QEMU.

I don't think this is a strong enough reason to move complexity
to QEMU.

This might look like it's reducing complexity in the
QEMU<->libvirt interface, but having QEMU unplugging/plugging
devices automatically without libvirt involvement is actually
complicating that interface.

That said, I won't try to prevent this from being merged if the
maintainers and libvirt developers agree on this interface.
Michael S. Tsirkin June 4, 2019, 5:06 p.m. UTC | #32
On Tue, Jun 04, 2019 at 03:43:21PM +0200, Jens Freimann wrote:
> On Mon, Jun 03, 2019 at 04:36:48PM -0300, Eduardo Habkost wrote:
> > On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
> > > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
> > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > Why is it bad to fully re-create the device in case of a failed migration?
> > 
> > Bad or not, I thought the whole point of doing it inside QEMU was
> > to do something libvirt wouldn't be able to do (namely,
> > unplugging the device while not freeing resources).  If we are
> > doing something that management software is already capable of
> > doing, what's the point?
> 
> Even though management software seems to be capable of it, a failover
> implementation has never happened. As Michael says network failover is
> a mechanism (there's no good reason not to use a PT device if it is
> available), not a policy. We are now trying to implement it in a
> simple way, contained within QEMU.
> 
> > Quoting a previous message from this thread:
> > 
> > On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
> > | > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > | > >  This patch series is very
> > | > > odd precisely because it's trying to do the unplug itself in the
> > | > > migration phase rather than let the management layer do it - so unless
> > | > > it's nailed down how to make sure that's really really bullet proof
> > | > > then we've got to go back and ask the question about whether we should
> > | > > really fix it so it can be done by the management layer.
> > | > >
> > | > > Dave
> > | >
> > | > management already said they can't because files get closed and
> > | > resources freed on unplug and so they might not be able to re-add device
> > | > on migration failure. We do it in migration because that is
> > | > where failures can happen and we can recover.
> 
> This is something that I can work on as well, but it doesn't have to
> be part of this patch set in my opinion. Let's say migration fails and we can't
> re-plug the primary device. We can still use the standby (virtio-net)
> device which would only mean slower networking. How likely is it that
> the primary device is grabbed by another VM between unplugging and
> migration failure anyway?
> 
> regards,
> Jens

I think I agree with Eduardo it's very important to handle this corner
case correctly. Fast networking outside migration is why people use
failover at all.  Someone who can live with a slower virtio would use
just that.

And IIRC this corner case is exactly why libvirt could not
implement it correctly itself and had to push it up the stack
until it fell off the cliff :).

> 
> > 
> > -- 
> > Eduardo
Dr. David Alan Gilbert June 4, 2019, 7 p.m. UTC | #33
* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Tue, Jun 04, 2019 at 03:43:21PM +0200, Jens Freimann wrote:
> > On Mon, Jun 03, 2019 at 04:36:48PM -0300, Eduardo Habkost wrote:
> > > On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
> > > > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
> > > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > Why is it bad to fully re-create the device in case of a failed migration?
> > > 
> > > Bad or not, I thought the whole point of doing it inside QEMU was
> > > to do something libvirt wouldn't be able to do (namely,
> > > unplugging the device while not freeing resources).  If we are
> > > doing something that management software is already capable of
> > > doing, what's the point?
> > 
> > Even though management software seems to be capable of it, a failover
> > implementation has never happened. As Michael says network failover is
> > a mechanism (there's no good reason not to use a PT device if it is
> > available), not a policy. We are now trying to implement it in a
> > simple way, contained within QEMU.
> > 
> > > Quoting a previous message from this thread:
> > > 
> > > On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
> > > | > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > > | > >  This patch series is very
> > > | > > odd precisely because it's trying to do the unplug itself in the
> > > | > > migration phase rather than let the management layer do it - so unless
> > > | > > it's nailed down how to make sure that's really really bullet proof
> > > | > > then we've got to go back and ask the question about whether we should
> > > | > > really fix it so it can be done by the management layer.
> > > | > >
> > > | > > Dave
> > > | >
> > > | > management already said they can't because files get closed and
> > > | > resources freed on unplug and so they might not be able to re-add device
> > > | > on migration failure. We do it in migration because that is
> > > | > where failures can happen and we can recover.
> > 
> > This is something that I can work on as well, but it doesn't have to
> > be part of this patch set in my opinion. Let's say migration fails and we can't
> > re-plug the primary device. We can still use the standby (virtio-net)
> > device which would only mean slower networking. How likely is it that
> > the primary device is grabbed by another VM between unplugging and
> > migration failure anyway?
> > 
> > regards,
> > Jens
> 
> I think I agree with Eduardo it's very important to handle this corner
> case correctly. Fast networking outside migration is why people use
> failover at all.  Someone who can live with a slower virtio would use
> just that.
> 
> And IIRC this corner case is exactly why libvirt could not
> implement it correctly itself and had to push it up the stack
> until it fell off the cliff :).

So I think we need to have the code that shows we can cope with the
corner cases - or provide a way for libvirt to handle it (which is
my strong preference).

Dave


> > 
> > > 
> > > -- 
> > > Eduardo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Daniel P. Berrangé June 5, 2019, 2:36 p.m. UTC | #34
On Tue, Jun 04, 2019 at 03:43:21PM +0200, Jens Freimann wrote:
> On Mon, Jun 03, 2019 at 04:36:48PM -0300, Eduardo Habkost wrote:
> > On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
> > > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
> > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > Why is it bad to fully re-create the device in case of a failed migration?
> > 
> > Bad or not, I thought the whole point of doing it inside QEMU was
> > to do something libvirt wouldn't be able to do (namely,
> > unplugging the device while not freeing resources).  If we are
> > doing something that management software is already capable of
> > doing, what's the point?
> 
> Even though management software seems to be capable of it, a failover
> implementation has never happened. As Michael says network failover is
> a mechanism (there's no good reason not to use a PT device if it is
> available), not a policy. We are now trying to implement it in a
> simple way, contained within QEMU.
> 
> > Quoting a previous message from this thread:
> > 
> > On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
> > | > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > | > >  This patch series is very
> > | > > odd precisely because it's trying to do the unplug itself in the
> > | > > migration phase rather than let the management layer do it - so unless
> > | > > it's nailed down how to make sure that's really really bullet proof
> > | > > then we've got to go back and ask the question about whether we should
> > | > > really fix it so it can be done by the management layer.
> > | > >
> > | > > Dave
> > | >
> > | > management already said they can't because files get closed and
> > | > resources freed on unplug and so they might not be able to re-add device
> > | > on migration failure. We do it in migration because that is
> > | > where failures can happen and we can recover.
> 
> This is something that I can work on as well, but it doesn't have to
> be part of this patch set in my opinion. Let's say migration fails and we can't
> re-plug the primary device. We can still use the standby (virtio-net)
> device which would only mean slower networking. How likely is it that
> the primary device is grabbed by another VM between unplugging and
> migration failure anyway?

The case of another VM taking the primary device is *not* a problem for
libvirt. We keep track of which device is allocated for use by which
guest, so even if its not currently plugged into the guest, we won't
give it away to a second guest.

The failure scenario is the edge cases where replugging the device fails
for some reason more outside libvirt's control. Running out of file
descriptors, memory allocation failure when pinning guest RAM. Essentially
any failure path that may arise from "device-add vfio..."

In such a case the device won't get replugged. So the mgmt app will think
the migration was rolled back, but the rollback won't be complete as the
original device will be missing.

I guess the question is whether that's really something to worry about ?

Can we justifiably just leave this as a docs problem give that it would
be very rare failure ?

Regards,
Daniel
Daniel P. Berrangé June 5, 2019, 3:20 p.m. UTC | #35
On Mon, Jun 03, 2019 at 12:46:52PM -0600, Alex Williamson wrote:
> On Mon, 3 Jun 2019 14:10:52 -0400
> Laine Stump <laine@redhat.com> wrote:
> 
> > On 6/3/19 4:24 AM, Jens Freimann wrote:
> > > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:  
> > >> On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:  
> > >>> On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:  
> > >>> > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert   
> > >>> wrote:  
> > >>> > > * Jens Freimann (jfreimann@redhat.com) wrote:  
> > >> [...]  
> > >>> > > > +    }
> > >>> > > > +    if (migration_in_setup(s) && !should_be_hidden &&   
> > >>> n->primary_dev) {  
> > >>> > > > +        qdev_unplug(n->primary_dev, &err);  
> > >>> > >
> > >>> > > Not knowing unplug well; can you just explain - is that device hard
> > >>> > > unplugged and it's gone by the time this function returns or is   
> > >>> it still  
> > >>> > > hanging around for some indeterminate time?  
> > >>>
> > >>> Qemu will trigger an unplug request via pcie attention button in 
> > >>> which case
> > >>> there could be a delay by the guest operating system. We could give 
> > >>> it some
> > >>> amount of time and if nothing happens try surpise removal or handle the
> > >>> error otherwise.  
> > >>
> > >> I'm missing something here:
> > >>
> > >> Isn't the whole point of the new device-hiding infrastructure to
> > >> prevent QEMU from closing the VFIO until migration ended
> > >> successfully?  
> > > 
> > > No. The point of hiding it is to only add the VFIO (that is configured
> > > with the same MAC as the virtio-net device) until the
> > > VIRTIO_NET_F_STANDBY feature is negotiated. We don't want to expose to
> > > devices with the same MAC to guests who can't handle it.
> > >   
> > >> What exactly is preventing QEMU from closing the host VFIO device
> > >> after the guest OS has handled the unplug request?  
> > > 
> > > We qdev_unplug() the VFIO device and want the virtio-net standby device to
> > > take over. If something goes wrong with unplug or
> > > migration in general we have to qdev_plug() the device back.
> > > 
> > > This series does not try to implement new functionality to close a
> > > device without freeing the resources.
> > > 
> > >  From the discussion in this thread I understand that is what libvirt
> > > needs though. Something that will trigger the unplug from the
> > > guest but not free the devices resources in the host system (which is
> > > what qdev_unplug() does). Correct?
> > > Why is it bad to fully re-create the device in case of a failed migration?  
> > 
> > I think the concern is that if the device was fully released by qemu 
> > during migration, it might have already been given to some other/new 
> > guest during the time that migration is trying to complete. If migration 
> > then fails, you may be unable to restore the guest to the previous state.
> 
> Yep, plus I think the memory pinning and IOMMU resources could be a
> variable as well.  Essentially, there's no guaranteed reservation to
> the device or any of the additional resources that the device implies
> once it's released, so we want to keep as much of that on hot-standby
> as we can in case the migration fails.  Unfortunately even just
> unmapping the BARs for a guest-only hot-unplug unmaps those regions
> from the IOMMU, but aside from catastrophic resource issues on the
> host, we can essentially guarantee being able to remap those.  Thanks,

Yes its these other resource allocations that are the problem. Libvirt
can easily ensure that the actual PCI device is not given away to a
second guest until migration completes. The mgmt app above libvirt is
likely to ensure this exclusion of PCI devices too.

Regards,
Daniel
Daniel P. Berrangé June 5, 2019, 3:23 p.m. UTC | #36
On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
> On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > Hi David,
> > > > 
> > > > sorry for the  delayed reply.
> > > > 
> > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > > > +static void virtio_net_primary_plug_timer(void *opaque);
> > > > > > > +
> > > > > > >  static void virtio_net_set_link_status(NetClientState *nc)
> > > > > > >  {
> > > > > > >      VirtIONet *n = qemu_get_nic_opaque(nc);
> > > > > > > @@ -786,6 +796,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > > > > > >      } else {
> > > > > > >          memset(n->vlans, 0xff, MAX_VLAN >> 3);
> > > > > > >      }
> > > > > > > +
> > > > > > > +    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
> > > > > > > +        atomic_set(&n->primary_should_be_hidden, false);
> > > > > > > +        if (n->primary_device_timer)
> > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > +                4000);
> > > > > > > +    }
> > > > > > 
> > > > > > What's this magic timer constant and why?
> > > > 
> > > > To be honest it's a leftover from previous versions (before I took
> > > > over) of the patches and I'm not sure why the timer is there.
> > > > I removed it and so far see no reason to keep it.
> > > > 
> > > > > > 
> > > > > > >  }
> > > > > > >
> > > > > > >  static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
> > > > > > > @@ -2626,6 +2644,87 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
> > > > > > >      n->netclient_type = g_strdup(type);
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtio_net_primary_plug_timer(void *opaque)
> > > > > > > +{
> > > > > > > +    VirtIONet *n = opaque;
> > > > > > > +    Error *err = NULL;
> > > > > > > +
> > > > > > > +    if (n->primary_device_dict)
> > > > > > > +        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
> > > > > > > +            n->primary_device_dict, &err);
> > > > > > > +    if (n->primary_device_opts) {
> > > > > > > +        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
> > > > > > > +        error_setg(&err, "virtio_net: couldn't plug in primary device");
> > > > > > > +        return;
> > > > > > > +    }
> > > > > > > +    if (!n->primary_device_dict && err) {
> > > > > > > +        if (n->primary_device_timer) {
> > > > > > > +            timer_mod(n->primary_device_timer,
> > > > > > > +                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > > > > > > +                100);
> > > > > > 
> > > > > > same here.
> > > > 
> > > > see above
> > > > 
> > > > > > 
> > > > > > 
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void virtio_net_handle_migration_primary(VirtIONet *n,
> > > > > > > +                                                MigrationState *s)
> > > > > > > +{
> > > > > > > +    Error *err = NULL;
> > > > > > > +    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
> > > > > > > +
> > > > > > > +    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
> > > > > > > +            n->primary_device_id);
> > > > > > > +    if (!n->primary_dev) {
> > > > > > > +        error_setg(&err, "virtio_net: couldn't find primary device");
> > > > > > 
> > > > > > There's something broken with the error handling in this function - the
> > > > > > 'err' never goes anywhere - I don't think it ever gets printed or
> > > > > > reported or stops the migration.
> > > > 
> > > > yes, I'll fix it.
> > > > 
> > > > > > > +    }
> > > > > > > +    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
> > > > > > > +        qdev_unplug(n->primary_dev, &err);
> > > > > > 
> > > > > > Not knowing unplug well; can you just explain - is that device hard
> > > > > > unplugged and it's gone by the time this function returns or is it still
> > > > > > hanging around for some indeterminate time?
> > > > 
> > > > Qemu will trigger an unplug request via pcie attention button in which case
> > > > there could be a delay by the guest operating system. We could give it some
> > > > amount of time and if nothing happens try surpise removal or handle the
> > > > error otherwise.
> > > > 
> > > > 
> > > > regards,
> > > > Jens
> > > 
> > > That's a subject for another day. Let's get the basic thing
> > > working.
> > 
> > Well no, we need to know this thing isn't going to hang in the migration
> > setup phase, or if it does how we recover.
> 
> 
> This thing is *supposed* to be stuck in migration startup phase
> if guest is malicious.
> 
> If migration does not progress management needs
> a way to detect this and cancel.
> 
> Some more documentation about how this is supposed to happen
> would be helpful.

We need more than merely documentation in this area. We need some explicit
migration status or event exported from QMP to reflect that fact that QEMU
has not yet started migration, because it is waiting for PCI device unplug
to complete.

Regards,
Daniel
Laine Stump June 5, 2019, 4:04 p.m. UTC | #37
On 6/4/19 9:43 AM, Jens Freimann wrote:
> On Mon, Jun 03, 2019 at 04:36:48PM -0300, Eduardo Habkost wrote:
>> On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
>>> On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
>>> > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
>>> > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
>>> > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan 
>>> Gilbert wrote:
>>> > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
>>> Why is it bad to fully re-create the device in case of a failed 
>>> migration?
>>
>> Bad or not, I thought the whole point of doing it inside QEMU was
>> to do something libvirt wouldn't be able to do (namely,
>> unplugging the device while not freeing resources).  If we are
>> doing something that management software is already capable of
>> doing, what's the point?
> 
> Event though management software seems to be capable of it, a failover
> implementation has never happened.

I'm pretty sure RHV/oVirt+vdsm has implemented it and it is even being 
used in production. Of course it requires a bond/team device to be 
configured in the guest OS, but the part about auto-detaching the VF 
before migration, then reattaching a similar VF on the destination is 
all done by vdsm. (Don't misunderstand this as discouraging this new 
method! Just wanted to set the record straight.)
Daniel P. Berrangé June 5, 2019, 4:19 p.m. UTC | #38
On Wed, Jun 05, 2019 at 12:04:28PM -0400, Laine Stump wrote:
> On 6/4/19 9:43 AM, Jens Freimann wrote:
> > On Mon, Jun 03, 2019 at 04:36:48PM -0300, Eduardo Habkost wrote:
> > > On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
> > > > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
> > > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan
> > > > Gilbert wrote:
> > > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > Why is it bad to fully re-create the device in case of a failed
> > > > migration?
> > > 
> > > Bad or not, I thought the whole point of doing it inside QEMU was
> > > to do something libvirt wouldn't be able to do (namely,
> > > unplugging the device while not freeing resources).  If we are
> > > doing something that management software is already capable of
> > > doing, what's the point?
> > 
> > Event though management software seems to be capable of it, a failover
> > implementation has never happened.
> 
> I'm pretty sure RHV/oVirt+vdsm has implemented it and it is even being used
> in production. Of course it requires a bond/team device to be configured in
> the guest OS, but the part about auto-detaching the VF before migration,
> then reattaching a similar VF on the destination is all done by vdsm. (Don't
> misunderstand this as discouraging this new method! Just wanted to set the
> record straight.)

OpenStack will detach/reattach PCI devices around a save-to-disk, but
does not currently do that for live migration, but easily could do. The
blocker why they've not done that is the issue around exposing to the
guest which pairs of devices are intended to be used together in the
bond.

OpenStack could have defined a way to express that, but it would be
specific to OpenStack which is not very desirable. Standardization
of how to express the relationship between the pair of devices would
be very desirable, allowing the host side solution to be done in either
QEMU or the mgmt app as they see fit.

Regards,
Daniel
Roman Kagan June 6, 2019, 3 p.m. UTC | #39
On Mon, Jun 03, 2019 at 12:46:52PM -0600, Alex Williamson wrote:
> On Mon, 3 Jun 2019 14:10:52 -0400
> Laine Stump <laine@redhat.com> wrote:
> 
> > On 6/3/19 4:24 AM, Jens Freimann wrote:
> > > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:  
> > >> On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:  
> > >>> On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:  
> > >>> > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert   
> > >>> wrote:  
> > >>> > > * Jens Freimann (jfreimann@redhat.com) wrote:  
> > >> [...]  
> > >>> > > > +    }
> > >>> > > > +    if (migration_in_setup(s) && !should_be_hidden &&   
> > >>> n->primary_dev) {  
> > >>> > > > +        qdev_unplug(n->primary_dev, &err);  
> > >>> > >
> > >>> > > Not knowing unplug well; can you just explain - is that device hard
> > >>> > > unplugged and it's gone by the time this function returns or is   
> > >>> it still  
> > >>> > > hanging around for some indeterminate time?  
> > >>>
> > >>> Qemu will trigger an unplug request via pcie attention button in 
> > >>> which case
> > >>> there could be a delay by the guest operating system. We could give 
> > >>> it some
> > >>> amount of time and if nothing happens try surpise removal or handle the
> > >>> error otherwise.  
> > >>
> > >> I'm missing something here:
> > >>
> > >> Isn't the whole point of the new device-hiding infrastructure to
> > >> prevent QEMU from closing the VFIO until migration ended
> > >> successfully?  
> > > 
> > > No. The point of hiding it is to only add the VFIO (that is configured
> > > with the same MAC as the virtio-net device) until the
> > > VIRTIO_NET_F_STANDBY feature is negotiated. We don't want to expose to
> > > devices with the same MAC to guests who can't handle it.
> > >   
> > >> What exactly is preventing QEMU from closing the host VFIO device
> > >> after the guest OS has handled the unplug request?  
> > > 
> > > We qdev_unplug() the VFIO device and want the virtio-net standby device to
> > > take over. If something goes wrong with unplug or
> > > migration in general we have to qdev_plug() the device back.
> > > 
> > > This series does not try to implement new functionality to close a
> > > device without freeing the resources.
> > > 
> > >  From the discussion in this thread I understand that is what libvirt
> > > needs though. Something that will trigger the unplug from the
> > > guest but not free the devices resources in the host system (which is
> > > what qdev_unplug() does). Correct?
> > > Why is it bad to fully re-create the device in case of a failed migration?  
> > 
> > I think the concern is that if the device was fully released by qemu 
> > during migration, it might have already been given to some other/new 
> > guest during the time that migration is trying to complete. If migration 
> > then fails, you may be unable to restore the guest to the previous state.
> 
> Yep, plus I think the memory pinning and IOMMU resources could be a
> variable as well.  Essentially, there's no guaranteed reservation to
> the device or any of the additional resources that the device implies
> once it's released, so we want to keep as much of that on hot-standby
> as we can in case the migration fails.  Unfortunately even just
> unmapping the BARs for a guest-only hot-unplug unmaps those regions
> from the IOMMU, but aside from catastrophic resource issues on the
> host, we can essentially guarantee being able to remap those.  Thanks,

Isn't this also the case for guest (re)boots?

IOW libvirt/mgmt will anyway have to be aware of the PT-PV pairing and
that pair being in a degraded state sometimes.  Then the migration of
such VMs would just imply transitioning to the degraded state prior to
the actual migration.  Which sounds much like putting the mostly
existing bits together in libvirt/mgmt and nothing to be done in QEMU.

Am I missing anything?

Thanks,
Roman.
Jens Freimann June 7, 2019, 2:14 p.m. UTC | #40
On Tue, Jun 04, 2019 at 08:00:19PM +0100, Dr. David Alan Gilbert wrote:
>* Michael S. Tsirkin (mst@redhat.com) wrote:
>> On Tue, Jun 04, 2019 at 03:43:21PM +0200, Jens Freimann wrote:
>> > On Mon, Jun 03, 2019 at 04:36:48PM -0300, Eduardo Habkost wrote:
>> > > On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
>> > > > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
>> > > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
>> > > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
>> > > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
>> > > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
>> > > > Why is it bad to fully re-create the device in case of a failed migration?
>> > >
>> > > Bad or not, I thought the whole point of doing it inside QEMU was
>> > > to do something libvirt wouldn't be able to do (namely,
>> > > unplugging the device while not freeing resources).  If we are
>> > > doing something that management software is already capable of
>> > > doing, what's the point?
>> >
>> > Event though management software seems to be capable of it, a failover
>> > implementation has never happened. As Michael says network failover is
>> > a mechanism (there's no good reason not to use a PT device if it is
>> > available), not a policy. We are now trying to implement it in a
>> > simple way, contained within QEMU.
>> >
>> > > Quoting a previous message from this thread:
>> > >
>> > > On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
>> > > | > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
>> > > | > >  This patch series is very
>> > > | > > odd precisely because it's trying to do the unplug itself in the
>> > > | > > migration phase rather than let the management layer do it - so unless
>> > > | > > it's nailed down how to make sure that's really really bullet proof
>> > > | > > then we've got to go back and ask the question about whether we should
>> > > | > > really fix it so it can be done by the management layer.
>> > > | > >
>> > > | > > Dave
>> > > | >
>> > > | > management already said they can't because files get closed and
>> > > | > resources freed on unplug and so they might not be able to re-add device
>> > > | > on migration failure. We do it in migration because that is
>> > > | > where failures can happen and we can recover.
>> >
>> > This is something that I can work on as well, but it doesn't have to
>> > be part of this patch set in my opinion. Let's say migration fails and we can't
>> > re-plug the primary device. We can still use the standby (virtio-net)
>> > device which would only mean slower networking. How likely is it that
>> > the primary device is grabbed by another VM between unplugging and
>> > migration failure anyway?
>> >
>> > regards,
>> > Jens
>>
>> I think I agree with Eduardo it's very important to handle this corner
>> case correctly. Fast networking outside migration is why people use
>> failover at all.  Someone who can live with a slower virtio would use
>> just that.
>>
>> And IIRC this corner case is exactly why libvirt could not
>> implement it correctly itself and had to push it up the stack
>> until it fell off the cliff :).
>
>So I think we need to have the code that shows we can cope with the
>corner cases - or provide a way for libvirt to handle it (which is
>my strong preference).

Would this work: We add a new migration state MIGRATE_WAIT_UNPLUG (or
a better more generic name) which tells libvirt that migration has not
started yet because we are waiting for the guest. And extend the qmp
events for the migration state. When we know the device was
successfully unplugged we send a qmp event DEVICE_DELETED or a new one
DEVICE_DELETED_PARTIALLY (not sure about that yet), let migration
start and set the migration state to active?

To do a partial unplug I imagine, we have to separate vfio(-pci) code
to differ between release of resources (fds, mappings etc) and unplug
(I haven't yet found out how it works in vfio). In the failover case
we only do the unplug part but not the release part. 

regards,
Jens
Michael S. Tsirkin June 7, 2019, 2:32 p.m. UTC | #41
On Fri, Jun 07, 2019 at 04:14:07PM +0200, Jens Freimann wrote:
> On Tue, Jun 04, 2019 at 08:00:19PM +0100, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > On Tue, Jun 04, 2019 at 03:43:21PM +0200, Jens Freimann wrote:
> > > > On Mon, Jun 03, 2019 at 04:36:48PM -0300, Eduardo Habkost wrote:
> > > > > On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
> > > > > > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
> > > > > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > > Why is it bad to fully re-create the device in case of a failed migration?
> > > > >
> > > > > Bad or not, I thought the whole point of doing it inside QEMU was
> > > > > to do something libvirt wouldn't be able to do (namely,
> > > > > unplugging the device while not freeing resources).  If we are
> > > > > doing something that management software is already capable of
> > > > > doing, what's the point?
> > > >
> > > > Event though management software seems to be capable of it, a failover
> > > > implementation has never happened. As Michael says network failover is
> > > > a mechanism (there's no good reason not to use a PT device if it is
> > > > available), not a policy. We are now trying to implement it in a
> > > > simple way, contained within QEMU.
> > > >
> > > > > Quoting a previous message from this thread:
> > > > >
> > > > > On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
> > > > > | > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > > > > | > >  This patch series is very
> > > > > | > > odd precisely because it's trying to do the unplug itself in the
> > > > > | > > migration phase rather than let the management layer do it - so unless
> > > > > | > > it's nailed down how to make sure that's really really bullet proof
> > > > > | > > then we've got to go back and ask the question about whether we should
> > > > > | > > really fix it so it can be done by the management layer.
> > > > > | > >
> > > > > | > > Dave
> > > > > | >
> > > > > | > management already said they can't because files get closed and
> > > > > | > resources freed on unplug and so they might not be able to re-add device
> > > > > | > on migration failure. We do it in migration because that is
> > > > > | > where failures can happen and we can recover.
> > > >
> > > > This is something that I can work on as well, but it doesn't have to
> > > > be part of this patch set in my opinion. Let's say migration fails and we can't
> > > > re-plug the primary device. We can still use the standby (virtio-net)
> > > > device which would only mean slower networking. How likely is it that
> > > > the primary device is grabbed by another VM between unplugging and
> > > > migration failure anyway?
> > > >
> > > > regards,
> > > > Jens
> > > 
> > > I think I agree with Eduardo it's very important to handle this corner
> > > case correctly. Fast networking outside migration is why people use
> > > failover at all.  Someone who can live with a slower virtio would use
> > > just that.
> > > 
> > > And IIRC this corner case is exactly why libvirt could not
> > > implement it correctly itself and had to push it up the stack
> > > until it fell off the cliff :).
> > 
> > So I think we need to have the code that shows we can cope with the
> > corner cases - or provide a way for libvirt to handle it (which is
> > my strong preference).
> 
> Would this work: We add a new migration state MIGRATE_WAIT_UNPLUG (or
> a better more generic name) which tells libvirt that migration has not
> started yet because we are waiting for the guest. And extend the qmp
> events for the migration state. When we know the device was
> sucessfully unplugged we sent a qmp event DEVICE_DELETED or a new one
> DEVICE_DELETED_PARTIALLY (not sure about that yet), let migration
> start and set the migration state to active?
> 
> To do a partial unplug I imagine, we have to separate vfio(-pci) code
> to differ between release of resources (fds, mappings etc) and unplug
> (I haven't yet found out how it works in vfio). In the failover case
> we only do the unplug part but not the release part.
> 
> regards,
> Jens

I think the first is done in vfio_exitfn and the second in
vfio_instance_finalize.
Dr. David Alan Gilbert June 7, 2019, 5:51 p.m. UTC | #42
* Jens Freimann (jfreimann@redhat.com) wrote:
> On Tue, Jun 04, 2019 at 08:00:19PM +0100, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > On Tue, Jun 04, 2019 at 03:43:21PM +0200, Jens Freimann wrote:
> > > > On Mon, Jun 03, 2019 at 04:36:48PM -0300, Eduardo Habkost wrote:
> > > > > On Mon, Jun 03, 2019 at 10:24:56AM +0200, Jens Freimann wrote:
> > > > > > On Fri, May 31, 2019 at 06:47:48PM -0300, Eduardo Habkost wrote:
> > > > > > > On Thu, May 30, 2019 at 04:56:45PM +0200, Jens Freimann wrote:
> > > > > > > > On Tue, May 28, 2019 at 11:04:15AM -0400, Michael S. Tsirkin wrote:
> > > > > > > > > On Tue, May 21, 2019 at 10:45:05AM +0100, Dr. David Alan Gilbert wrote:
> > > > > > > > > > * Jens Freimann (jfreimann@redhat.com) wrote:
> > > > > > Why is it bad to fully re-create the device in case of a failed migration?
> > > > >
> > > > > Bad or not, I thought the whole point of doing it inside QEMU was
> > > > > to do something libvirt wouldn't be able to do (namely,
> > > > > unplugging the device while not freeing resources).  If we are
> > > > > doing something that management software is already capable of
> > > > > doing, what's the point?
> > > >
> > > > Event though management software seems to be capable of it, a failover
> > > > implementation has never happened. As Michael says network failover is
> > > > a mechanism (there's no good reason not to use a PT device if it is
> > > > available), not a policy. We are now trying to implement it in a
> > > > simple way, contained within QEMU.
> > > >
> > > > > Quoting a previous message from this thread:
> > > > >
> > > > > On Thu, May 30, 2019 at 02:09:42PM -0400, Michael S. Tsirkin wrote:
> > > > > | > On Thu, May 30, 2019 at 07:00:23PM +0100, Dr. David Alan Gilbert wrote:
> > > > > | > >  This patch series is very
> > > > > | > > odd precisely because it's trying to do the unplug itself in the
> > > > > | > > migration phase rather than let the management layer do it - so unless
> > > > > | > > it's nailed down how to make sure that's really really bullet proof
> > > > > | > > then we've got to go back and ask the question about whether we should
> > > > > | > > really fix it so it can be done by the management layer.
> > > > > | > >
> > > > > | > > Dave
> > > > > | >
> > > > > | > management already said they can't because files get closed and
> > > > > | > resources freed on unplug and so they might not be able to re-add device
> > > > > | > on migration failure. We do it in migration because that is
> > > > > | > where failures can happen and we can recover.
> > > >
> > > > This is something that I can work on as well, but it doesn't have to
> > > > be part of this patch set in my opinion. Let's say migration fails and we can't
> > > > re-plug the primary device. We can still use the standby (virtio-net)
> > > > device which would only mean slower networking. How likely is it that
> > > > the primary device is grabbed by another VM between unplugging and
> > > > migration failure anyway?
> > > >
> > > > regards,
> > > > Jens
> > > 
> > > I think I agree with Eduardo it's very important to handle this corner
> > > case correctly. Fast networking outside migration is why people use
> > > failover at all.  Someone who can live with a slower virtio would use
> > > just that.
> > > 
> > > And IIRC this corner case is exactly why libvirt could not
> > > implement it correctly itself and had to push it up the stack
> > > until it fell off the cliff :).
> > 
> > So I think we need to have the code that shows we can cope with the
> > corner cases - or provide a way for libvirt to handle it (which is
> > my strong preference).
> 
> Would this work: We add a new migration state MIGRATE_WAIT_UNPLUG (or
> a better more generic name) which tells libvirt that migration has not
> started yet because we are waiting for the guest. And extend the qmp
> events for the migration state. When we know the device was
> sucessfully unplugged we sent a qmp event DEVICE_DELETED or a new one
> DEVICE_DELETED_PARTIALLY (not sure about that yet), let migration
> start and set the migration state to active?

Potentially; let's see what the libvirt people have to say.
What happens if you have multiple devices and one of them unplugs OK and
then the other fails?

> To do a partial unplug I imagine, we have to separate vfio(-pci) code
> to differ between release of resources (fds, mappings etc) and unplug
> (I haven't yet found out how it works in vfio). In the failover case
> we only do the unplug part but not the release part.

Dave

> regards,
> Jens
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox series

Patch

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index ffe0872fff..120eccbb98 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -12,6 +12,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu/atomic.h"
 #include "qemu/iov.h"
 #include "hw/virtio/virtio.h"
 #include "net/net.h"
@@ -19,6 +20,10 @@ 
 #include "net/tap.h"
 #include "qemu/error-report.h"
 #include "qemu/timer.h"
+#include "qemu/option.h"
+#include "qemu/option_int.h"
+#include "qemu/config-file.h"
+#include "qapi/qmp/qdict.h"
 #include "hw/virtio/virtio-net.h"
 #include "net/vhost_net.h"
 #include "net/announce.h"
@@ -29,6 +34,8 @@ 
 #include "migration/misc.h"
 #include "standard-headers/linux/ethtool.h"
 #include "trace.h"
+#include "monitor/qdev.h"
+#include "hw/pci/pci.h"
 
 #define VIRTIO_NET_VM_VERSION    11
 
@@ -364,6 +371,9 @@  static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
     }
 }
 
+
+static void virtio_net_primary_plug_timer(void *opaque);
+
 static void virtio_net_set_link_status(NetClientState *nc)
 {
     VirtIONet *n = qemu_get_nic_opaque(nc);
@@ -786,6 +796,14 @@  static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
     } else {
         memset(n->vlans, 0xff, MAX_VLAN >> 3);
     }
+
+    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
+        atomic_set(&n->primary_should_be_hidden, false);
+        if (n->primary_device_timer)
+            timer_mod(n->primary_device_timer,
+                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                4000);
+    }
 }
 
 static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
@@ -2626,6 +2644,87 @@  void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
     n->netclient_type = g_strdup(type);
 }
 
+static void virtio_net_primary_plug_timer(void *opaque)
+{
+    VirtIONet *n = opaque;
+    Error *err = NULL;
+
+    if (n->primary_device_dict)
+        n->primary_device_opts = qemu_opts_from_qdict(qemu_find_opts("device"),
+            n->primary_device_dict, &err);
+    if (n->primary_device_opts) {
+        n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
+        error_setg(&err, "virtio_net: couldn't plug in primary device");
+        return;
+    }
+    if (!n->primary_device_dict && err) {
+        if (n->primary_device_timer) {
+            timer_mod(n->primary_device_timer,
+                qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                100);
+        }
+    }
+}
+
+static void virtio_net_handle_migration_primary(VirtIONet *n,
+                                                MigrationState *s)
+{
+    Error *err = NULL;
+    bool should_be_hidden = atomic_read(&n->primary_should_be_hidden);
+
+    n->primary_dev = qdev_find_recursive(sysbus_get_default(),
+            n->primary_device_id);
+    if (!n->primary_dev) {
+        error_setg(&err, "virtio_net: couldn't find primary device");
+    }
+    if (migration_in_setup(s) && !should_be_hidden && n->primary_dev) {
+        qdev_unplug(n->primary_dev, &err);
+        if (!err) {
+            atomic_set(&n->primary_should_be_hidden, true);
+            n->primary_dev = NULL;
+        }
+    } else if (migration_has_failed(s)) {
+        if (should_be_hidden && !n->primary_dev) {
+            /* We already unplugged the device; let's plug it back in */
+            n->primary_dev = qdev_device_add(n->primary_device_opts, &err);
+        }
+    }
+}
+
+static void migration_state_notifier(Notifier *notifier, void *data)
+{
+    MigrationState *s = data;
+    VirtIONet *n = container_of(notifier, VirtIONet, migration_state);
+    virtio_net_handle_migration_primary(n, s);
+}
+
+static void virtio_net_primary_should_be_hidden(DeviceListener *listener,
+            QemuOpts *device_opts, bool *match_found, bool *res)
+{
+    VirtIONet *n = container_of(listener, VirtIONet, primary_listener);
+
+    if (device_opts) {
+        n->primary_device_dict = qemu_opts_to_qdict(device_opts,
+                n->primary_device_dict);
+    }
+    g_free(n->standby_id);
+    n->standby_id = g_strdup(qdict_get_try_str(n->primary_device_dict,
+                             "standby"));
+    if (n->standby_id) {
+        *match_found = true;
+    }
+    /* primary_should_be_hidden is set during feature negotiation */
+    if (atomic_read(&n->primary_should_be_hidden) && *match_found) {
+        *res = true;
+    } else if (*match_found)  {
+        n->primary_device_dict = qemu_opts_to_qdict(device_opts,
+                n->primary_device_dict);
+        *res = false;
+    }
+    g_free(n->primary_device_id);
+    n->primary_device_id = g_strdup(device_opts->id);
+}
+
 static void virtio_net_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -2656,6 +2755,18 @@  static void virtio_net_device_realize(DeviceState *dev, Error **errp)
         n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
     }
 
+    if (n->failover) {
+        n->primary_listener.should_be_hidden =
+            virtio_net_primary_should_be_hidden;
+        atomic_set(&n->primary_should_be_hidden, true);
+        device_listener_register(&n->primary_listener);
+        n->migration_state.notify = migration_state_notifier;
+        add_migration_state_change_notifier(&n->migration_state);
+        n->host_features |= (1ULL << VIRTIO_NET_F_STANDBY);
+        n->primary_device_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+                                     virtio_net_primary_plug_timer, n);
+    }
+
     virtio_net_set_config_size(n, n->host_features);
     virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size);
 
@@ -2778,6 +2889,11 @@  static void virtio_net_device_unrealize(DeviceState *dev, Error **errp)
     g_free(n->mac_table.macs);
     g_free(n->vlans);
 
+    g_free(n->primary_device_id);
+    g_free(n->standby_id);
+    qobject_unref(n->primary_device_dict);
+    n->primary_device_dict = NULL;
+
     max_queues = n->multiqueue ? n->max_queues : 1;
     for (i = 0; i < max_queues; i++) {
         virtio_net_del_queue(n, i);
@@ -2885,6 +3001,7 @@  static Property virtio_net_properties[] = {
                      true),
     DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
     DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
+    DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index b96f0c643f..c2bb6ada44 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -18,6 +18,7 @@ 
 #include "standard-headers/linux/virtio_net.h"
 #include "hw/virtio/virtio.h"
 #include "net/announce.h"
+#include "qemu/option_int.h"
 
 #define TYPE_VIRTIO_NET "virtio-net-device"
 #define VIRTIO_NET(obj) \
@@ -43,6 +44,7 @@  typedef struct virtio_net_conf
     int32_t speed;
     char *duplex_str;
     uint8_t duplex;
+    char *primary_id_str;
 } virtio_net_conf;
 
 /* Coalesced packets type & status */
@@ -185,6 +187,16 @@  struct VirtIONet {
     AnnounceTimer announce_timer;
     bool needs_vnet_hdr_swap;
     bool mtu_bypass_backend;
+    QemuOpts *primary_device_opts;
+    QDict *primary_device_dict;
+    DeviceState *primary_dev;
+    char *primary_device_id;
+    char *standby_id;
+    bool primary_should_be_hidden;
+    bool failover;
+    DeviceListener primary_listener;
+    QEMUTimer *primary_device_timer;
+    Notifier migration_state;
 };
 
 void virtio_net_set_netclient_name(VirtIONet *n, const char *name,