[RFC] virtio-pci: disable vring processing when bus-mastering is disabled
diff mbox series

Message ID 20191113054301.31529-1-mdroth@linux.vnet.ibm.com
State New
Headers show
Series
  • [RFC] virtio-pci: disable vring processing when bus-mastering is disabled
Related show

Commit Message

Michael Roth Nov. 13, 2019, 5:43 a.m. UTC
Currently the SLOF firmware for pseries guests will disable/re-enable
a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND
register after the initial probe/feature negotiation, as it tends to
work with a single device at a time at various stages like probing
and running block/network bootloaders without doing a full reset
in-between.

In QEMU, when PCI_COMMAND_MASTER is disabled we disable the
corresponding IOMMU memory region, so DMA accesses (including to vring
fields like idx/flags) will no longer undergo the necessary
translation. Normally we wouldn't expect this to happen since it would
be misbehavior on the driver side to continue driving DMA requests.

However, in the case of pseries, with iommu_platform=on, we trigger the
following sequence when tearing down the virtio-blk dataplane ioeventfd
in response to the guest unsetting PCI_COMMAND_MASTER:

  #2  0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0, p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, pa=0, sz=0)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757
  #3  0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950
  #4  0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0)
      at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255
  #5  0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660)
      at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776
  #6  0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550
  #7  0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546
  #8  0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527
  #9  0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0, timeout=timeout@entry=0x7fffe65844a8)
      at /home/mdroth/w/qemu.git/util/aio-posix.c:520
  #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0)
      at /home/mdroth/w/qemu.git/util/aio-posix.c:607
  #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0, blocking=blocking@entry=true)
      at /home/mdroth/w/qemu.git/util/aio-posix.c:639
  #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0, cb=cb@entry=0x5555558d5130 <virtio_blk_data_plane_stop_bh>, opaque=opaque@entry=0x555556de86f0)
      at /home/mdroth/w/qemu.git/util/aio-wait.c:71
  #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=<optimized out>)
      at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288
  #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245
  #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237
  #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292
  #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40, address=<optimized out>, val=1048832, len=<optimized out>)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613

I.e. the calling code is only scheduling a one-shot BH for
virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
an additional virtqueue entry before we get there. This is likely due
to the following check in virtio_queue_host_notifier_aio_poll:

  static bool virtio_queue_host_notifier_aio_poll(void *opaque)
  {
      EventNotifier *n = opaque;
      VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
      bool progress;

      if (!vq->vring.desc || virtio_queue_empty(vq)) {
          return false;
      }

      progress = virtio_queue_notify_aio_vq(vq);

namely the call to virtio_queue_empty(). In this case, since no new
requests have actually been issued, shadow_avail_idx == last_avail_idx,
so we actually try to access the vring via vring_avail_idx() to get
the latest non-shadowed idx:

  int virtio_queue_empty(VirtQueue *vq)
  {
      bool empty;
      ...

      if (vq->shadow_avail_idx != vq->last_avail_idx) {
          return 0;
      }

      rcu_read_lock();
      empty = vring_avail_idx(vq) == vq->last_avail_idx;
      rcu_read_unlock();
      return empty;

but since the IOMMU region has been disabled we get a bogus value (0
usually), which causes virtio_queue_empty() to falsely report that
there are entries to be processed, which causes errors such as:

  "virtio: zero sized buffers are not allowed"

or

  "virtio-blk missing headers"

and puts the device in an error state.

This patch works around the issue by introducing virtio_set_disabled(),
which piggy-backs off the vdev->broken flag we already use to bypass
checks like virtio_queue_empty(), and sets/unsets it in response to
enabling/disabling bus-mastering.

NOTES:

 - It's possible we could also work around this in SLOF by doing a
   full reset instead of relying on PCI_COMMAND to enable/disable, but
   in any case the QEMU behavior seems wrong.
 - This leaves some other oddities in play, like the fact that
   DRIVER_OK also gets unset in response to bus-mastering being
   disabled, but not restored (however the device seems to continue
   working)
 - Similarly, we disable the host notifier via
   virtio_bus_stop_ioeventfd(), which seems to move the handling out
   of virtio-blk dataplane and back into the main IO thread, and it
   ends up staying there till a reset (but otherwise continues working
   normally)

Cc: David Gibson <david@gibson.dropbear.id.au>,
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
---
 hw/virtio/virtio-pci.c     | 12 ++++++++----
 hw/virtio/virtio.c         |  7 ++++++-
 include/hw/virtio/virtio.h |  1 +
 3 files changed, 15 insertions(+), 5 deletions(-)

Comments

Michael S. Tsirkin Nov. 13, 2019, 10:09 a.m. UTC | #1
On Tue, Nov 12, 2019 at 11:43:01PM -0600, Michael Roth wrote:
> Currently the SLOF firmware for pseries guests will disable/re-enable
> a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND
> register after the initial probe/feature negotiation, as it tends to
> work with a single device at a time at various stages like probing
> and running block/network bootloaders without doing a full reset
> in-between.
> 
> In QEMU, when PCI_COMMAND_MASTER is disabled we disable the
> corresponding IOMMU memory region, so DMA accesses (including to vring
> fields like idx/flags) will no longer undergo the necessary
> translation. Normally we wouldn't expect this to happen since it would
> be misbehavior on the driver side to continue driving DMA requests.
> 
> However, in the case of pseries, with iommu_platform=on, we trigger the
> following sequence when tearing down the virtio-blk dataplane ioeventfd
> in response to the guest unsetting PCI_COMMAND_MASTER:
> 
>   #2  0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0, p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, pa=0, sz=0)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757
>   #3  0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950
>   #4  0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0)
>       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255
>   #5  0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660)
>       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776
>   #6  0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550
>   #7  0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546
>   #8  0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527
>   #9  0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0, timeout=timeout@entry=0x7fffe65844a8)
>       at /home/mdroth/w/qemu.git/util/aio-posix.c:520
>   #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0)
>       at /home/mdroth/w/qemu.git/util/aio-posix.c:607
>   #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0, blocking=blocking@entry=true)
>       at /home/mdroth/w/qemu.git/util/aio-posix.c:639
>   #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0, cb=cb@entry=0x5555558d5130 <virtio_blk_data_plane_stop_bh>, opaque=opaque@entry=0x555556de86f0)
>       at /home/mdroth/w/qemu.git/util/aio-wait.c:71
>   #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=<optimized out>)
>       at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288
>   #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245
>   #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237
>   #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292
>   #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40, address=<optimized out>, val=1048832, len=<optimized out>)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613
> 
> I.e. the calling code is only scheduling a one-shot BH for
> virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
> an additional virtqueue entry before we get there. This is likely due
> to the following check in virtio_queue_host_notifier_aio_poll:
> 
>   static bool virtio_queue_host_notifier_aio_poll(void *opaque)
>   {
>       EventNotifier *n = opaque;
>       VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
>       bool progress;
> 
>       if (!vq->vring.desc || virtio_queue_empty(vq)) {
>           return false;
>       }
> 
>       progress = virtio_queue_notify_aio_vq(vq);
> 
> namely the call to virtio_queue_empty(). In this case, since no new
> requests have actually been issued, shadow_avail_idx == last_avail_idx,
> so we actually try to access the vring via vring_avail_idx() to get
> the latest non-shadowed idx:
> 
>   int virtio_queue_empty(VirtQueue *vq)
>   {
>       bool empty;
>       ...
> 
>       if (vq->shadow_avail_idx != vq->last_avail_idx) {
>           return 0;
>       }
> 
>       rcu_read_lock();
>       empty = vring_avail_idx(vq) == vq->last_avail_idx;
>       rcu_read_unlock();
>       return empty;
> 
> but since the IOMMU region has been disabled we get a bogus value (0
> usually), which causes virtio_queue_empty() to falsely report that
> there are entries to be processed, which causes errors such as:
> 
>   "virtio: zero sized buffers are not allowed"
> 
> or
> 
>   "virtio-blk missing headers"
> 
> and puts the device in an error state.
> 
> This patch works around the issue by introducing virtio_set_disabled(),
> which piggy-backs off the vdev->broken flag we already use to bypass
> checks like virtio_queue_empty(), and sets/unsets it in response to
> enabling/disabling bus-mastering.
> 
> NOTES:
> 
>  - It's possible we could also work around this in SLOF by doing a
>    full reset instead of relying on PCI_COMMAND to enable/disable, but
>    in any case the QEMU behavior seems wrong.
>  - This leaves some other oddities in play, like the fact that
>    DRIVER_OK also gets unset in response to bus-mastering being
>    disabled, but not restored (however the device seems to continue
>    working)
>  - Similarly, we disable the host notifier via
>    virtio_bus_stop_ioeventfd(), which seems to move the handling out
>    of virtio-blk dataplane and back into the main IO thread, and it
>    ends up staying there till a reset (but otherwise continues working
>    normally)
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>,
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> ---
>  hw/virtio/virtio-pci.c     | 12 ++++++++----
>  hw/virtio/virtio.c         |  7 ++++++-
>  include/hw/virtio/virtio.h |  1 +
>  3 files changed, 15 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index c6b47a9c73..394d409fb9 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
>          pcie_cap_flr_write_config(pci_dev, address, val, len);
>      }
>  
> -    if (range_covers_byte(address, len, PCI_COMMAND) &&
> -        !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> -        virtio_pci_stop_ioeventfd(proxy);
> -        virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> +    if (range_covers_byte(address, len, PCI_COMMAND)) {
> +        if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> +            virtio_set_disabled(vdev, true);
> +            virtio_pci_stop_ioeventfd(proxy);
> +            virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> +        } else {
> +            virtio_set_disabled(vdev, false);
> +        }
>      }
>  
>      if (proxy->config_cap &&
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 527df03bfd..46580c357d 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -2575,6 +2575,11 @@ void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
>      vdev->bus_name = g_strdup(bus_name);
>  }
>  
> +void virtio_set_disabled(VirtIODevice *vdev, bool disable)
> +{
> +    vdev->broken = disable;
> +}
> +
>  void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
>  {
>      va_list ap;

Hmm. I think just clear and immediate set of bus master while device was
not doing any DMA actually should be fine and should not require a
reset.  Now it's true that right now guests reset first thing which will
clear the broken flag, but I'd say it's wrong to require this specific
order.
I think the easiest thing is to add a "disabled" flag.


> @@ -2588,7 +2593,7 @@ void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
>          virtio_notify_config(vdev);
>      }
>  
> -    vdev->broken = true;
> +    virtio_set_disabled(vdev, true);
>  }
>  
>  static void virtio_memory_listener_commit(MemoryListener *listener)
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index 48e8d04ff6..921945b7e8 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -168,6 +168,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
>                           uint16_t device_id, size_t config_size);
>  void virtio_cleanup(VirtIODevice *vdev);
>  
> +void virtio_set_disabled(VirtIODevice *vdev, bool disable);
>  void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
>  
>  /* Set the child bus name. */


One open question here is cross version migration.
What exactly happens if we migrate to an old qemu?
from an old qemu?

> -- 
> 2.17.1
Michael Roth Nov. 14, 2019, 1:07 a.m. UTC | #2
Quoting Michael S. Tsirkin (2019-11-13 04:09:02)
> On Tue, Nov 12, 2019 at 11:43:01PM -0600, Michael Roth wrote:
> > Currently the SLOF firmware for pseries guests will disable/re-enable
> > a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND
> > register after the initial probe/feature negotiation, as it tends to
> > work with a single device at a time at various stages like probing
> > and running block/network bootloaders without doing a full reset
> > in-between.
> > 
> > In QEMU, when PCI_COMMAND_MASTER is disabled we disable the
> > corresponding IOMMU memory region, so DMA accesses (including to vring
> > fields like idx/flags) will no longer undergo the necessary
> > translation. Normally we wouldn't expect this to happen since it would
> > be misbehavior on the driver side to continue driving DMA requests.
> > 
> > However, in the case of pseries, with iommu_platform=on, we trigger the
> > following sequence when tearing down the virtio-blk dataplane ioeventfd
> > in response to the guest unsetting PCI_COMMAND_MASTER:
> > 
> >   #2  0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0, p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, pa=0, sz=0)
> >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757
> >   #3  0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184)
> >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950
> >   #4  0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0)
> >       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255
> >   #5  0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660)
> >       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776
> >   #6  0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660)
> >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550
> >   #7  0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660)
> >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546
> >   #8  0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8)
> >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527
> >   #9  0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0, timeout=timeout@entry=0x7fffe65844a8)
> >       at /home/mdroth/w/qemu.git/util/aio-posix.c:520
> >   #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0)
> >       at /home/mdroth/w/qemu.git/util/aio-posix.c:607
> >   #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0, blocking=blocking@entry=true)
> >       at /home/mdroth/w/qemu.git/util/aio-posix.c:639
> >   #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0, cb=cb@entry=0x5555558d5130 <virtio_blk_data_plane_stop_bh>, opaque=opaque@entry=0x555556de86f0)
> >       at /home/mdroth/w/qemu.git/util/aio-wait.c:71
> >   #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=<optimized out>)
> >       at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288
> >   #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
> >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245
> >   #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
> >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237
> >   #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40)
> >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292
> >   #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40, address=<optimized out>, val=1048832, len=<optimized out>)
> >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613
> > 
> > I.e. the calling code is only scheduling a one-shot BH for
> > virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
> > an additional virtqueue entry before we get there. This is likely due
> > to the following check in virtio_queue_host_notifier_aio_poll:
> > 
> >   static bool virtio_queue_host_notifier_aio_poll(void *opaque)
> >   {
> >       EventNotifier *n = opaque;
> >       VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
> >       bool progress;
> > 
> >       if (!vq->vring.desc || virtio_queue_empty(vq)) {
> >           return false;
> >       }
> > 
> >       progress = virtio_queue_notify_aio_vq(vq);
> > 
> > namely the call to virtio_queue_empty(). In this case, since no new
> > requests have actually been issued, shadow_avail_idx == last_avail_idx,
> > so we actually try to access the vring via vring_avail_idx() to get
> > the latest non-shadowed idx:
> > 
> >   int virtio_queue_empty(VirtQueue *vq)
> >   {
> >       bool empty;
> >       ...
> > 
> >       if (vq->shadow_avail_idx != vq->last_avail_idx) {
> >           return 0;
> >       }
> > 
> >       rcu_read_lock();
> >       empty = vring_avail_idx(vq) == vq->last_avail_idx;
> >       rcu_read_unlock();
> >       return empty;
> > 
> > but since the IOMMU region has been disabled we get a bogus value (0
> > usually), which causes virtio_queue_empty() to falsely report that
> > there are entries to be processed, which causes errors such as:
> > 
> >   "virtio: zero sized buffers are not allowed"
> > 
> > or
> > 
> >   "virtio-blk missing headers"
> > 
> > and puts the device in an error state.
> > 
> > This patch works around the issue by introducing virtio_set_disabled(),
> > which piggy-backs off the vdev->broken flag we already use to bypass
> > checks like virtio_queue_empty(), and sets/unsets it in response to
> > enabling/disabling bus-mastering.
> > 
> > NOTES:
> > 
> >  - It's possible we could also work around this in SLOF by doing a
> >    full reset instead of relying on PCI_COMMAND to enable/disable, but
> >    in any case the QEMU behavior seems wrong.
> >  - This leaves some other oddities in play, like the fact that
> >    DRIVER_OK also gets unset in response to bus-mastering being
> >    disabled, but not restored (however the device seems to continue
> >    working)
> >  - Similarly, we disable the host notifier via
> >    virtio_bus_stop_ioeventfd(), which seems to move the handling out
> >    of virtio-blk dataplane and back into the main IO thread, and it
> >    ends up staying there till a reset (but otherwise continues working
> >    normally)
> > 
> > Cc: David Gibson <david@gibson.dropbear.id.au>,
> > Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> > Cc: "Michael S. Tsirkin" <mst@redhat.com>
> > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > ---
> >  hw/virtio/virtio-pci.c     | 12 ++++++++----
> >  hw/virtio/virtio.c         |  7 ++++++-
> >  include/hw/virtio/virtio.h |  1 +
> >  3 files changed, 15 insertions(+), 5 deletions(-)
> > 
> > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> > index c6b47a9c73..394d409fb9 100644
> > --- a/hw/virtio/virtio-pci.c
> > +++ b/hw/virtio/virtio-pci.c
> > @@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
> >          pcie_cap_flr_write_config(pci_dev, address, val, len);
> >      }
> >  
> > -    if (range_covers_byte(address, len, PCI_COMMAND) &&
> > -        !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> > -        virtio_pci_stop_ioeventfd(proxy);
> > -        virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> > +    if (range_covers_byte(address, len, PCI_COMMAND)) {
> > +        if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> > +            virtio_set_disabled(vdev, true);
> > +            virtio_pci_stop_ioeventfd(proxy);
> > +            virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> > +        } else {
> > +            virtio_set_disabled(vdev, false);
> > +        }
> >      }
> >  
> >      if (proxy->config_cap &&
> > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > index 527df03bfd..46580c357d 100644
> > --- a/hw/virtio/virtio.c
> > +++ b/hw/virtio/virtio.c
> > @@ -2575,6 +2575,11 @@ void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
> >      vdev->bus_name = g_strdup(bus_name);
> >  }
> >  
> > +void virtio_set_disabled(VirtIODevice *vdev, bool disable)
> > +{
> > +    vdev->broken = disable;
> > +}
> > +
> >  void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
> >  {
> >      va_list ap;
> 
> Hmm. I think just clear and immediate set of bus master while device was
> not doing any DMA actually should be fine and should not require a
> reset.  Now it's true that right now guests reset first thing which will
> clear the broken flag, but I'd say it's wrong to require this specific
> order.
> I think the easiest thing is to add a "disabled" flag.

Agreed. Some comments on that below.

> 
> 
> > @@ -2588,7 +2593,7 @@ void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
> >          virtio_notify_config(vdev);
> >      }
> >  
> > -    vdev->broken = true;
> > +    virtio_set_disabled(vdev, true);
> >  }
> >  
> >  static void virtio_memory_listener_commit(MemoryListener *listener)
> > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> > index 48e8d04ff6..921945b7e8 100644
> > --- a/include/hw/virtio/virtio.h
> > +++ b/include/hw/virtio/virtio.h
> > @@ -168,6 +168,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
> >                           uint16_t device_id, size_t config_size);
> >  void virtio_cleanup(VirtIODevice *vdev);
> >  
> > +void virtio_set_disabled(VirtIODevice *vdev, bool disable);
> >  void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
> >  
> >  /* Set the child bus name. */
> 
> 
> One open question here is cross version migration.
> What exactly happens if we migrate to an old qemu?
> from an old qemu?

Currently vdev->broken is already migrated via a subsection if it is set.

So, with the current proposed patch I would expect the following:

 old -> new:
   dev->broken set due to proposed virtio_set_disabled():
     - N/A
   dev->broken set due to other/existing reasons:
     - value is migrated, migration succeeds
     - device continues not working until a reset, or bus-mastering
       enabled/re-enabled. (which is good, but also makes me realize
       that a device that was put into broken state for reasons
       other than virtio_set_disabled() should not get 'unbroken'
       simply because bus-master bit was cycled. a separate flag
       is probably needed)
     - PASS
   dev->broken not set:
     - not migrated
     - PASS
 new -> old:
   dev->broken set due to proposed virtio_set_disabled():
     - migration succeeds to any QEMU that already has handling for
       dev->broken.
     - migration fails for any QEMU that doesn't have handling for
       dev->broken as it does now, but will a higher chance of
       triggering
     - device won't work until it is reset. for most guests we will
       probably get a reset before the device is used again anyway. for
       the SLOF case the device will stay broken after bus-mastering
       re-enabled, but that's the case for existing QEMU now anyway
     - PASS, but with increased chance of migration breakage for
       QEMU's that don't have handling for dev->broken.
   dev->broken set due to other/existing reasons:
     - migration succeeds to any QEMU that already has handling for
       dev->broken. device will require a reset as it does now.
     - migration fails for any QEMU that doesn't have handling for
       dev->broken as it does now
     - PASS
   dev->broken not set:
     - not migrated
     - PASS

With a new dev->disabled flag, which we'd likely send using a subsection
like with dev->broken, I would expect the following:

 old -> new:
   dev->disabled set:
     - N/A
   dev->disabled not set:
     - not migrated
     - if source disables BM just before migration we will likely go
       into an error state that either sets dev->broken or puts the
       device in some other possibly bad state. either case would require
       subsequent reset, just as they would without migration
     - PASS
 new -> old:
   dev->disabled set:
     - migration stream fails
     - FAIL, but as expected, and doesn't seem common currently outside
       of SLOF early-boot else we'd probably have more reports of
       breakage from vring access while BM isn't set.
   dev->broken not set:
     - not migrated
     - PASS

So re-using 'broken' is slightly better from a migration standpoint, but
as noted above it is probably wrong to unset 'broken' just because BM bit
gets cycled, so a new 'disabled' flag is probably needed.

We could probably get by with just adding a check for dev->disabled in 
virtio_queue_empty(), or even earlier in
virtio_queue_host_notifier_aio_poll(), but it seems more proper to also
add it in most of the same places we currently check for dev->broken.

That seems somewhat redundant though, so I think maybe the best approach
is to:

 - replace most dev->broken checks with checks for dev->disabled
 - set dev->disabled whenever dev->broken gets set
 - add a check in virtio_set_disabled() that only allows us to re-enable if
   dev->broken hasn't also been set.

I'll work on a follow-up using that approach if it seems reasonable to
you.

Thanks!

> 
> > -- 
> > 2.17.1
>
Michael S. Tsirkin Nov. 14, 2019, 9:10 a.m. UTC | #3
On Wed, Nov 13, 2019 at 07:07:36PM -0600, Michael Roth wrote:
> Quoting Michael S. Tsirkin (2019-11-13 04:09:02)
> > On Tue, Nov 12, 2019 at 11:43:01PM -0600, Michael Roth wrote:
> > > Currently the SLOF firmware for pseries guests will disable/re-enable
> > > a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND
> > > register after the initial probe/feature negotiation, as it tends to
> > > work with a single device at a time at various stages like probing
> > > and running block/network bootloaders without doing a full reset
> > > in-between.
> > > 
> > > In QEMU, when PCI_COMMAND_MASTER is disabled we disable the
> > > corresponding IOMMU memory region, so DMA accesses (including to vring
> > > fields like idx/flags) will no longer undergo the necessary
> > > translation. Normally we wouldn't expect this to happen since it would
> > > be misbehavior on the driver side to continue driving DMA requests.
> > > 
> > > However, in the case of pseries, with iommu_platform=on, we trigger the
> > > following sequence when tearing down the virtio-blk dataplane ioeventfd
> > > in response to the guest unsetting PCI_COMMAND_MASTER:
> > > 
> > >   #2  0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0, p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, pa=0, sz=0)
> > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757
> > >   #3  0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184)
> > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950
> > >   #4  0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0)
> > >       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255
> > >   #5  0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660)
> > >       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776
> > >   #6  0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660)
> > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550
> > >   #7  0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660)
> > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546
> > >   #8  0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8)
> > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527
> > >   #9  0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0, timeout=timeout@entry=0x7fffe65844a8)
> > >       at /home/mdroth/w/qemu.git/util/aio-posix.c:520
> > >   #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0)
> > >       at /home/mdroth/w/qemu.git/util/aio-posix.c:607
> > >   #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0, blocking=blocking@entry=true)
> > >       at /home/mdroth/w/qemu.git/util/aio-posix.c:639
> > >   #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0, cb=cb@entry=0x5555558d5130 <virtio_blk_data_plane_stop_bh>, opaque=opaque@entry=0x555556de86f0)
> > >       at /home/mdroth/w/qemu.git/util/aio-wait.c:71
> > >   #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=<optimized out>)
> > >       at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288
> > >   #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
> > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245
> > >   #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
> > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237
> > >   #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40)
> > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292
> > >   #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40, address=<optimized out>, val=1048832, len=<optimized out>)
> > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613
> > > 
> > > I.e. the calling code is only scheduling a one-shot BH for
> > > virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
> > > an additional virtqueue entry before we get there. This is likely due
> > > to the following check in virtio_queue_host_notifier_aio_poll:
> > > 
> > >   static bool virtio_queue_host_notifier_aio_poll(void *opaque)
> > >   {
> > >       EventNotifier *n = opaque;
> > >       VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
> > >       bool progress;
> > > 
> > >       if (!vq->vring.desc || virtio_queue_empty(vq)) {
> > >           return false;
> > >       }
> > > 
> > >       progress = virtio_queue_notify_aio_vq(vq);
> > > 
> > > namely the call to virtio_queue_empty(). In this case, since no new
> > > requests have actually been issued, shadow_avail_idx == last_avail_idx,
> > > so we actually try to access the vring via vring_avail_idx() to get
> > > the latest non-shadowed idx:
> > > 
> > >   int virtio_queue_empty(VirtQueue *vq)
> > >   {
> > >       bool empty;
> > >       ...
> > > 
> > >       if (vq->shadow_avail_idx != vq->last_avail_idx) {
> > >           return 0;
> > >       }
> > > 
> > >       rcu_read_lock();
> > >       empty = vring_avail_idx(vq) == vq->last_avail_idx;
> > >       rcu_read_unlock();
> > >       return empty;
> > > 
> > > but since the IOMMU region has been disabled we get a bogus value (0
> > > usually), which causes virtio_queue_empty() to falsely report that
> > > there are entries to be processed, which causes errors such as:
> > > 
> > >   "virtio: zero sized buffers are not allowed"
> > > 
> > > or
> > > 
> > >   "virtio-blk missing headers"
> > > 
> > > and puts the device in an error state.
> > > 
> > > This patch works around the issue by introducing virtio_set_disabled(),
> > > which piggy-backs off the vdev->broken flag we already use to bypass
> > > checks like virtio_queue_empty(), and sets/unsets it in response to
> > > enabling/disabling bus-mastering.
> > > 
> > > NOTES:
> > > 
> > >  - It's possible we could also work around this in SLOF by doing a
> > >    full reset instead of relying on PCI_COMMAND to enable/disable, but
> > >    in any case the QEMU behavior seems wrong.
> > >  - This leaves some other oddities in play, like the fact that
> > >    DRIVER_OK also gets unset in response to bus-mastering being
> > >    disabled, but not restored (however the device seems to continue
> > >    working)
> > >  - Similarly, we disable the host notifier via
> > >    virtio_bus_stop_ioeventfd(), which seems to move the handling out
> > >    of virtio-blk dataplane and back into the main IO thread, and it
> > >    ends up staying there till a reset (but otherwise continues working
> > >    normally)
> > > 
> > > Cc: David Gibson <david@gibson.dropbear.id.au>,
> > > Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > Cc: "Michael S. Tsirkin" <mst@redhat.com>
> > > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > > ---
> > >  hw/virtio/virtio-pci.c     | 12 ++++++++----
> > >  hw/virtio/virtio.c         |  7 ++++++-
> > >  include/hw/virtio/virtio.h |  1 +
> > >  3 files changed, 15 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> > > index c6b47a9c73..394d409fb9 100644
> > > --- a/hw/virtio/virtio-pci.c
> > > +++ b/hw/virtio/virtio-pci.c
> > > @@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
> > >          pcie_cap_flr_write_config(pci_dev, address, val, len);
> > >      }
> > >  
> > > -    if (range_covers_byte(address, len, PCI_COMMAND) &&
> > > -        !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> > > -        virtio_pci_stop_ioeventfd(proxy);
> > > -        virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> > > +    if (range_covers_byte(address, len, PCI_COMMAND)) {
> > > +        if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> > > +            virtio_set_disabled(vdev, true);
> > > +            virtio_pci_stop_ioeventfd(proxy);
> > > +            virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> > > +        } else {
> > > +            virtio_set_disabled(vdev, false);
> > > +        }
> > >      }
> > >  
> > >      if (proxy->config_cap &&
> > > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > > index 527df03bfd..46580c357d 100644
> > > --- a/hw/virtio/virtio.c
> > > +++ b/hw/virtio/virtio.c
> > > @@ -2575,6 +2575,11 @@ void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
> > >      vdev->bus_name = g_strdup(bus_name);
> > >  }
> > >  
> > > +void virtio_set_disabled(VirtIODevice *vdev, bool disable)
> > > +{
> > > +    vdev->broken = disable;
> > > +}
> > > +
> > >  void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
> > >  {
> > >      va_list ap;
> > 
> > Hmm. I think just clear and immediate set of bus master while device was
> > not doing any DMA actually should be fine and should not require a
> > reset.  Now it's true that right now guests reset first thing which will
> > clear the broken flag, but I'd say it's wrong to require this specific
> > order.
> > I think the easiest thing is to add a "disabled" flag.
> 
> Agreed. Some comments on that below.
> 
> > 
> > 
> > > @@ -2588,7 +2593,7 @@ void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
> > >          virtio_notify_config(vdev);
> > >      }
> > >  
> > > -    vdev->broken = true;
> > > +    virtio_set_disabled(vdev, true);
> > >  }
> > >  
> > >  static void virtio_memory_listener_commit(MemoryListener *listener)
> > > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> > > index 48e8d04ff6..921945b7e8 100644
> > > --- a/include/hw/virtio/virtio.h
> > > +++ b/include/hw/virtio/virtio.h
> > > @@ -168,6 +168,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
> > >                           uint16_t device_id, size_t config_size);
> > >  void virtio_cleanup(VirtIODevice *vdev);
> > >  
> > > +void virtio_set_disabled(VirtIODevice *vdev, bool disable);
> > >  void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
> > >  
> > >  /* Set the child bus name. */
> > 
> > 
> > One open question here is cross version migration.
> > What exactly happens if we migrate to an old qemu?
> > from an old qemu?
> 
> Currently vdev->broken is already migrated via a subsection if it is set.
> 
> So, with the current proposed patch I would expect the following:
> 
>  old -> new:
>    dev->broken set due to proposed virtio_set_disabled():
>      - N/A
>    dev->broken set due to other/existing reasons:
>      - value is migrated, migration succeeds
>      - device continues not working until a reset, or bus-mastering
>        enabled/re-enabled. (which is good, but also makes me realize
>        that a device that was put into broken state for reasons
>        other than virtio_set_disabled() should not get 'unbroken'
>        simply because bus-master bit was cycled. a separate flag
>        is probably needed)
>      - PASS
>    dev->broken not set:
>      - not migrated
>      - PASS
>  new -> old:
>    dev->broken set due to proposed virtio_set_disabled():
>      - migration succeeds to any QEMU that already has handling for
>        dev->broken.
>      - migration fails for any QEMU that doesn't have handling for
>        dev->broken as it does now, but will a higher chance of
>        triggering
>      - device won't work until it is reset. for most guests we will
>        probably get a reset before the device is used again anyway. for
>        the SLOF case the device will stay broken after bus-mastering
>        re-enabled, but that's the case for existing QEMU now anyway
>      - PASS, but with increased chance of migration breakage for
>        QEMU's that don't have handling for dev->broken.
>    dev->broken set due to other/existing reasons:
>      - migration succeeds to any QEMU that already has handling for
>        dev->broken. device will require a reset as it does now.
>      - migration fails for any QEMU that doesn't have handling for
>        dev->broken as it does now
>      - PASS
>    dev->broken not set:
>      - not migrated
>      - PASS
> 
> With a new dev->disabled flag, which we'd likely send using a subsection
> like with dev->broken, I would expect the following:
> 
>  old -> new:
>    dev->disabled set:
>      - N/A
>    dev->disabled not set:
>      - not migrated
>      - if source disables BM just before migration we will likely go
>        into an error state that either sets dev->broken or puts the
>        device in some other possibly bad state. either case would require
>        subsequent reset, just as they would without migration
>      - PASS
>  new -> old:
>    dev->disabled set:
>      - migration stream fails
>      - FAIL, but as expected, and doesn't seem common currently outside
>        of SLOF early-boot else we'd probably have more reports of
>        breakage from vring access while BM isn't set.
>    dev->broken not set:
>      - not migrated
>      - PASS
> 
> So re-using 'broken' is slightly better from a migration standpoint, but
> as noted above it is probably wrong to unset 'broken' just because BM bit
> gets cycled, so a new 'disabled' flag is probably needed.
> 
> We could probably get by with just adding a check for dev->disabled in 
> virtio_queue_empty(), or even earlier in
> virtio_queue_host_notifier_aio_poll(), but it seems more proper to also
> add it in most of the same places we currently check for dev->broken.
> 
> That seems somewhat redundant though, so I think maybe the best approach
> is to:
> 
>  - replace most dev->broken checks with checks for dev->disabled

I propose calling an inline functions so we are not tied to a specific
field.

>  - set dev->disabled whenever dev->broken gets set
>  - add a check in virtio_set_disabled() that only allows us to re-enable if
>    dev->broken hasn't also been set.
> 
> I'll work on a follow-up using that approach if it seems reasonable to
> you.
> 
> Thanks!
> 
> > 
> > > -- 
> > > 2.17.1
> >
Michael S. Tsirkin Nov. 14, 2019, 9:12 a.m. UTC | #4
On Thu, Nov 14, 2019 at 04:10:36AM -0500, Michael S. Tsirkin wrote:
> On Wed, Nov 13, 2019 at 07:07:36PM -0600, Michael Roth wrote:
> > Quoting Michael S. Tsirkin (2019-11-13 04:09:02)
> > > On Tue, Nov 12, 2019 at 11:43:01PM -0600, Michael Roth wrote:
> > > > Currently the SLOF firmware for pseries guests will disable/re-enable
> > > > a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND
> > > > register after the initial probe/feature negotiation, as it tends to
> > > > work with a single device at a time at various stages like probing
> > > > and running block/network bootloaders without doing a full reset
> > > > in-between.
> > > > 
> > > > In QEMU, when PCI_COMMAND_MASTER is disabled we disable the
> > > > corresponding IOMMU memory region, so DMA accesses (including to vring
> > > > fields like idx/flags) will no longer undergo the necessary
> > > > translation. Normally we wouldn't expect this to happen since it would
> > > > be misbehavior on the driver side to continue driving DMA requests.
> > > > 
> > > > However, in the case of pseries, with iommu_platform=on, we trigger the
> > > > following sequence when tearing down the virtio-blk dataplane ioeventfd
> > > > in response to the guest unsetting PCI_COMMAND_MASTER:
> > > > 
> > > >   #2  0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0, p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, pa=0, sz=0)
> > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757
> > > >   #3  0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184)
> > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950
> > > >   #4  0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0)
> > > >       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255
> > > >   #5  0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660)
> > > >       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776
> > > >   #6  0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660)
> > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550
> > > >   #7  0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660)
> > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546
> > > >   #8  0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8)
> > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527
> > > >   #9  0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0, timeout=timeout@entry=0x7fffe65844a8)
> > > >       at /home/mdroth/w/qemu.git/util/aio-posix.c:520
> > > >   #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0)
> > > >       at /home/mdroth/w/qemu.git/util/aio-posix.c:607
> > > >   #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0, blocking=blocking@entry=true)
> > > >       at /home/mdroth/w/qemu.git/util/aio-posix.c:639
> > > >   #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0, cb=cb@entry=0x5555558d5130 <virtio_blk_data_plane_stop_bh>, opaque=opaque@entry=0x555556de86f0)
> > > >       at /home/mdroth/w/qemu.git/util/aio-wait.c:71
> > > >   #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=<optimized out>)
> > > >       at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288
> > > >   #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
> > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245
> > > >   #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
> > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237
> > > >   #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40)
> > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292
> > > >   #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40, address=<optimized out>, val=1048832, len=<optimized out>)
> > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613
> > > > 
> > > > I.e. the calling code is only scheduling a one-shot BH for
> > > > virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
> > > > an additional virtqueue entry before we get there. This is likely due
> > > > to the following check in virtio_queue_host_notifier_aio_poll:
> > > > 
> > > >   static bool virtio_queue_host_notifier_aio_poll(void *opaque)
> > > >   {
> > > >       EventNotifier *n = opaque;
> > > >       VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
> > > >       bool progress;
> > > > 
> > > >       if (!vq->vring.desc || virtio_queue_empty(vq)) {
> > > >           return false;
> > > >       }
> > > > 
> > > >       progress = virtio_queue_notify_aio_vq(vq);
> > > > 
> > > > namely the call to virtio_queue_empty(). In this case, since no new
> > > > requests have actually been issued, shadow_avail_idx == last_avail_idx,
> > > > so we actually try to access the vring via vring_avail_idx() to get
> > > > the latest non-shadowed idx:
> > > > 
> > > >   int virtio_queue_empty(VirtQueue *vq)
> > > >   {
> > > >       bool empty;
> > > >       ...
> > > > 
> > > >       if (vq->shadow_avail_idx != vq->last_avail_idx) {
> > > >           return 0;
> > > >       }
> > > > 
> > > >       rcu_read_lock();
> > > >       empty = vring_avail_idx(vq) == vq->last_avail_idx;
> > > >       rcu_read_unlock();
> > > >       return empty;
> > > > 
> > > > but since the IOMMU region has been disabled we get a bogus value (0
> > > > usually), which causes virtio_queue_empty() to falsely report that
> > > > there are entries to be processed, which causes errors such as:
> > > > 
> > > >   "virtio: zero sized buffers are not allowed"
> > > > 
> > > > or
> > > > 
> > > >   "virtio-blk missing headers"
> > > > 
> > > > and puts the device in an error state.
> > > > 
> > > > This patch works around the issue by introducing virtio_set_disabled(),
> > > > which piggy-backs off the vdev->broken flag we already use to bypass
> > > > checks like virtio_queue_empty(), and sets/unsets it in response to
> > > > enabling/disabling bus-mastering.
> > > > 
> > > > NOTES:
> > > > 
> > > >  - It's possible we could also work around this in SLOF by doing a
> > > >    full reset instead of relying on PCI_COMMAND to enable/disable, but
> > > >    in any case the QEMU behavior seems wrong.
> > > >  - This leaves some other oddities in play, like the fact that
> > > >    DRIVER_OK also gets unset in response to bus-mastering being
> > > >    disabled, but not restored (however the device seems to continue
> > > >    working)
> > > >  - Similarly, we disable the host notifier via
> > > >    virtio_bus_stop_ioeventfd(), which seems to move the handling out
> > > >    of virtio-blk dataplane and back into the main IO thread, and it
> > > >    ends up staying there till a reset (but otherwise continues working
> > > >    normally)
> > > > 
> > > > Cc: David Gibson <david@gibson.dropbear.id.au>,
> > > > Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > > Cc: "Michael S. Tsirkin" <mst@redhat.com>
> > > > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > > > ---
> > > >  hw/virtio/virtio-pci.c     | 12 ++++++++----
> > > >  hw/virtio/virtio.c         |  7 ++++++-
> > > >  include/hw/virtio/virtio.h |  1 +
> > > >  3 files changed, 15 insertions(+), 5 deletions(-)
> > > > 
> > > > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> > > > index c6b47a9c73..394d409fb9 100644
> > > > --- a/hw/virtio/virtio-pci.c
> > > > +++ b/hw/virtio/virtio-pci.c
> > > > @@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
> > > >          pcie_cap_flr_write_config(pci_dev, address, val, len);
> > > >      }
> > > >  
> > > > -    if (range_covers_byte(address, len, PCI_COMMAND) &&
> > > > -        !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> > > > -        virtio_pci_stop_ioeventfd(proxy);
> > > > -        virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> > > > +    if (range_covers_byte(address, len, PCI_COMMAND)) {
> > > > +        if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> > > > +            virtio_set_disabled(vdev, true);
> > > > +            virtio_pci_stop_ioeventfd(proxy);
> > > > +            virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> > > > +        } else {
> > > > +            virtio_set_disabled(vdev, false);
> > > > +        }
> > > >      }
> > > >  
> > > >      if (proxy->config_cap &&
> > > > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > > > index 527df03bfd..46580c357d 100644
> > > > --- a/hw/virtio/virtio.c
> > > > +++ b/hw/virtio/virtio.c
> > > > @@ -2575,6 +2575,11 @@ void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
> > > >      vdev->bus_name = g_strdup(bus_name);
> > > >  }
> > > >  
> > > > +void virtio_set_disabled(VirtIODevice *vdev, bool disable)
> > > > +{
> > > > +    vdev->broken = disable;
> > > > +}
> > > > +
> > > >  void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
> > > >  {
> > > >      va_list ap;
> > > 
> > > Hmm. I think just clear and immediate set of bus master while device was
> > > not doing any DMA actually should be fine and should not require a
> > > reset.  Now it's true that right now guests reset first thing which will
> > > clear the broken flag, but I'd say it's wrong to require this specific
> > > order.
> > > I think the easiest thing is to add a "disabled" flag.
> > 
> > Agreed. Some comments on that below.
> > 
> > > 
> > > 
> > > > @@ -2588,7 +2593,7 @@ void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
> > > >          virtio_notify_config(vdev);
> > > >      }
> > > >  
> > > > -    vdev->broken = true;
> > > > +    virtio_set_disabled(vdev, true);
> > > >  }
> > > >  
> > > >  static void virtio_memory_listener_commit(MemoryListener *listener)
> > > > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> > > > index 48e8d04ff6..921945b7e8 100644
> > > > --- a/include/hw/virtio/virtio.h
> > > > +++ b/include/hw/virtio/virtio.h
> > > > @@ -168,6 +168,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
> > > >                           uint16_t device_id, size_t config_size);
> > > >  void virtio_cleanup(VirtIODevice *vdev);
> > > >  
> > > > +void virtio_set_disabled(VirtIODevice *vdev, bool disable);
> > > >  void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
> > > >  
> > > >  /* Set the child bus name. */
> > > 
> > > 
> > > One open question here is cross version migration.
> > > What exactly happens if we migrate to an old qemu?
> > > from an old qemu?
> > 
> > Currently vdev->broken is already migrated via a subsection if it is set.
> > 
> > So, with the current proposed patch I would expect the following:
> > 
> >  old -> new:
> >    dev->broken set due to proposed virtio_set_disabled():
> >      - N/A
> >    dev->broken set due to other/existing reasons:
> >      - value is migrated, migration succeeds
> >      - device continues not working until a reset, or bus-mastering
> >        enabled/re-enabled. (which is good, but also makes me realize
> >        that a device that was put into broken state for reasons
> >        other than virtio_set_disabled() should not get 'unbroken'
> >        simply because bus-master bit was cycled. a separate flag
> >        is probably needed)
> >      - PASS
> >    dev->broken not set:
> >      - not migrated
> >      - PASS
> >  new -> old:
> >    dev->broken set due to proposed virtio_set_disabled():
> >      - migration succeeds to any QEMU that already has handling for
> >        dev->broken.
> >      - migration fails for any QEMU that doesn't have handling for
> >        dev->broken as it does now, but will a higher chance of
> >        triggering
> >      - device won't work until it is reset. for most guests we will
> >        probably get a reset before the device is used again anyway. for
> >        the SLOF case the device will stay broken after bus-mastering
> >        re-enabled, but that's the case for existing QEMU now anyway
> >      - PASS, but with increased chance of migration breakage for
> >        QEMU's that don't have handling for dev->broken.
> >    dev->broken set due to other/existing reasons:
> >      - migration succeeds to any QEMU that already has handling for
> >        dev->broken. device will require a reset as it does now.
> >      - migration fails for any QEMU that doesn't have handling for
> >        dev->broken as it does now
> >      - PASS
> >    dev->broken not set:
> >      - not migrated
> >      - PASS
> > 
> > With a new dev->disabled flag, which we'd likely send using a subsection
> > like with dev->broken, I would expect the following:
> > 
> >  old -> new:
> >    dev->disabled set:
> >      - N/A
> >    dev->disabled not set:
> >      - not migrated
> >      - if source disables BM just before migration we will likely go
> >        into an error state that either sets dev->broken or puts the
> >        device in some other possibly bad state. either case would require
> >        subsequent reset, just as they would without migration
> >      - PASS
> >  new -> old:
> >    dev->disabled set:
> >      - migration stream fails
> >      - FAIL, but as expected, and doesn't seem common currently outside
> >        of SLOF early-boot else we'd probably have more reports of
> >        breakage from vring access while BM isn't set.
> >    dev->broken not set:
> >      - not migrated
> >      - PASS
> > 
> > So re-using 'broken' is slightly better from a migration standpoint, but
> > as noted above it is probably wrong to unset 'broken' just because BM bit
> > gets cycled, so a new 'disabled' flag is probably needed.
> > 
> > We could probably get by with just adding a check for dev->disabled in 
> > virtio_queue_empty(), or even earlier in
> > virtio_queue_host_notifier_aio_poll(), but it seems more proper to also
> > add it in most of the same places we currently check for dev->broken.
> > 
> > That seems somewhat redundant though, so I think maybe the best approach
> > is to:
> > 
> >  - replace most dev->broken checks with checks for dev->disabled
> 
> I propose calling an inline functions so we are not tied to a specific
> field.
> 
> >  - set dev->disabled whenever dev->broken gets set
> >  - add a check in virtio_set_disabled() that only allows us to re-enable if
> >    dev->broken hasn't also been set.
> > 
> > I'll work on a follow-up using that approach if it seems reasonable to
> > you.


Also, since the new field is not migrated anyway, how about a flag
that old machine types set, and behave the way we do now?
> > 
> > Thanks!
> > 
> > > 
> > > > -- 
> > > > 2.17.1
> > >
Michael Roth Nov. 14, 2019, 5:17 p.m. UTC | #5
Quoting Michael S. Tsirkin (2019-11-14 03:12:00)
> On Thu, Nov 14, 2019 at 04:10:36AM -0500, Michael S. Tsirkin wrote:
> > On Wed, Nov 13, 2019 at 07:07:36PM -0600, Michael Roth wrote:
> > > Quoting Michael S. Tsirkin (2019-11-13 04:09:02)
> > > > On Tue, Nov 12, 2019 at 11:43:01PM -0600, Michael Roth wrote:
> > > > > Currently the SLOF firmware for pseries guests will disable/re-enable
> > > > > a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND
> > > > > register after the initial probe/feature negotiation, as it tends to
> > > > > work with a single device at a time at various stages like probing
> > > > > and running block/network bootloaders without doing a full reset
> > > > > in-between.
> > > > > 
> > > > > In QEMU, when PCI_COMMAND_MASTER is disabled we disable the
> > > > > corresponding IOMMU memory region, so DMA accesses (including to vring
> > > > > fields like idx/flags) will no longer undergo the necessary
> > > > > translation. Normally we wouldn't expect this to happen since it would
> > > > > be misbehavior on the driver side to continue driving DMA requests.
> > > > > 
> > > > > However, in the case of pseries, with iommu_platform=on, we trigger the
> > > > > following sequence when tearing down the virtio-blk dataplane ioeventfd
> > > > > in response to the guest unsetting PCI_COMMAND_MASTER:
> > > > > 
> > > > >   #2  0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0, p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, pa=0, sz=0)
> > > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757
> > > > >   #3  0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184)
> > > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950
> > > > >   #4  0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0)
> > > > >       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255
> > > > >   #5  0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660)
> > > > >       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776
> > > > >   #6  0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660)
> > > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550
> > > > >   #7  0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660)
> > > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546
> > > > >   #8  0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8)
> > > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527
> > > > >   #9  0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0, timeout=timeout@entry=0x7fffe65844a8)
> > > > >       at /home/mdroth/w/qemu.git/util/aio-posix.c:520
> > > > >   #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0)
> > > > >       at /home/mdroth/w/qemu.git/util/aio-posix.c:607
> > > > >   #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0, blocking=blocking@entry=true)
> > > > >       at /home/mdroth/w/qemu.git/util/aio-posix.c:639
> > > > >   #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0, cb=cb@entry=0x5555558d5130 <virtio_blk_data_plane_stop_bh>, opaque=opaque@entry=0x555556de86f0)
> > > > >       at /home/mdroth/w/qemu.git/util/aio-wait.c:71
> > > > >   #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=<optimized out>)
> > > > >       at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288
> > > > >   #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
> > > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245
> > > > >   #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
> > > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237
> > > > >   #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40)
> > > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292
> > > > >   #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40, address=<optimized out>, val=1048832, len=<optimized out>)
> > > > >       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613
> > > > > 
> > > > > I.e. the calling code is only scheduling a one-shot BH for
> > > > > virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
> > > > > an additional virtqueue entry before we get there. This is likely due
> > > > > to the following check in virtio_queue_host_notifier_aio_poll:
> > > > > 
> > > > >   static bool virtio_queue_host_notifier_aio_poll(void *opaque)
> > > > >   {
> > > > >       EventNotifier *n = opaque;
> > > > >       VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
> > > > >       bool progress;
> > > > > 
> > > > >       if (!vq->vring.desc || virtio_queue_empty(vq)) {
> > > > >           return false;
> > > > >       }
> > > > > 
> > > > >       progress = virtio_queue_notify_aio_vq(vq);
> > > > > 
> > > > > namely the call to virtio_queue_empty(). In this case, since no new
> > > > > requests have actually been issued, shadow_avail_idx == last_avail_idx,
> > > > > so we actually try to access the vring via vring_avail_idx() to get
> > > > > the latest non-shadowed idx:
> > > > > 
> > > > >   int virtio_queue_empty(VirtQueue *vq)
> > > > >   {
> > > > >       bool empty;
> > > > >       ...
> > > > > 
> > > > >       if (vq->shadow_avail_idx != vq->last_avail_idx) {
> > > > >           return 0;
> > > > >       }
> > > > > 
> > > > >       rcu_read_lock();
> > > > >       empty = vring_avail_idx(vq) == vq->last_avail_idx;
> > > > >       rcu_read_unlock();
> > > > >       return empty;
> > > > > 
> > > > > but since the IOMMU region has been disabled we get a bogus value (0
> > > > > usually), which causes virtio_queue_empty() to falsely report that
> > > > > there are entries to be processed, which causes errors such as:
> > > > > 
> > > > >   "virtio: zero sized buffers are not allowed"
> > > > > 
> > > > > or
> > > > > 
> > > > >   "virtio-blk missing headers"
> > > > > 
> > > > > and puts the device in an error state.
> > > > > 
> > > > > This patch works around the issue by introducing virtio_set_disabled(),
> > > > > which piggy-backs off the vdev->broken flag we already use to bypass
> > > > > checks like virtio_queue_empty(), and sets/unsets it in response to
> > > > > enabling/disabling bus-mastering.
> > > > > 
> > > > > NOTES:
> > > > > 
> > > > >  - It's possible we could also work around this in SLOF by doing a
> > > > >    full reset instead of relying on PCI_COMMAND to enable/disable, but
> > > > >    in any case the QEMU behavior seems wrong.
> > > > >  - This leaves some other oddities in play, like the fact that
> > > > >    DRIVER_OK also gets unset in response to bus-mastering being
> > > > >    disabled, but not restored (however the device seems to continue
> > > > >    working)
> > > > >  - Similarly, we disable the host notifier via
> > > > >    virtio_bus_stop_ioeventfd(), which seems to move the handling out
> > > > >    of virtio-blk dataplane and back into the main IO thread, and it
> > > > >    ends up staying there till a reset (but otherwise continues working
> > > > >    normally)
> > > > > 
> > > > > Cc: David Gibson <david@gibson.dropbear.id.au>,
> > > > > Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > > > Cc: "Michael S. Tsirkin" <mst@redhat.com>
> > > > > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > > > > ---
> > > > >  hw/virtio/virtio-pci.c     | 12 ++++++++----
> > > > >  hw/virtio/virtio.c         |  7 ++++++-
> > > > >  include/hw/virtio/virtio.h |  1 +
> > > > >  3 files changed, 15 insertions(+), 5 deletions(-)
> > > > > 
> > > > > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> > > > > index c6b47a9c73..394d409fb9 100644
> > > > > --- a/hw/virtio/virtio-pci.c
> > > > > +++ b/hw/virtio/virtio-pci.c
> > > > > @@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
> > > > >          pcie_cap_flr_write_config(pci_dev, address, val, len);
> > > > >      }
> > > > >  
> > > > > -    if (range_covers_byte(address, len, PCI_COMMAND) &&
> > > > > -        !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> > > > > -        virtio_pci_stop_ioeventfd(proxy);
> > > > > -        virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > +    if (range_covers_byte(address, len, PCI_COMMAND)) {
> > > > > +        if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> > > > > +            virtio_set_disabled(vdev, true);
> > > > > +            virtio_pci_stop_ioeventfd(proxy);
> > > > > +            virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > +        } else {
> > > > > +            virtio_set_disabled(vdev, false);
> > > > > +        }
> > > > >      }
> > > > >  
> > > > >      if (proxy->config_cap &&
> > > > > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > > > > index 527df03bfd..46580c357d 100644
> > > > > --- a/hw/virtio/virtio.c
> > > > > +++ b/hw/virtio/virtio.c
> > > > > @@ -2575,6 +2575,11 @@ void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
> > > > >      vdev->bus_name = g_strdup(bus_name);
> > > > >  }
> > > > >  
> > > > > +void virtio_set_disabled(VirtIODevice *vdev, bool disable)
> > > > > +{
> > > > > +    vdev->broken = disable;
> > > > > +}
> > > > > +
> > > > >  void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
> > > > >  {
> > > > >      va_list ap;
> > > > 
> > > > Hmm. I think just clear and immediate set of bus master while device was
> > > > not doing any DMA actually should be fine and should not require a
> > > > reset.  Now it's true that right now guests reset first thing which will
> > > > clear the broken flag, but I'd say it's wrong to require this specific
> > > > order.
> > > > I think the easiest thing is to add a "disabled" flag.
> > > 
> > > Agreed. Some comments on that below.
> > > 
> > > > 
> > > > 
> > > > > @@ -2588,7 +2593,7 @@ void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
> > > > >          virtio_notify_config(vdev);
> > > > >      }
> > > > >  
> > > > > -    vdev->broken = true;
> > > > > +    virtio_set_disabled(vdev, true);
> > > > >  }
> > > > >  
> > > > >  static void virtio_memory_listener_commit(MemoryListener *listener)
> > > > > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> > > > > index 48e8d04ff6..921945b7e8 100644
> > > > > --- a/include/hw/virtio/virtio.h
> > > > > +++ b/include/hw/virtio/virtio.h
> > > > > @@ -168,6 +168,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
> > > > >                           uint16_t device_id, size_t config_size);
> > > > >  void virtio_cleanup(VirtIODevice *vdev);
> > > > >  
> > > > > +void virtio_set_disabled(VirtIODevice *vdev, bool disable);
> > > > >  void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
> > > > >  
> > > > >  /* Set the child bus name. */
> > > > 
> > > > 
> > > > One open question here is cross version migration.
> > > > What exactly happens if we migrate to an old qemu?
> > > > from an old qemu?
> > > 
> > > Currently vdev->broken is already migrated via a subsection if it is set.
> > > 
> > > So, with the current proposed patch I would expect the following:
> > > 
> > >  old -> new:
> > >    dev->broken set due to proposed virtio_set_disabled():
> > >      - N/A
> > >    dev->broken set due to other/existing reasons:
> > >      - value is migrated, migration succeeds
> > >      - device continues not working until a reset, or bus-mastering
> > >        enabled/re-enabled. (which is good, but also makes me realize
> > >        that a device that was put into broken state for reasons
> > >        other than virtio_set_disabled() should not get 'unbroken'
> > >        simply because bus-master bit was cycled. a separate flag
> > >        is probably needed)
> > >      - PASS
> > >    dev->broken not set:
> > >      - not migrated
> > >      - PASS
> > >  new -> old:
> > >    dev->broken set due to proposed virtio_set_disabled():
> > >      - migration succeeds to any QEMU that already has handling for
> > >        dev->broken.
> > >      - migration fails for any QEMU that doesn't have handling for
> > >        dev->broken as it does now, but will a higher chance of
> > >        triggering
> > >      - device won't work until it is reset. for most guests we will
> > >        probably get a reset before the device is used again anyway. for
> > >        the SLOF case the device will stay broken after bus-mastering
> > >        re-enabled, but that's the case for existing QEMU now anyway
> > >      - PASS, but with increased chance of migration breakage for
> > >        QEMU's that don't have handling for dev->broken.
> > >    dev->broken set due to other/existing reasons:
> > >      - migration succeeds to any QEMU that already has handling for
> > >        dev->broken. device will require a reset as it does now.
> > >      - migration fails for any QEMU that doesn't have handling for
> > >        dev->broken as it does now
> > >      - PASS
> > >    dev->broken not set:
> > >      - not migrated
> > >      - PASS
> > > 
> > > With a new dev->disabled flag, which we'd likely send using a subsection
> > > like with dev->broken, I would expect the following:
> > > 
> > >  old -> new:
> > >    dev->disabled set:
> > >      - N/A
> > >    dev->disabled not set:
> > >      - not migrated
> > >      - if source disables BM just before migration we will likely go
> > >        into an error state that either sets dev->broken or puts the
> > >        device in some other possibly bad state. either case would require
> > >        subsequent reset, just as they would without migration
> > >      - PASS
> > >  new -> old:
> > >    dev->disabled set:
> > >      - migration stream fails
> > >      - FAIL, but as expected, and doesn't seem common currently outside
> > >        of SLOF early-boot else we'd probably have more reports of
> > >        breakage from vring access while BM isn't set.
> > >    dev->broken not set:
> > >      - not migrated
> > >      - PASS
> > > 
> > > So re-using 'broken' is slightly better from a migration standpoint, but
> > > as noted above it is probably wrong to unset 'broken' just because BM bit
> > > gets cycled, so a new 'disabled' flag is probably needed.
> > > 
> > > We could probably get by with just adding a check for dev->disabled in 
> > > virtio_queue_empty(), or even earlier in
> > > virtio_queue_host_notifier_aio_poll(), but it seems more proper to also
> > > add it in most of the same places we currently check for dev->broken.
> > > 
> > > That seems somewhat redundant though, so I think maybe the best approach
> > > is to:
> > > 
> > >  - replace most dev->broken checks with checks for dev->disabled
> > 
> > I propose calling an inline functions so we are not tied to a specific
> > field.
> > 

Makes sense.

> > >  - set dev->disabled whenever dev->broken gets set
> > >  - add a check in virtio_set_disabled() that only allows us to re-enable if
> > >    dev->broken hasn't also been set.
> > > 
> > > I'll work on a follow-up using that approach if it seems reasonable to
> > > you.
> 
> 
> Also, since the new field is not migrated anyway, how about a flag
> that old machine types set, and behave the way we do now?

Hmm, you're right that in the above cases there's only one case where we
would migrate (new->old, when dev->disabled is set), and this would
break the migration stream anyway, so there's no reason to migrate. But I
forgot to include the more common case of new->new:

  new -> new:
    dev->disabled set:
      - migrated
      - PASS
    dev->disabled not set:
      - not migrated
      - PASS
    
So I was planning on migrating it via subsection. But it does make sense
to only use it for newer machine types so we don't trigger the failure
for (new->old && dev->disabled == true). Maybe that's what you were
suggesting in the first place?

I'll give that a shot.

> > > 
> > > Thanks!
> > > 
> > > > 
> > > > > -- 
> > > > > 2.17.1
> > > > 
>

Patch
diff mbox series

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index c6b47a9c73..394d409fb9 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -608,10 +608,14 @@  static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
         pcie_cap_flr_write_config(pci_dev, address, val, len);
     }
 
-    if (range_covers_byte(address, len, PCI_COMMAND) &&
-        !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
-        virtio_pci_stop_ioeventfd(proxy);
-        virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
+    if (range_covers_byte(address, len, PCI_COMMAND)) {
+        if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
+            virtio_set_disabled(vdev, true);
+            virtio_pci_stop_ioeventfd(proxy);
+            virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
+        } else {
+            virtio_set_disabled(vdev, false);
+        }
     }
 
     if (proxy->config_cap &&
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 527df03bfd..46580c357d 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2575,6 +2575,11 @@  void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
     vdev->bus_name = g_strdup(bus_name);
 }
 
+void virtio_set_disabled(VirtIODevice *vdev, bool disable)
+{
+    vdev->broken = disable;
+}
+
 void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
 {
     va_list ap;
@@ -2588,7 +2593,7 @@  void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
         virtio_notify_config(vdev);
     }
 
-    vdev->broken = true;
+    virtio_set_disabled(vdev, true);
 }
 
 static void virtio_memory_listener_commit(MemoryListener *listener)
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 48e8d04ff6..921945b7e8 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -168,6 +168,7 @@  void virtio_init(VirtIODevice *vdev, const char *name,
                          uint16_t device_id, size_t config_size);
 void virtio_cleanup(VirtIODevice *vdev);
 
+void virtio_set_disabled(VirtIODevice *vdev, bool disable);
 void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
 
 /* Set the child bus name. */