diff mbox series

[2/4] device-core: use RCU for list of childs of a bus

Message ID 20200416203624.32366-3-mlevitsk@redhat.com
State New
Headers show
Series RFC/WIP: Fix scsi devices plug/unplug races w.r.t virtio-scsi iothread | expand

Commit Message

Maxim Levitsky April 16, 2020, 8:36 p.m. UTC
This fixes the race between device emulation code that tries to find
a child device to dispatch the request to (e.g a scsi disk),
and hotplug of a new device to that bus.

Note that this doesn't convert all the readers of the list
but only these that might go over that list without BQL held.

This is a very small first step to make this code thread safe.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 hw/core/bus.c                  | 43 ++++++++++++++++++++-----------
 hw/core/qdev.c                 | 46 +++++++++++++++++++++++-----------
 hw/scsi/scsi-bus.c             | 17 ++++++++++---
 hw/scsi/virtio-scsi.c          |  6 ++++-
 include/hw/qdev-core.h         |  3 +++
 include/hw/virtio/virtio-bus.h |  7 ++++--
 6 files changed, 87 insertions(+), 35 deletions(-)

Comments

Stefan Hajnoczi May 4, 2020, 10:41 a.m. UTC | #1
On Thu, Apr 16, 2020 at 11:36:22PM +0300, Maxim Levitsky wrote:
> @@ -90,9 +92,13 @@ static void bus_reset_child_foreach(Object *obj, ResettableChildCallback cb,
>      BusState *bus = BUS(obj);
>      BusChild *kid;
>  
> -    QTAILQ_FOREACH(kid, &bus->children, sibling) {
> +    rcu_read_lock();
> +
> +    QTAILQ_FOREACH_RCU(kid, &bus->children, sibling) {
>          cb(OBJECT(kid->child), opaque, type);
>      }
> +
> +    rcu_read_unlock();
>  }
>  
>  static void qbus_realize(BusState *bus, DeviceState *parent, const char *name)
> @@ -138,10 +144,15 @@ static void bus_unparent(Object *obj)
>      /* Only the main system bus has no parent, and that bus is never freed */
>      assert(bus->parent);
>  
> -    while ((kid = QTAILQ_FIRST(&bus->children)) != NULL) {
> +    rcu_read_lock();
> +
> +    while ((kid = QTAILQ_FIRST_RCU(&bus->children)) != NULL) {
>          DeviceState *dev = kid->child;
>          object_unparent(OBJECT(dev));
>      }
> +
> +    rcu_read_unlock();

rcu_read_lock() is called but this looks like a list write operation.
If I understand correctly bus->children list writes can only be called
with the QEMU global mutex and therefore rcu_read_lock() is not required
here?

> diff --git a/hw/core/qdev.c b/hw/core/qdev.c
> index 85f062def7..f0c87e582e 100644
> --- a/hw/core/qdev.c
> +++ b/hw/core/qdev.c
> @@ -50,26 +50,37 @@ const VMStateDescription *qdev_get_vmsd(DeviceState *dev)
>      return dc->vmsd;
>  }
>  
> +static void bus_free_bus_child(BusChild *kid)
> +{
> +    object_unref(OBJECT(kid->child));

Users like scsi_device_find() do not take a refcount on the child.  If
the device is removed then bus_free_bus_child may call object_unref()
while another thread is still accessing the child.

Maybe I'm missing something that prevents this scenario?

If not, then another patch is necessary first that introduces stricter
refcount discipline across the codebase. This applies both to users who
directly access bus->children as well as to those who call walk() and
stash child pointers in their callback function.

> +    g_free(kid);
> +}
> +
>  static void bus_remove_child(BusState *bus, DeviceState *child)
>  {
>      BusChild *kid;
>  
> -    QTAILQ_FOREACH(kid, &bus->children, sibling) {
> +    rcu_read_lock();

List write under rcu_read_lock().

> @@ -82,7 +93,9 @@ static void bus_add_child(BusState *bus, DeviceState *child)
>      kid->child = child;
>      object_ref(OBJECT(kid->child));
>  
> -    QTAILQ_INSERT_HEAD(&bus->children, kid, sibling);
> +    rcu_read_lock();
> +    QTAILQ_INSERT_HEAD_RCU(&bus->children, kid, sibling);
> +    rcu_read_unlock();

List write under rcu_read_lock().

> diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
> index 472bbd233b..b0f4a35f81 100644
> --- a/hw/scsi/virtio-scsi.c
> +++ b/hw/scsi/virtio-scsi.c
> @@ -367,12 +367,16 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
>      case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
>          target = req->req.tmf.lun[1];
>          s->resetting++;
> -        QTAILQ_FOREACH(kid, &s->bus.qbus.children, sibling) {
> +
> +        rcu_read_lock();
> +        QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) {

We need a QTAILQ_FOREACH_WITH_RCU_READ_LOCK() macro that combines
WITH_RCU_READ_LOCK() and QTAILQ_FOREACH_RCU(). :-)

> diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h
> index 38c9399cd4..58733f28e2 100644
> --- a/include/hw/virtio/virtio-bus.h
> +++ b/include/hw/virtio/virtio-bus.h
> @@ -128,8 +128,11 @@ void virtio_bus_set_vdev_config(VirtioBusState *bus, uint8_t *config);
>  static inline VirtIODevice *virtio_bus_get_device(VirtioBusState *bus)
>  {
>      BusState *qbus = &bus->parent_obj;
> -    BusChild *kid = QTAILQ_FIRST(&qbus->children);
> -    DeviceState *qdev = kid ? kid->child : NULL;
> +    BusChild *kid;
> +    DeviceState *qdev;
> +
> +    kid = QTAILQ_FIRST(&qbus->children);

QTAILQ_FIRST_RCU()
Maxim Levitsky May 11, 2020, 8:48 a.m. UTC | #2
On Mon, 2020-05-04 at 11:41 +0100, Stefan Hajnoczi wrote:
> On Thu, Apr 16, 2020 at 11:36:22PM +0300, Maxim Levitsky wrote:
> > @@ -90,9 +92,13 @@ static void bus_reset_child_foreach(Object *obj, ResettableChildCallback cb,
> >      BusState *bus = BUS(obj);
> >      BusChild *kid;
> >  
> > -    QTAILQ_FOREACH(kid, &bus->children, sibling) {
> > +    rcu_read_lock();
> > +
> > +    QTAILQ_FOREACH_RCU(kid, &bus->children, sibling) {
> >          cb(OBJECT(kid->child), opaque, type);
> >      }
> > +
> > +    rcu_read_unlock();
> >  }
> >  
> >  static void qbus_realize(BusState *bus, DeviceState *parent, const char *name)
> > @@ -138,10 +144,15 @@ static void bus_unparent(Object *obj)
> >      /* Only the main system bus has no parent, and that bus is never freed */
> >      assert(bus->parent);
> >  
> > -    while ((kid = QTAILQ_FIRST(&bus->children)) != NULL) {
> > +    rcu_read_lock();
> > +
> > +    while ((kid = QTAILQ_FIRST_RCU(&bus->children)) != NULL) {
> >          DeviceState *dev = kid->child;
> >          object_unparent(OBJECT(dev));
> >      }
> > +
> > +    rcu_read_unlock();
> 
> rcu_read_lock() is called but this looks like a list write operation.
> If I understand correctly bus->children list writes can only be called
> with the QEMU global mutex and therefore rcu_read_lock() is not required
> here?

This is indeed write operation. Paulo helped me to finally understand
what RCU guarantees are, so now I understand.

About write locking here (which I also understand now that I need for RCU),
this is very good question if that can race as well:

It looks like qdev_unplug is what does the device removal, and it first
calls the hotplug handler which is supposed to unrealize the device,
in addition to whatever hot unplug actions are needed (like informing the guest),
and then it does object_unparent which removes the device from the bus.
Therefore as long as the .realized store is atomic and with proper barriers,
the scsi IO thread might be able to see the device but it will be unplugged by then.


There are plenty of object_unparent calls through the code base and I can only hope
that these are done with big qemu lock held.


> 
> > diff --git a/hw/core/qdev.c b/hw/core/qdev.c
> > index 85f062def7..f0c87e582e 100644
> > --- a/hw/core/qdev.c
> > +++ b/hw/core/qdev.c
> > @@ -50,26 +50,37 @@ const VMStateDescription *qdev_get_vmsd(DeviceState *dev)
> >      return dc->vmsd;
> >  }
> >  
> > +static void bus_free_bus_child(BusChild *kid)
> > +{
> > +    object_unref(OBJECT(kid->child));
> 
> Users like scsi_device_find() do not take a refcount on the child.  If
> the device is removed then bus_free_bus_child may call object_unref()
> while another thread is still accessing the child.

I agree, however this is existing bug - bus_remove_child was dropping this
reference immediatly and I delayed it to RCU callback it now sets.
So I didn't made the situation worse.


> 
> Maybe I'm missing something that prevents this scenario?
> 
> If not, then another patch is necessary first that introduces stricter
> refcount discipline across the codebase. This applies both to users who
> directly access bus->children as well as to those who call walk() and
> stash child pointers in their callback function.

In scsi_device_find, as long as RCU read lock is held, no RCU reclaim should happen,
thus this code shouldn't have the child disapper under it.
However once scsi_device_find returns, indeed this can happen,

so scsi_device_find should indeed take a reference to the scsi device and the caller should
drop it when not needed. That should be done in a separate patch, and it
might open yet another can of worms.
While at it, it should be renamed scsi_device_get()
Maybe keep both scsi_device_find and scsi_device_get(), and let the legacy drivers
continue using the former one, while make virtio-scsi use the later? 


> 
> > +    g_free(kid);
> > +}
> > +
> >  static void bus_remove_child(BusState *bus, DeviceState *child)
> >  {
> >      BusChild *kid;
> >  
> > -    QTAILQ_FOREACH(kid, &bus->children, sibling) {
> > +    rcu_read_lock();
> 
> List write under rcu_read_lock().
I removed the RCU read lock here, under assumption that
bus_remove_child will be called with BQL held.
I kept the _RCU version of the list removal, under assumption that
writer still needs it to avoid race vs readers.


> 
> > @@ -82,7 +93,9 @@ static void bus_add_child(BusState *bus, DeviceState *child)
> >      kid->child = child;
> >      object_ref(OBJECT(kid->child));
> >  
> > -    QTAILQ_INSERT_HEAD(&bus->children, kid, sibling);
> > +    rcu_read_lock();
> > +    QTAILQ_INSERT_HEAD_RCU(&bus->children, kid, sibling);
> > +    rcu_read_unlock();
> 
> List write under rcu_read_lock().
Same as above.

> 
> > diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
> > index 472bbd233b..b0f4a35f81 100644
> > --- a/hw/scsi/virtio-scsi.c
> > +++ b/hw/scsi/virtio-scsi.c
> > @@ -367,12 +367,16 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
> >      case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
> >          target = req->req.tmf.lun[1];
> >          s->resetting++;
> > -        QTAILQ_FOREACH(kid, &s->bus.qbus.children, sibling) {
> > +
> > +        rcu_read_lock();
> > +        QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) {
> 
> We need a QTAILQ_FOREACH_WITH_RCU_READ_LOCK() macro that combines
> WITH_RCU_READ_LOCK() and QTAILQ_FOREACH_RCU(). :-)
Assuming that you are not joking here, I'll add this in the new version of the patches.

> 
> > diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h
> > index 38c9399cd4..58733f28e2 100644
> > --- a/include/hw/virtio/virtio-bus.h
> > +++ b/include/hw/virtio/virtio-bus.h
> > @@ -128,8 +128,11 @@ void virtio_bus_set_vdev_config(VirtioBusState *bus, uint8_t *config);
> >  static inline VirtIODevice *virtio_bus_get_device(VirtioBusState *bus)
> >  {
> >      BusState *qbus = &bus->parent_obj;
> > -    BusChild *kid = QTAILQ_FIRST(&qbus->children);
> > -    DeviceState *qdev = kid ? kid->child : NULL;
> > +    BusChild *kid;
> > +    DeviceState *qdev;
> > +
> > +    kid = QTAILQ_FIRST(&qbus->children);
> 
> QTAILQ_FIRST_RCU()

This is on purpose - I didn't convert to RCU most of the drivers
which don't have this race versus iothread after a discussion with Paulo,
and this is one of them. Virtio bus has only one device
and it is added right away on initialization and vise-versa of the parent
(aka virtio-pci/virtio-mmio) bus device.

I will revert the other cosmetic changes in this function which I did when
I wasn't sure if to use _RCU version here.

Best regards,
	Maxim Levitsky
diff mbox series

Patch

diff --git a/hw/core/bus.c b/hw/core/bus.c
index 3dc0a825f0..cb7756ded1 100644
--- a/hw/core/bus.c
+++ b/hw/core/bus.c
@@ -49,12 +49,14 @@  int qbus_walk_children(BusState *bus,
         }
     }
 
-    QTAILQ_FOREACH(kid, &bus->children, sibling) {
-        err = qdev_walk_children(kid->child,
-                                 pre_devfn, pre_busfn,
-                                 post_devfn, post_busfn, opaque);
-        if (err < 0) {
-            return err;
+    WITH_RCU_READ_LOCK_GUARD(){
+        QTAILQ_FOREACH_RCU(kid, &bus->children, sibling) {
+            err = qdev_walk_children(kid->child,
+                                     pre_devfn, pre_busfn,
+                                     post_devfn, post_busfn, opaque);
+            if (err < 0) {
+                return err;
+            }
         }
     }
 
@@ -90,9 +92,13 @@  static void bus_reset_child_foreach(Object *obj, ResettableChildCallback cb,
     BusState *bus = BUS(obj);
     BusChild *kid;
 
-    QTAILQ_FOREACH(kid, &bus->children, sibling) {
+    rcu_read_lock();
+
+    QTAILQ_FOREACH_RCU(kid, &bus->children, sibling) {
         cb(OBJECT(kid->child), opaque, type);
     }
+
+    rcu_read_unlock();
 }
 
 static void qbus_realize(BusState *bus, DeviceState *parent, const char *name)
@@ -138,10 +144,15 @@  static void bus_unparent(Object *obj)
     /* Only the main system bus has no parent, and that bus is never freed */
     assert(bus->parent);
 
-    while ((kid = QTAILQ_FIRST(&bus->children)) != NULL) {
+    rcu_read_lock();
+
+    while ((kid = QTAILQ_FIRST_RCU(&bus->children)) != NULL) {
         DeviceState *dev = kid->child;
         object_unparent(OBJECT(dev));
     }
+
+    rcu_read_unlock();
+
     QLIST_REMOVE(bus, sibling);
     bus->parent->num_child_bus--;
     bus->parent = NULL;
@@ -185,14 +196,18 @@  static void bus_set_realized(Object *obj, bool value, Error **errp)
 
         /* TODO: recursive realization */
     } else if (!value && bus->realized) {
-        QTAILQ_FOREACH(kid, &bus->children, sibling) {
-            DeviceState *dev = kid->child;
-            object_property_set_bool(OBJECT(dev), false, "realized",
-                                     &local_err);
-            if (local_err != NULL) {
-                break;
+
+        WITH_RCU_READ_LOCK_GUARD(){
+            QTAILQ_FOREACH_RCU(kid, &bus->children, sibling) {
+                DeviceState *dev = kid->child;
+                object_property_set_bool(OBJECT(dev), false, "realized",
+                                         &local_err);
+                if (local_err != NULL) {
+                    break;
+                }
             }
         }
+
         if (bc->unrealize && local_err == NULL) {
             bc->unrealize(bus, &local_err);
         }
diff --git a/hw/core/qdev.c b/hw/core/qdev.c
index 85f062def7..f0c87e582e 100644
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -50,26 +50,37 @@  const VMStateDescription *qdev_get_vmsd(DeviceState *dev)
     return dc->vmsd;
 }
 
+static void bus_free_bus_child(BusChild *kid)
+{
+    object_unref(OBJECT(kid->child));
+    g_free(kid);
+}
+
 static void bus_remove_child(BusState *bus, DeviceState *child)
 {
     BusChild *kid;
 
-    QTAILQ_FOREACH(kid, &bus->children, sibling) {
+    rcu_read_lock();
+
+    QTAILQ_FOREACH_RCU(kid, &bus->children, sibling) {
         if (kid->child == child) {
             char name[32];
 
             snprintf(name, sizeof(name), "child[%d]", kid->index);
-            QTAILQ_REMOVE(&bus->children, kid, sibling);
+            QTAILQ_REMOVE_RCU(&bus->children, kid, sibling);
 
             bus->num_children--;
 
             /* This gives back ownership of kid->child back to us.  */
             object_property_del(OBJECT(bus), name, NULL);
-            object_unref(OBJECT(kid->child));
-            g_free(kid);
-            return;
+
+            /* free the bus kid, when it is safe to do so*/
+            call_rcu(kid, bus_free_bus_child, rcu);
+            break;
         }
     }
+
+    rcu_read_unlock();
 }
 
 static void bus_add_child(BusState *bus, DeviceState *child)
@@ -82,7 +93,9 @@  static void bus_add_child(BusState *bus, DeviceState *child)
     kid->child = child;
     object_ref(OBJECT(kid->child));
 
-    QTAILQ_INSERT_HEAD(&bus->children, kid, sibling);
+    rcu_read_lock();
+    QTAILQ_INSERT_HEAD_RCU(&bus->children, kid, sibling);
+    rcu_read_unlock();
 
     /* This transfers ownership of kid->child to the property.  */
     snprintf(name, sizeof(name), "child[%d]", kid->index);
@@ -681,20 +694,23 @@  DeviceState *qdev_find_recursive(BusState *bus, const char *id)
     DeviceState *ret;
     BusState *child;
 
-    QTAILQ_FOREACH(kid, &bus->children, sibling) {
-        DeviceState *dev = kid->child;
+    WITH_RCU_READ_LOCK_GUARD() {
+        QTAILQ_FOREACH_RCU(kid, &bus->children, sibling) {
+            DeviceState *dev = kid->child;
 
-        if (dev->id && strcmp(dev->id, id) == 0) {
-            return dev;
-        }
+            if (dev->id && strcmp(dev->id, id) == 0) {
+                return dev;
+            }
 
-        QLIST_FOREACH(child, &dev->child_bus, sibling) {
-            ret = qdev_find_recursive(child, id);
-            if (ret) {
-                return ret;
+            QLIST_FOREACH(child, &dev->child_bus, sibling) {
+                ret = qdev_find_recursive(child, id);
+                if (ret) {
+                    return ret;
+                }
             }
         }
     }
+
     return NULL;
 }
 
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 7bbc37acec..2bf6d005f3 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -412,7 +412,10 @@  static bool scsi_target_emulate_report_luns(SCSITargetReq *r)
     id = r->req.dev->id;
     found_lun0 = false;
     n = 0;
-    QTAILQ_FOREACH(kid, &r->req.bus->qbus.children, sibling) {
+
+    rcu_read_lock();
+
+    QTAILQ_FOREACH_RCU(kid, &r->req.bus->qbus.children, sibling) {
         DeviceState *qdev = kid->child;
         SCSIDevice *dev = SCSI_DEVICE(qdev);
 
@@ -433,7 +436,7 @@  static bool scsi_target_emulate_report_luns(SCSITargetReq *r)
     memset(r->buf, 0, len);
     stl_be_p(&r->buf[0], n);
     i = found_lun0 ? 8 : 16;
-    QTAILQ_FOREACH(kid, &r->req.bus->qbus.children, sibling) {
+    QTAILQ_FOREACH_RCU(kid, &r->req.bus->qbus.children, sibling) {
         DeviceState *qdev = kid->child;
         SCSIDevice *dev = SCSI_DEVICE(qdev);
 
@@ -442,6 +445,9 @@  static bool scsi_target_emulate_report_luns(SCSITargetReq *r)
             i += 8;
         }
     }
+
+    rcu_read_unlock();
+
     assert(i == n + 8);
     r->len = len;
     return true;
@@ -1584,12 +1590,15 @@  SCSIDevice *scsi_device_find(SCSIBus *bus, int channel, int id, int lun)
     BusChild *kid;
     SCSIDevice *target_dev = NULL;
 
-    QTAILQ_FOREACH(kid, &bus->qbus.children, sibling) {
+    rcu_read_lock();
+
+    QTAILQ_FOREACH_RCU(kid, &bus->qbus.children, sibling) {
         DeviceState *qdev = kid->child;
         SCSIDevice *dev = SCSI_DEVICE(qdev);
 
         if (dev->channel == channel && dev->id == id) {
             if (dev->lun == lun) {
+                rcu_read_unlock();
                 return dev;
             }
 
@@ -1603,6 +1612,8 @@  SCSIDevice *scsi_device_find(SCSIBus *bus, int channel, int id, int lun)
             }
         }
     }
+
+    rcu_read_unlock();
     return target_dev;
 }
 
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 472bbd233b..b0f4a35f81 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -367,12 +367,16 @@  static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
     case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
         target = req->req.tmf.lun[1];
         s->resetting++;
-        QTAILQ_FOREACH(kid, &s->bus.qbus.children, sibling) {
+
+        rcu_read_lock();
+        QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) {
              d = SCSI_DEVICE(kid->child);
              if (d->channel == 0 && d->id == target) {
                 qdev_reset_all(&d->qdev);
              }
         }
+        rcu_read_unlock();
+
         s->resetting--;
         break;
 
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index 1405b8a990..196253978b 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -3,6 +3,8 @@ 
 
 #include "qemu/queue.h"
 #include "qemu/bitmap.h"
+#include "qemu/rcu.h"
+#include "qemu/rcu_queue.h"
 #include "qom/object.h"
 #include "hw/hotplug.h"
 #include "hw/resettable.h"
@@ -218,6 +220,7 @@  struct BusClass {
 };
 
 typedef struct BusChild {
+    struct rcu_head rcu;
     DeviceState *child;
     int index;
     QTAILQ_ENTRY(BusChild) sibling;
diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h
index 38c9399cd4..58733f28e2 100644
--- a/include/hw/virtio/virtio-bus.h
+++ b/include/hw/virtio/virtio-bus.h
@@ -128,8 +128,11 @@  void virtio_bus_set_vdev_config(VirtioBusState *bus, uint8_t *config);
 static inline VirtIODevice *virtio_bus_get_device(VirtioBusState *bus)
 {
     BusState *qbus = &bus->parent_obj;
-    BusChild *kid = QTAILQ_FIRST(&qbus->children);
-    DeviceState *qdev = kid ? kid->child : NULL;
+    BusChild *kid;
+    DeviceState *qdev;
+
+    kid = QTAILQ_FIRST(&qbus->children);
+    qdev = kid ? kid->child : NULL;
 
     /* This is used on the data path, the cast is guaranteed
      * to succeed by the qdev machinery.