diff mbox

[Qemu-block,4/4] virtio-blk: Clean up start/stop with mutex and BH

Message ID 56F18955.4060005@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini March 22, 2016, 6:05 p.m. UTC
On 22/03/2016 13:52, Fam Zheng wrote:
>> You're right.  After unrealizing virtio_blk_data_plane_stop has set of
>> vblk->dataplane_started = false, so that's covered.  However, you still
>> need an object_ref/object_object_unref pair.
> 
> Is it safe to call object_unref outside BQL?

Hmm, no.

However, perhaps we can fix the code without a bottom half, using the 
assertion in virtio_blk_data_plane_start to ensure that there is no 
unwanted reentrancy.

Conny's patches are also enough to mask the bug for me, so my tests
do not say much.  But in any case the following patch works here too
instead of Fam's 4/4; it is a mess including some other experiments,
but I'm including it as is because that's what I tested and it's
dinner time now.

Even if it fails for you or Tu Bo, perhaps the backtraces say
something.

Thanks,

Paolo

Comments

Cornelia Huck March 23, 2016, 8:10 a.m. UTC | #1
On Tue, 22 Mar 2016 19:05:09 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 22/03/2016 13:52, Fam Zheng wrote:
> >> You're right.  After unrealizing virtio_blk_data_plane_stop has set of
> >> vblk->dataplane_started = false, so that's covered.  However, you still
> >> need an object_ref/object_object_unref pair.
> > 
> > Is it safe to call object_unref outside BQL?
> 
> Hmm, no.
> 
> However, perhaps we can fix the code without a bottom half, using the 
> assertion in virtio_blk_data_plane_start to ensure that there is no 
> unwanted reentrancy.
> 
> Conny's patches are also enough to mask the bug for me, so my tests
> do not say much.  But in any case the following patch works here too
> instead of Fam's 4/4; it is a mess including some other experiments,
> but I'm including it as is because that's what I tested and it's
> dinner time now.
> 
> Even if it fails for you or Tu Bo, perhaps the backtraces say
> something.
> 
> Thanks,
> 
> Paolo
> 
> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> index 1b2d5fa..5f72671 100644
> --- a/hw/block/dataplane/virtio-blk.c
> +++ b/hw/block/dataplane/virtio-blk.c
> @@ -26,8 +26,7 @@
>  #include "qom/object_interfaces.h"
> 
>  struct VirtIOBlockDataPlane {
> -    bool starting;
> -    bool stopping;
> +    int starting;
>      bool disabled;
> 
>      VirtIOBlkConf *conf;
> @@ -192,11 +191,7 @@ void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
>      VirtIOBlock *vblk = VIRTIO_BLK(s->vdev);
>      int r;
> 
> -    if (vblk->dataplane_started || s->starting) {
> -        return;
> -    }
> -
> -    s->starting = true;
> +    assert(atomic_fetch_inc(&s->starting) == 0);
>      s->vq = virtio_get_queue(s->vdev, 0);
> 
>      /* Set up guest notifier (irq) */
> @@ -215,27 +210,28 @@ void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
>          goto fail_host_notifier;
>      }
> 
> -    s->starting = false;
> -    vblk->dataplane_started = true;
>      trace_virtio_blk_data_plane_start(s);
> 
>      blk_set_aio_context(s->conf->conf.blk, s->ctx);
> 
> -    /* Kick right away to begin processing requests already in vring */
> -    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
> +    vblk->dataplane_started = true;
> 
> -    /* Get this show started by hooking up our callbacks */
> +    /* Get this show started by hooking up our callbacks.  */
>      aio_context_acquire(s->ctx);
>      virtio_queue_aio_set_host_notifier_handler(s->vq, s->ctx, true, true);
>      aio_context_release(s->ctx);
> +    atomic_dec(&s->starting);
> +
> +    /* Kick right away to begin processing requests already in vring */
> +    event_notifier_set(virtio_queue_get_host_notifier(s->vq));

I'm wondering whether moving this event_notifier_set() masks something?
IOW, may we run into trouble if the event notifier is set from some
other path before the callbacks are set up properly?

>      return;
> 
>    fail_host_notifier:
>      k->set_guest_notifiers(qbus->parent, 1, false);
>    fail_guest_notifiers:
>      s->disabled = true;
> -    s->starting = false;
>      vblk->dataplane_started = true;
> +    atomic_dec(&s->starting);
>  }
> 
>  /* Context: QEMU global mutex held */
> @@ -245,7 +241,7 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
>      VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
>      VirtIOBlock *vblk = VIRTIO_BLK(s->vdev);
> 
> -    if (!vblk->dataplane_started || s->stopping) {
> +    if (!vblk->dataplane_started) {

No fear of reentrancy here?

>          return;
>      }
> 
> @@ -255,7 +251,7 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
>          vblk->dataplane_started = false;
>          return;
>      }
> -    s->stopping = true;
> +
>      trace_virtio_blk_data_plane_stop(s);
> 
>      aio_context_acquire(s->ctx);
> @@ -274,5 +270,4 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
>      k->set_guest_notifiers(qbus->parent, 1, false);
> 
>      vblk->dataplane_started = false;
> -    s->stopping = false;
>  }
>
Paolo Bonzini March 23, 2016, 9:08 a.m. UTC | #2
On 23/03/2016 09:10, Cornelia Huck wrote:
>> -    /* Kick right away to begin processing requests already in vring */
>> -    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
>> +    vblk->dataplane_started = true;
>>
>> -    /* Get this show started by hooking up our callbacks */
>> +    /* Get this show started by hooking up our callbacks.  */
>>      aio_context_acquire(s->ctx);
>>      virtio_queue_aio_set_host_notifier_handler(s->vq, s->ctx, true, true);
>>      aio_context_release(s->ctx);
>> +    atomic_dec(&s->starting);
>> +
>> +    /* Kick right away to begin processing requests already in vring */
>> +    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
> 
> I'm wondering whether moving this event_notifier_set() masks something?
> IOW, may we run into trouble if the event notifier is set from some
> other path before the callbacks are set up properly?

The reentrancy check should catch that...  But:

1) the patch really makes no difference, your fix is enough for me

2) vblk->dataplane_started becomes true before the callbacks are set;
that should be enough.

3) this matches what I tested, but it would of course be better if the
assertions on s->starting suffice

>> -    if (!vblk->dataplane_started || s->stopping) {
>> +    if (!vblk->dataplane_started) {
> 
> No fear of reentrancy here?

No, because this is only invoked from reset, hence only from the CPU
thread and only under the BQL.

On start, reentrancy happens between iothread (running outside BQL) and
a CPU thread (running within BQL).

Paolo

>>          return;
>>      }
>>
>> @@ -255,7 +251,7 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
>>          vblk->dataplane_started = false;
>>          return;
>>      }
>> -    s->stopping = true;
>> +
>>      trace_virtio_blk_data_plane_stop(s);
>>
>>      aio_context_acquire(s->ctx);
>> @@ -274,5 +270,4 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
>>      k->set_guest_notifiers(qbus->parent, 1, false);
>>
>>      vblk->dataplane_started = false;
>> -    s->stopping = false;
>>  }
>>
>
Christian Borntraeger March 23, 2016, 9:12 a.m. UTC | #3
On 03/23/2016 10:08 AM, Paolo Bonzini wrote:
> 
> 
> On 23/03/2016 09:10, Cornelia Huck wrote:
>>> -    /* Kick right away to begin processing requests already in vring */
>>> -    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
>>> +    vblk->dataplane_started = true;
>>>
>>> -    /* Get this show started by hooking up our callbacks */
>>> +    /* Get this show started by hooking up our callbacks.  */
>>>      aio_context_acquire(s->ctx);
>>>      virtio_queue_aio_set_host_notifier_handler(s->vq, s->ctx, true, true);
>>>      aio_context_release(s->ctx);
>>> +    atomic_dec(&s->starting);
>>> +
>>> +    /* Kick right away to begin processing requests already in vring */
>>> +    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
>>
>> I'm wondering whether moving this event_notifier_set() masks something?
>> IOW, may we run into trouble if the event notifier is set from some
>> other path before the callbacks are set up properly?
> 
> The reentrancy check should catch that...  But:
> 
> 1) the patch really makes no difference, your fix is enough for me


Tu Bo, can you test with master + Cornelias 6 refactoring patches and nothing on top?


 
> 2) vblk->dataplane_started becomes true before the callbacks are set;
> that should be enough.
> 
> 3) this matches what I tested, but it would of course be better if the
> assertions on s->starting suffice
> 
>>> -    if (!vblk->dataplane_started || s->stopping) {
>>> +    if (!vblk->dataplane_started) {
>>
>> No fear of reentrancy here?
> 
> No, because this is only invoked from reset, hence only from the CPU
> thread and only under the BQL.
> 
> On start, reentrancy happens between iothread (running outside BQL) and
> a CPU thread (running within BQL).
> 
> Paolo
> 
>>>          return;
>>>      }
>>>
>>> @@ -255,7 +251,7 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
>>>          vblk->dataplane_started = false;
>>>          return;
>>>      }
>>> -    s->stopping = true;
>>> +
>>>      trace_virtio_blk_data_plane_stop(s);
>>>
>>>      aio_context_acquire(s->ctx);
>>> @@ -274,5 +270,4 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
>>>      k->set_guest_notifiers(qbus->parent, 1, false);
>>>
>>>      vblk->dataplane_started = false;
>>> -    s->stopping = false;
>>>  }
>>>
>>
>
Bo Tu March 24, 2016, 8:19 a.m. UTC | #4
Hi Christian:

On 03/23/2016 05:12 PM, Christian Borntraeger wrote:
> On 03/23/2016 10:08 AM, Paolo Bonzini wrote:
>>
>>
>> On 23/03/2016 09:10, Cornelia Huck wrote:
>>>> -    /* Kick right away to begin processing requests already in vring */
>>>> -    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
>>>> +    vblk->dataplane_started = true;
>>>>
>>>> -    /* Get this show started by hooking up our callbacks */
>>>> +    /* Get this show started by hooking up our callbacks.  */
>>>>       aio_context_acquire(s->ctx);
>>>>       virtio_queue_aio_set_host_notifier_handler(s->vq, s->ctx, true, true);
>>>>       aio_context_release(s->ctx);
>>>> +    atomic_dec(&s->starting);
>>>> +
>>>> +    /* Kick right away to begin processing requests already in vring */
>>>> +    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
>>>
>>> I'm wondering whether moving this event_notifier_set() masks something?
>>> IOW, may we run into trouble if the event notifier is set from some
>>> other path before the callbacks are set up properly?
>>
>> The reentrancy check should catch that...  But:
>>
>> 1) the patch really makes no difference, your fix is enough for me
>
>
> Tu Bo, can you test with master + Cornelias 6 refactoring patches and nothing on top?

With qemu master + Cornelia's 6 refactoring patches and nothing on top, 
I did NOT see crash so far.

>
>
>
>> 2) vblk->dataplane_started becomes true before the callbacks are set;
>> that should be enough.
>>
>> 3) this matches what I tested, but it would of course be better if the
>> assertions on s->starting suffice
>>
>>>> -    if (!vblk->dataplane_started || s->stopping) {
>>>> +    if (!vblk->dataplane_started) {
>>>
>>> No fear of reentrancy here?
>>
>> No, because this is only invoked from reset, hence only from the CPU
>> thread and only under the BQL.
>>
>> On start, reentrancy happens between iothread (running outside BQL) and
>> a CPU thread (running within BQL).
>>
>> Paolo
>>
>>>>           return;
>>>>       }
>>>>
>>>> @@ -255,7 +251,7 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
>>>>           vblk->dataplane_started = false;
>>>>           return;
>>>>       }
>>>> -    s->stopping = true;
>>>> +
>>>>       trace_virtio_blk_data_plane_stop(s);
>>>>
>>>>       aio_context_acquire(s->ctx);
>>>> @@ -274,5 +270,4 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
>>>>       k->set_guest_notifiers(qbus->parent, 1, false);
>>>>
>>>>       vblk->dataplane_started = false;
>>>> -    s->stopping = false;
>>>>   }
>>>>
>>>
>>
>
Cornelia Huck March 24, 2016, 8:32 a.m. UTC | #5
On Thu, 24 Mar 2016 16:19:41 +0800
tu bo <tubo@linux.vnet.ibm.com> wrote:

> Hi Christian:
> 
> On 03/23/2016 05:12 PM, Christian Borntraeger wrote:
> > On 03/23/2016 10:08 AM, Paolo Bonzini wrote:
> >>
> >>
> >> On 23/03/2016 09:10, Cornelia Huck wrote:
> >>>> -    /* Kick right away to begin processing requests already in vring */
> >>>> -    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
> >>>> +    vblk->dataplane_started = true;
> >>>>
> >>>> -    /* Get this show started by hooking up our callbacks */
> >>>> +    /* Get this show started by hooking up our callbacks.  */
> >>>>       aio_context_acquire(s->ctx);
> >>>>       virtio_queue_aio_set_host_notifier_handler(s->vq, s->ctx, true, true);
> >>>>       aio_context_release(s->ctx);
> >>>> +    atomic_dec(&s->starting);
> >>>> +
> >>>> +    /* Kick right away to begin processing requests already in vring */
> >>>> +    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
> >>>
> >>> I'm wondering whether moving this event_notifier_set() masks something?
> >>> IOW, may we run into trouble if the event notifier is set from some
> >>> other path before the callbacks are set up properly?
> >>
> >> The reentrancy check should catch that...  But:
> >>
> >> 1) the patch really makes no difference, your fix is enough for me
> >
> >
> > Tu Bo, can you test with master + Cornelias 6 refactoring patches and nothing on top?
> 
> With qemu master + Cornelia's 6 refactoring patches and nothing on top, 
> I did NOT see crash so far.

Cool, thanks for testing!

I'll re-send my patches with some added interface doc in patch 1. Stay
tuned.
Cornelia Huck March 24, 2016, 8:47 a.m. UTC | #6
On Thu, 24 Mar 2016 09:32:30 +0100
Cornelia Huck <cornelia.huck@de.ibm.com> wrote:

> I'll re-send my patches with some added interface doc in patch 1. Stay
> tuned.

Grr. Unfortunately, this fails for _me_ now (-EEXIST after reboot).
Debugging.
Cornelia Huck March 24, 2016, 9:31 a.m. UTC | #7
On Thu, 24 Mar 2016 09:47:56 +0100
Cornelia Huck <cornelia.huck@de.ibm.com> wrote:

> On Thu, 24 Mar 2016 09:32:30 +0100
> Cornelia Huck <cornelia.huck@de.ibm.com> wrote:
> 
> > I'll re-send my patches with some added interface doc in patch 1. Stay
> > tuned.
> 
> Grr. Unfortunately, this fails for _me_ now (-EEXIST after reboot).
> Debugging.

Good news is that I think I understand what happens. Bad news is that
we can scratch all of the previous testing :(

My patchset had a typo (check for !disabled instead of disabled). After
I fixed that, the second assignment of the ioeventfd started failing
(that's what changed when I started passing assign in stop_ioeventfd)
with -EEXIST as the previous ioeventfd is of course still assigned.

What we actually want is to keep the ioeventfd assigned, not add a new
one. But we still want adding a new ioeventfd to fail in case of
collisions.

I think we need to track whether we already assigned an ioeventfd and
don't re-register in that case. I'll try to cook something up.
diff mbox

Patch

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 1b2d5fa..5f72671 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -26,8 +26,7 @@ 
 #include "qom/object_interfaces.h"
 
 struct VirtIOBlockDataPlane {
-    bool starting;
-    bool stopping;
+    int starting;
     bool disabled;
 
     VirtIOBlkConf *conf;
@@ -192,11 +191,7 @@  void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
     VirtIOBlock *vblk = VIRTIO_BLK(s->vdev);
     int r;
 
-    if (vblk->dataplane_started || s->starting) {
-        return;
-    }
-
-    s->starting = true;
+    assert(atomic_fetch_inc(&s->starting) == 0);
     s->vq = virtio_get_queue(s->vdev, 0);
 
     /* Set up guest notifier (irq) */
@@ -215,27 +210,28 @@  void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
         goto fail_host_notifier;
     }
 
-    s->starting = false;
-    vblk->dataplane_started = true;
     trace_virtio_blk_data_plane_start(s);
 
     blk_set_aio_context(s->conf->conf.blk, s->ctx);
 
-    /* Kick right away to begin processing requests already in vring */
-    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
+    vblk->dataplane_started = true;
 
-    /* Get this show started by hooking up our callbacks */
+    /* Get this show started by hooking up our callbacks.  */
     aio_context_acquire(s->ctx);
     virtio_queue_aio_set_host_notifier_handler(s->vq, s->ctx, true, true);
     aio_context_release(s->ctx);
+    atomic_dec(&s->starting);
+
+    /* Kick right away to begin processing requests already in vring */
+    event_notifier_set(virtio_queue_get_host_notifier(s->vq));
     return;
 
   fail_host_notifier:
     k->set_guest_notifiers(qbus->parent, 1, false);
   fail_guest_notifiers:
     s->disabled = true;
-    s->starting = false;
     vblk->dataplane_started = true;
+    atomic_dec(&s->starting);
 }
 
 /* Context: QEMU global mutex held */
@@ -245,7 +241,7 @@  void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
     VirtIOBlock *vblk = VIRTIO_BLK(s->vdev);
 
-    if (!vblk->dataplane_started || s->stopping) {
+    if (!vblk->dataplane_started) {
         return;
     }
 
@@ -255,7 +251,7 @@  void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
         vblk->dataplane_started = false;
         return;
     }
-    s->stopping = true;
+
     trace_virtio_blk_data_plane_stop(s);
 
     aio_context_acquire(s->ctx);
@@ -274,5 +270,4 @@  void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
     k->set_guest_notifiers(qbus->parent, 1, false);
 
     vblk->dataplane_started = false;
-    s->stopping = false;
 }