[10/11] block: let commit blockjob run in BDS AioContext

Message ID 1412182919-9550-11-git-send-email-stefanha@redhat.com
State New

Commit Message

Stefan Hajnoczi Oct. 1, 2014, 5:01 p.m. UTC
The commit block job must run in the BlockDriverState AioContext so that
it works with dataplane.

Acquire the AioContext in blockdev.c so starting the block job is safe.
One detail here is that the bdrv_drain_all() must be moved inside the
aio_context_acquire() region so requests cannot sneak in between the
drain and acquire.

The completion code in block/commit.c must perform backing chain
manipulation and bdrv_reopen() from the main loop.  Use
block_job_defer_to_main_loop() to achieve that.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/commit.c | 72 +++++++++++++++++++++++++++++++++++++---------------------
 blockdev.c     | 29 +++++++++++++++--------
 2 files changed, 66 insertions(+), 35 deletions(-)
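In outline, the blockdev.c side of this change follows the pattern below
(a simplified sketch of the patch at the end of this page; the function
name is illustrative and error handling is elided):

    /* Sketch: acquire the BDS AioContext before draining and starting
     * the job, and release it on every exit path. */
    static void start_commit_job_outline(BlockDriverState *bs, Error **errp)
    {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);

        /* Drain inside the acquired region so no request can sneak in
         * between the drain and the acquire. */
        bdrv_drain_all();

        /* ... validate top/base, then commit_start() ... */

        aio_context_release(aio_context);
    }

The job's completion work (bdrv_drop_intermediate(), bdrv_reopen()) then
runs in the main loop via block_job_defer_to_main_loop(), as shown in the
block/commit.c hunks below.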

Comments

Max Reitz Oct. 4, 2014, 9:28 p.m. UTC | #1
On 01.10.2014 19:01, Stefan Hajnoczi wrote:
> The commit block job must run in the BlockDriverState AioContext so that
> it works with dataplane.
>
> Acquire the AioContext in blockdev.c so starting the block job is safe.
> One detail here is that the bdrv_drain_all() must be moved inside the
> aio_context_acquire() region so requests cannot sneak in between the
> drain and acquire.

Hm, I see the intent, but in patch 5 you said bdrv_drain_all() should 
never be called outside of the main loop (at least that's how it 
appeared to me). Wouldn't it be enough to use bdrv_drain() on the source 
BDS, like in patch 9?

> The completion code in block/commit.c must perform backing chain
> manipulation and bdrv_reopen() from the main loop.  Use
> block_job_defer_to_main_loop() to achieve that.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>   block/commit.c | 72 +++++++++++++++++++++++++++++++++++++---------------------
>   blockdev.c     | 29 +++++++++++++++--------
>   2 files changed, 66 insertions(+), 35 deletions(-)
>
> diff --git a/block/commit.c b/block/commit.c
> index 91517d3..0fd05dc 100644
> --- a/block/commit.c
> +++ b/block/commit.c
> @@ -60,17 +60,50 @@ static int coroutine_fn commit_populate(BlockDriverState *bs,
>       return 0;
>   }
>   
> -static void coroutine_fn commit_run(void *opaque)
> +typedef struct {
> +    int ret;
> +} CommitCompleteData;
> +
> +static void commit_complete(BlockJob *job, void *opaque)
>   {
> -    CommitBlockJob *s = opaque;
> +    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
> +    CommitCompleteData *data = opaque;
>       BlockDriverState *active = s->active;
>       BlockDriverState *top = s->top;
>       BlockDriverState *base = s->base;
>       BlockDriverState *overlay_bs;
> +    int ret = data->ret;
> +
> +    if (!block_job_is_cancelled(&s->common) && ret == 0) {
> +        /* success */
> +        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
> +    }
> +
> +    /* restore base open flags here if appropriate (e.g., change the base back
> +     * to r/o). These reopens do not need to be atomic, since we won't abort
> +     * even on failure here */
> +    if (s->base_flags != bdrv_get_flags(base)) {
> +        bdrv_reopen(base, s->base_flags, NULL);
> +    }
> +    overlay_bs = bdrv_find_overlay(active, top);
> +    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
> +        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
> +    }
> +    g_free(s->backing_file_str);
> +    block_job_completed(&s->common, ret);
> +    g_free(data);
> +}
> +
> +static void coroutine_fn commit_run(void *opaque)
> +{
> +    CommitBlockJob *s = opaque;
> +    CommitCompleteData *data;
> +    BlockDriverState *top = s->top;
> +    BlockDriverState *base = s->base;
>       int64_t sector_num, end;
>       int ret = 0;
>       int n = 0;
> -    void *buf;
> +    void *buf = NULL;
>       int bytes_written = 0;
>       int64_t base_len;
>   
> @@ -78,18 +111,18 @@ static void coroutine_fn commit_run(void *opaque)
>   
>   
>       if (s->common.len < 0) {
> -        goto exit_restore_reopen;
> +        goto out;
>       }
>   
>       ret = base_len = bdrv_getlength(base);
>       if (base_len < 0) {
> -        goto exit_restore_reopen;
> +        goto out;
>       }
>   
>       if (base_len < s->common.len) {
>           ret = bdrv_truncate(base, s->common.len);
>           if (ret) {
> -            goto exit_restore_reopen;
> +            goto out;
>           }
>       }
>   
> @@ -128,7 +161,7 @@ wait:
>               if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
>                   s->on_error == BLOCKDEV_ON_ERROR_REPORT||
>                   (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
> -                goto exit_free_buf;
> +                goto out;
>               } else {
>                   n = 0;
>                   continue;
> @@ -140,27 +173,14 @@ wait:
>   
>       ret = 0;
>   
> -    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
> -        /* success */
> -        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
> +out:
> +    if (buf) {
> +        qemu_vfree(buf);
>       }

Is this new condition really necessary? However, it won't hurt, so:

Reviewed-by: Max Reitz <mreitz@redhat.com>

A general question regarding the assertions here and in patch 8: I tried
to break them, but I couldn't find a way. The way I tried was by
creating two devices in different threads with just one qcow2 behind
each of them, and then trying to attach one of those qcow2 BDS to the
other as a backing file. I couldn't find out how, but I guess this is
something we might want to support in the future. Can we actually be
sure that all of the BDS in one tree are always running in the same AIO
context? Are we already enforcing this?

And furthermore, basically all the calls to acquire an AIO context are 
of the form "aio_context = bdrv_get_aio_context(bs); 
aio_context_acquire(aio_context);". It is *extremely* unlikely if 
possible at all, but wouldn't it be possible to change the BDS's AIO 
context from another thread after the first function returned and before 
the lock is acquired? If that is really the case, I think we should have 
some atomic bdrv_acquire_aio_context() function.

Max
Stefan Hajnoczi Oct. 6, 2014, 9:30 a.m. UTC | #2
On Sat, Oct 04, 2014 at 11:28:22PM +0200, Max Reitz wrote:
> On 01.10.2014 19:01, Stefan Hajnoczi wrote:
> >The commit block job must run in the BlockDriverState AioContext so that
> >it works with dataplane.
> >
> >Acquire the AioContext in blockdev.c so starting the block job is safe.
> >One detail here is that the bdrv_drain_all() must be moved inside the
> >aio_context_acquire() region so requests cannot sneak in between the
> >drain and acquire.
> 
> Hm, I see the intent, but in patch 5 you said bdrv_drain_all() should never
> be called outside of the main loop (at least that's how it appeared to me).
> Wouldn't it be enough to use bdrv_drain() on the source BDS, like in patch
> 9?

There is no contradiction here because qmp_block_commit() is invoked by
the QEMU monitor from the main loop.

The problem with bdrv_drain_all() is that it acquires AioContexts.  If
called outside the main loop without taking special care, it could
result in lock ordering problems (e.g. two threads trying to acquire all
AioContexts at the same time while already holding their respective
contexts).

qmp_block_commit() is just traditional QEMU global mutex code so it is
allowed to call bdrv_drain_all().
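
To make the ordering hazard concrete, here is a minimal stand-alone
illustration of the AB-BA pattern (plain pthread mutexes standing in for
AioContexts; this is not QEMU code, just a sketch of the scenario
described above):

    /* build: cc -pthread abba.c
     * Each "IOThread" holds its own context and then, as a drain-all
     * would, tries to acquire the other one.  The program deadlocks. */
    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t ctx_a = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t ctx_b = PTHREAD_MUTEX_INITIALIZER;

    static void *iothread1(void *arg)
    {
        pthread_mutex_lock(&ctx_a);      /* holds its own context */
        usleep(100 * 1000);              /* let the other thread grab ctx_b */
        printf("iothread1: acquiring ctx_b...\n");
        pthread_mutex_lock(&ctx_b);      /* blocks: ctx_b held by iothread2 */
        pthread_mutex_unlock(&ctx_b);
        pthread_mutex_unlock(&ctx_a);
        return NULL;
    }

    static void *iothread2(void *arg)
    {
        pthread_mutex_lock(&ctx_b);      /* holds its own context */
        usleep(100 * 1000);
        printf("iothread2: acquiring ctx_a...\n");
        pthread_mutex_lock(&ctx_a);      /* blocks: ctx_a held by iothread1 */
        pthread_mutex_unlock(&ctx_a);
        pthread_mutex_unlock(&ctx_b);
        return NULL;
    }

    int main(void)
    {
        pthread_t t1, t2;
        pthread_create(&t1, NULL, iothread1, NULL);
        pthread_create(&t2, NULL, iothread2, NULL);
        pthread_join(t1, NULL);          /* never returns: AB-BA deadlock */
        pthread_join(t2, NULL);
        return 0;
    }

Restricting bdrv_drain_all() to the main loop means there is a single
acquire-all caller, so no global acquisition order needs to be defined.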

> >@@ -140,27 +173,14 @@ wait:
> >      ret = 0;
> >-    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
> >-        /* success */
> >-        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
> >+out:
> >+    if (buf) {
> >+        qemu_vfree(buf);
> >      }
> 
> Is this new condition really necessary? However, it won't hurt, so:

This was a mistake.  Since commit
94c8ff3a01d9bd1005f066a0ee3fe43c842a43b7 ("w32: Make qemu_vfree() accept
NULL like the POSIX implementation") it is no longer necessary to check
for NULL pointers.

You can't teach an old dog new tricks :).

Thanks, will fix in the next revision!

> Reviewed-by: Max Reitz <mreitz@redhat.com>
> 
> A general question regarding the assertions here and in patch 8: I tried to
> break them, but I couldn't find a way. The way I tried was by creating two
> devices in different threads with just one qcow2 behind each of them, and
> then trying to attach one of those qcow2 BDS to the other as a backing file.
> I couldn't find out how, but I guess this is something we might want to
> support in the future. Can we actually be sure that all of the BDS in one
> tree are always running in the same AIO context? Are we already enforcing
> this?

bdrv_set_aio_context() is recursive so it also sets all the child nodes.
That is the only mechanism to ensure AioContext is consistent across
nodes.

When the BDS graph is manipulated (e.g. attaching new roots, swapping
nodes, etc) we don't perform checks today.

Markus has asked that I add the appropriate assertions so errors are
caught early.  I haven't done that yet but it's a good next step.

As far as I'm aware, these patches don't introduce cases where we would
make the AioContext in the graph inconsistent.  So I see the AioContext
consistency assertions as a separate patch series (which I will work on
next...hopefully not to discover horrible problems!).
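
For reference, the kind of assertion being discussed could look roughly
like the following (a hypothetical helper, not part of this series;
bs->file and bs->backing_hd are the child pointers in the tree as of this
series):

    /* Walk a BDS tree and assert every node shares one AioContext. */
    static void assert_tree_aio_context(BlockDriverState *bs, AioContext *ctx)
    {
        if (!bs) {
            return;
        }
        assert(bdrv_get_aio_context(bs) == ctx);
        assert_tree_aio_context(bs->file, ctx);
        assert_tree_aio_context(bs->backing_hd, ctx);
    }

    /* e.g. after graph manipulation:
     *     assert_tree_aio_context(bs, bdrv_get_aio_context(bs));
     */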

> And furthermore, basically all the calls to acquire an AIO context are of
> the form "aio_context = bdrv_get_aio_context(bs);
> aio_context_acquire(aio_context);". It is *extremely* unlikely if possible
> at all, but wouldn't it be possible to change the BDS's AIO context from
> another thread after the first function returned and before the lock is
> acquired? If that is really the case, I think we should have some atomic
> bdrv_acquire_aio_context() function.

No, because only the main loop calls bdrv_set_aio_context().  At the
moment the case you mentioned cannot happen.

Ultimately, we should move away from "this only works in the main loop"
constraints.  In order to provide atomic BDS AioContext acquire we need
a global root that is thread-safe.  That doesn't exist today -
bdrv_states is protected by the QEMU global mutex only.

I thought about adding the infrastructure in this patch series but it is
not necessary yet and would make the series more complicated.

The idea is:

 * Add bdrv_states_lock to protect the global list of BlockDriverStates
 * Integrate bdrv_ref()/bdrv_unref() as well as bdrv_get_aio_context()
   so they are atomic and protected by the bdrv_states_lock

So bdrv_find() and other functions that access bdrv_states become the
entry points to acquiring BlockDriverStates in a thread-safe fashion.
bdrv_unref() will need rethinking too to prevent races between freeing a
BDS and bdrv_find().
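
One possible shape for such an atomic helper, assuming the proposed
bdrv_states_lock existed (purely a sketch; none of these names are in the
tree today):

    static AioContext *bdrv_acquire_aio_context(BlockDriverState *bs)
    {
        for (;;) {
            AioContext *ctx;

            qemu_mutex_lock(&bdrv_states_lock);   /* proposed global lock */
            ctx = bdrv_get_aio_context(bs);
            qemu_mutex_unlock(&bdrv_states_lock);

            aio_context_acquire(ctx);

            /* The context may have been switched while we were blocked in
             * aio_context_acquire(); retry until it is stable. */
            if (ctx == bdrv_get_aio_context(bs)) {
                return ctx;
            }
            aio_context_release(ctx);
        }
    }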

Can you think of a place where we need this today?  I haven't found one
yet but would like one to develop the code against.

Stefan
Max Reitz Oct. 7, 2014, 3:18 p.m. UTC | #3
On 06.10.2014 11:30, Stefan Hajnoczi wrote:
> On Sat, Oct 04, 2014 at 11:28:22PM +0200, Max Reitz wrote:
>> On 01.10.2014 19:01, Stefan Hajnoczi wrote:
>>> The commit block job must run in the BlockDriverState AioContext so that
>>> it works with dataplane.
>>>
>>> Acquire the AioContext in blockdev.c so starting the block job is safe.
>>> One detail here is that the bdrv_drain_all() must be moved inside the
>>> aio_context_acquire() region so requests cannot sneak in between the
>>> drain and acquire.
>> Hm, I see the intent, but in patch 5 you said bdrv_drain_all() should never
>> be called outside of the main loop (at least that's how it appeared to me).
>> Wouldn't it be enough to use bdrv_drain() on the source BDS, like in patch
>> 9?
> There is no contradiction here because qmp_block_commit() is invoked by
> the QEMU monitor from the main loop.
>
> The problem with bdrv_drain_all() is that it acquires AioContexts.  If
> called outside the main loop without taking special care, it could
> result in lock ordering problems (e.g. two threads trying to acquire all
> AioContexts at the same time while already holding their respective
> contexts).
>
> qmp_block_commit() is just traditional QEMU global mutex code so it is
> allowed to call bdrv_drain_all().

Hm, okay then.

>>> @@ -140,27 +173,14 @@ wait:
>>>       ret = 0;
>>> -    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
>>> -        /* success */
>>> -        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
>>> +out:
>>> +    if (buf) {
>>> +        qemu_vfree(buf);
>>>       }
>> Is this new condition really necessary? However, it won't hurt, so:
> This was a mistake.  Since commit
> 94c8ff3a01d9bd1005f066a0ee3fe43c842a43b7 ("w32: Make qemu_vfree() accept
> NULL like the POSIX implementation") it is no longer necessary to check
> for NULL pointers.
>
> You can't teach an old dog new tricks :).
>
> Thanks, will fix in the next revision!
>
>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>
>> A general question regarding the assertions here and in patch 8: I tried to
>> break them, but I couldn't find a way. The way I tried was by creating two
>> devices in different threads with just one qcow2 behind each of them, and
>> then trying to attach one of those qcow2 BDS to the other as a backing file.
>> I couldn't find out how, but I guess this is something we might want to
>> support in the future. Can we actually be sure that all of the BDS in one
>> tree are always running in the same AIO context? Are we already enforcing
>> this?
> bdrv_set_aio_context() is recursive so it also sets all the child nodes.
> That is the only mechanism to ensure AioContext is consistent across
> nodes.
>
> When the BDS graph is manipulated (e.g. attaching new roots, swapping
> nodes, etc) we don't perform checks today.
>
> Markus has asked that I add the appropriate assertions so errors are
> caught early.  I haven't done that yet but it's a good next step.

Okay, seems good to me. It's not possible to break them now, and if it
ever becomes possible, the assertions will at least catch it.

> As far as I'm aware, these patches don't introduce cases where we would
> make the AioContext in the graph inconsistent.  So I see the AioContext
> consistency assertions as a separate patch series (which I will work on
> next...hopefully not to discover horrible problems!).
>
>> And furthermore, basically all the calls to acquire an AIO context are of
>> the form "aio_context = bdrv_get_aio_context(bs);
>> aio_context_acquire(aio_context);". It is *extremely* unlikely if possible
>> at all, but wouldn't it be possible to change the BDS's AIO context from
>> another thread after the first function returned and before the lock is
>> acquired? If that is really the case, I think we should have some atomic
>> bdrv_acquire_aio_context() function.
> No, because only the main loop calls bdrv_set_aio_context().  At the
> moment the case you mentioned cannot happen.
>
> Ultimately, we should move away from "this only works in the main loop"
> constraints.  In order to provide atomic BDS AioContext acquire we need
> a global root that is thread-safe.  That doesn't exist today -
> bdrv_states is protected by the QEMU global mutex only.
>
> I thought about adding the infrastructure in this patch series but it is
> not necessary yet and would make the series more complicated.
>
> The idea is:
>
>   * Add bdrv_states_lock to protect the global list of BlockDriverStates
>   * Integrate bdrv_ref()/bdrv_unref() as well as bdrv_get_aio_context()
>     so they are atomic and protected by the bdrv_states_lock
>
> So bdrv_find() and other functions that access bdrv_states become the
> entry points to acquiring BlockDriverStates in a thread-safe fashion.
> bdrv_unref() will need rethinking too to prevent races between freeing a
> BDS and bdrv_find().
>
> Can you think of a place where we need this today?  I haven't found one
> yet but would like one to develop the code against.

No, I can't think of anything, as long as QMP commands always arrive 
through the main loop.

Thank you for your explanations!

Max

Patch

diff --git a/block/commit.c b/block/commit.c
index 91517d3..0fd05dc 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -60,17 +60,50 @@  static int coroutine_fn commit_populate(BlockDriverState *bs,
     return 0;
 }
 
-static void coroutine_fn commit_run(void *opaque)
+typedef struct {
+    int ret;
+} CommitCompleteData;
+
+static void commit_complete(BlockJob *job, void *opaque)
 {
-    CommitBlockJob *s = opaque;
+    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
+    CommitCompleteData *data = opaque;
     BlockDriverState *active = s->active;
     BlockDriverState *top = s->top;
     BlockDriverState *base = s->base;
     BlockDriverState *overlay_bs;
+    int ret = data->ret;
+
+    if (!block_job_is_cancelled(&s->common) && ret == 0) {
+        /* success */
+        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
+    }
+
+    /* restore base open flags here if appropriate (e.g., change the base back
+     * to r/o). These reopens do not need to be atomic, since we won't abort
+     * even on failure here */
+    if (s->base_flags != bdrv_get_flags(base)) {
+        bdrv_reopen(base, s->base_flags, NULL);
+    }
+    overlay_bs = bdrv_find_overlay(active, top);
+    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
+    }
+    g_free(s->backing_file_str);
+    block_job_completed(&s->common, ret);
+    g_free(data);
+}
+
+static void coroutine_fn commit_run(void *opaque)
+{
+    CommitBlockJob *s = opaque;
+    CommitCompleteData *data;
+    BlockDriverState *top = s->top;
+    BlockDriverState *base = s->base;
     int64_t sector_num, end;
     int ret = 0;
     int n = 0;
-    void *buf;
+    void *buf = NULL;
     int bytes_written = 0;
     int64_t base_len;
 
@@ -78,18 +111,18 @@  static void coroutine_fn commit_run(void *opaque)
 
 
     if (s->common.len < 0) {
-        goto exit_restore_reopen;
+        goto out;
     }
 
     ret = base_len = bdrv_getlength(base);
     if (base_len < 0) {
-        goto exit_restore_reopen;
+        goto out;
     }
 
     if (base_len < s->common.len) {
         ret = bdrv_truncate(base, s->common.len);
         if (ret) {
-            goto exit_restore_reopen;
+            goto out;
         }
     }
 
@@ -128,7 +161,7 @@  wait:
             if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
                 s->on_error == BLOCKDEV_ON_ERROR_REPORT||
                 (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
-                goto exit_free_buf;
+                goto out;
             } else {
                 n = 0;
                 continue;
@@ -140,27 +173,14 @@  wait:
 
     ret = 0;
 
-    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
-        /* success */
-        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
+out:
+    if (buf) {
+        qemu_vfree(buf);
     }
 
-exit_free_buf:
-    qemu_vfree(buf);
-
-exit_restore_reopen:
-    /* restore base open flags here if appropriate (e.g., change the base back
-     * to r/o). These reopens do not need to be atomic, since we won't abort
-     * even on failure here */
-    if (s->base_flags != bdrv_get_flags(base)) {
-        bdrv_reopen(base, s->base_flags, NULL);
-    }
-    overlay_bs = bdrv_find_overlay(active, top);
-    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
-        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
-    }
-    g_free(s->backing_file_str);
-    block_job_completed(&s->common, ret);
+    data = g_malloc(sizeof(*data));
+    data->ret = ret;
+    block_job_defer_to_main_loop(&s->common, commit_complete, data);
 }
 
 static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
diff --git a/blockdev.c b/blockdev.c
index 5cf2058..d9333d7 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1987,6 +1987,7 @@  void qmp_block_commit(const char *device,
 {
     BlockDriverState *bs;
     BlockDriverState *base_bs, *top_bs;
+    AioContext *aio_context;
     Error *local_err = NULL;
     /* This will be part of the QMP command, if/when the
      * BlockdevOnError change for blkmirror makes it in
@@ -1997,9 +1998,6 @@  void qmp_block_commit(const char *device,
         speed = 0;
     }
 
-    /* drain all i/o before commits */
-    bdrv_drain_all();
-
     /* Important Note:
      *  libvirt relies on the DeviceNotFound error class in order to probe for
      *  live commit feature versions; for this to work, we must make sure to
@@ -2011,8 +2009,14 @@  void qmp_block_commit(const char *device,
         return;
     }
 
+    aio_context = bdrv_get_aio_context(bs);
+    aio_context_acquire(aio_context);
+
+    /* drain all i/o before commits */
+    bdrv_drain_all();
+
     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, errp)) {
-        return;
+        goto out;
     }
 
     /* default top_bs is the active layer */
@@ -2026,9 +2030,11 @@  void qmp_block_commit(const char *device,
 
     if (top_bs == NULL) {
         error_setg(errp, "Top image file %s not found", top ? top : "NULL");
-        return;
+        goto out;
     }
 
+    assert(bdrv_get_aio_context(top_bs) == aio_context);
+
     if (has_base && base) {
         base_bs = bdrv_find_backing_image(top_bs, base);
     } else {
@@ -2037,20 +2043,22 @@  void qmp_block_commit(const char *device,
 
     if (base_bs == NULL) {
         error_set(errp, QERR_BASE_NOT_FOUND, base ? base : "NULL");
-        return;
+        goto out;
     }
 
+    assert(bdrv_get_aio_context(base_bs) == aio_context);
+
     /* Do not allow attempts to commit an image into itself */
     if (top_bs == base_bs) {
         error_setg(errp, "cannot commit an image into itself");
-        return;
+        goto out;
     }
 
     if (top_bs == bs) {
         if (has_backing_file) {
             error_setg(errp, "'backing-file' specified,"
                              " but 'top' is the active layer");
-            return;
+            goto out;
         }
         commit_active_start(bs, base_bs, speed, on_error, block_job_cb,
                             bs, &local_err);
@@ -2060,8 +2068,11 @@  void qmp_block_commit(const char *device,
     }
     if (local_err != NULL) {
         error_propagate(errp, local_err);
-        return;
+        goto out;
     }
+
+out:
+    aio_context_release(aio_context);
 }
 
 void qmp_drive_backup(const char *device, const char *target,