diff mbox series

[v4,04/15] block/commit: refactor commit to use job callbacks

Message ID 20180904170930.28619-5-jsnow@redhat.com
State New
Headers show
Series jobs: Job Exit Refactoring Pt 2 | expand

Commit Message

John Snow Sept. 4, 2018, 5:09 p.m. UTC
Use the component callbacks: prepare, abort, and clean.

NB: prepare is only called when the job has not yet failed;
and abort can be called after prepare.

complete -> prepare -> abort -> clean
complete -> abort -> clean

Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 block/commit.c | 90 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 41 deletions(-)

Comments

Jeff Cody Sept. 4, 2018, 6:46 p.m. UTC | #1
On Tue, Sep 04, 2018 at 01:09:19PM -0400, John Snow wrote:
> Use the component callbacks; prepare, abort, and clean.
> 
> NB: prepare is only called when the job has not yet failed;
> and abort can be called after prepare.
> 
> complete -> prepare -> abort -> clean
> complete -> abort -> clean
> 
> Signed-off-by: John Snow <jsnow@redhat.com>
> Reviewed-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/commit.c | 90 ++++++++++++++++++++++++++++++++--------------------------
>  1 file changed, 49 insertions(+), 41 deletions(-)
> 
> diff --git a/block/commit.c b/block/commit.c
> index b6e8969877..eb3941e545 100644
> --- a/block/commit.c
> +++ b/block/commit.c
> @@ -36,6 +36,7 @@ typedef struct CommitBlockJob {
>      BlockDriverState *commit_top_bs;
>      BlockBackend *top;
>      BlockBackend *base;
> +    BlockDriverState *base_bs;
>      BlockdevOnError on_error;
>      int base_flags;
>      char *backing_file_str;
> @@ -68,61 +69,65 @@ static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
>      return 0;
>  }
>  
> -static void commit_exit(Job *job)
> +static int commit_prepare(Job *job)
>  {
>      CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
> -    BlockJob *bjob = &s->common;
> -    BlockDriverState *top = blk_bs(s->top);
> -    BlockDriverState *base = blk_bs(s->base);
> -    BlockDriverState *commit_top_bs = s->commit_top_bs;
> -    bool remove_commit_top_bs = false;
> -
> -    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
> -    bdrv_ref(top);
> -    bdrv_ref(commit_top_bs);
>  
>      /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
>       * the normal backing chain can be restored. */
>      blk_unref(s->base);
> +    s->base = NULL;
>  
> -    if (!job_is_cancelled(job) && job->ret == 0) {
> -        /* success */
> -        job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
> -                                          s->backing_file_str);
> -    } else {
> -        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
> -         * after the failed/cancelled commit job is gone? If we already wrote
> -         * something to base, the intermediate images aren't valid any more. */
> -        remove_commit_top_bs = true;
> +    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
> +                                  s->backing_file_str);
> +}

If we can go from prepare->abort->clean, then that means to me that every
failure case of .prepare() can be resolved without permanent changes / data
loss.  Is this necessarily the case?

From bdrv_drop_intermediate():

    QLIST_FOREACH_SAFE(c, &top->parents, next_parent, next) {
        /* Check whether we are allowed to switch c from top to base */
        GSList *ignore_children = g_slist_prepend(NULL, c);
        bdrv_check_update_perm(base, NULL, c->perm, c->shared_perm,
                               ignore_children, &local_err);
        g_slist_free(ignore_children);
        if (local_err) {
            ret = -EPERM;
            error_report_err(local_err);
            goto exit;
        }

        /* If so, update the backing file path in the image file */
        if (c->role->update_filename) {
            ret = c->role->update_filename(c, base, backing_file_str,
                                           &local_err);
            if (ret < 0) {
                bdrv_abort_perm_update(base);
                error_report_err(local_err);
                goto exit;
            }
        }

        [...]
     }

We could fail here but still have modified an image file's backing filename,
right?

Or am I incorrect about the intention here, that abort() can always be clean?

-Jeff

> +
> +static void commit_abort(Job *job)
> +{
> +    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
> +    BlockDriverState *top_bs = blk_bs(s->top);
> +
> +    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
> +    bdrv_ref(top_bs);
> +    bdrv_ref(s->commit_top_bs);
> +
> +    if (s->base) {
> +        blk_unref(s->base);
>      }
>  
> +    /* free the blockers on the intermediate nodes so that bdrv_replace_nodes
> +     * can succeed */
> +    block_job_remove_all_bdrv(&s->common);
> +
> +    /* If bdrv_drop_intermediate() failed (or was not invoked), remove the
> +     * commit filter driver from the backing chain now. Do this as the final
> +     * step so that the 'consistent read' permission can be granted.
> +     *
> +     * XXX Can (or should) we somehow keep 'consistent read' blocked even
> +     * after the failed/cancelled commit job is gone? If we already wrote
> +     * something to base, the intermediate images aren't valid any more. */
> +    bdrv_child_try_set_perm(s->commit_top_bs->backing, 0, BLK_PERM_ALL,
> +                            &error_abort);
> +    bdrv_replace_node(s->commit_top_bs, backing_bs(s->commit_top_bs),
> +                      &error_abort);
> +
> +    bdrv_unref(s->commit_top_bs);
> +    bdrv_unref(top_bs);
> +}
> +
> +static void commit_clean(Job *job)
> +{
> +    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
> +
>      /* restore base open flags here if appropriate (e.g., change the base back
>       * to r/o). These reopens do not need to be atomic, since we won't abort
>       * even on failure here */
> -    if (s->base_flags != bdrv_get_flags(base)) {
> -        bdrv_reopen(base, s->base_flags, NULL);
> +    if (s->base_flags != bdrv_get_flags(s->base_bs)) {
> +        bdrv_reopen(s->base_bs, s->base_flags, NULL);
>      }
> +
>      g_free(s->backing_file_str);
>      blk_unref(s->top);
> -
> -    /* If there is more than one reference to the job (e.g. if called from
> -     * job_finish_sync()), job_completed() won't free it and therefore the
> -     * blockers on the intermediate nodes remain. This would cause
> -     * bdrv_set_backing_hd() to fail. */
> -    block_job_remove_all_bdrv(bjob);
> -
> -    /* If bdrv_drop_intermediate() didn't already do that, remove the commit
> -     * filter driver from the backing chain. Do this as the final step so that
> -     * the 'consistent read' permission can be granted.  */
> -    if (remove_commit_top_bs) {
> -        bdrv_child_try_set_perm(commit_top_bs->backing, 0, BLK_PERM_ALL,
> -                                &error_abort);
> -        bdrv_replace_node(commit_top_bs, backing_bs(commit_top_bs),
> -                          &error_abort);
> -    }
> -
> -    bdrv_unref(commit_top_bs);
> -    bdrv_unref(top);
>  }
>  
>  static int coroutine_fn commit_run(Job *job, Error **errp)
> @@ -211,7 +216,9 @@ static const BlockJobDriver commit_job_driver = {
>          .user_resume   = block_job_user_resume,
>          .drain         = block_job_drain,
>          .run           = commit_run,
> -        .exit          = commit_exit,
> +        .prepare       = commit_prepare,
> +        .abort         = commit_abort,
> +        .clean         = commit_clean
>      },
>  };
>  
> @@ -345,6 +352,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
>      if (ret < 0) {
>          goto fail;
>      }
> +    s->base_bs = base;
>  
>      /* Required permissions are already taken with block_job_add_bdrv() */
>      s->top = blk_new(0, BLK_PERM_ALL);
> -- 
> 2.14.4
>
John Snow Sept. 4, 2018, 8:32 p.m. UTC | #2
On 09/04/2018 02:46 PM, Jeff Cody wrote:
> On Tue, Sep 04, 2018 at 01:09:19PM -0400, John Snow wrote:
>> Use the component callbacks; prepare, abort, and clean.
>>
>> NB: prepare is only called when the job has not yet failed;
>> and abort can be called after prepare.
>>
>> complete -> prepare -> abort -> clean
>> complete -> abort -> clean
>>
>> Signed-off-by: John Snow <jsnow@redhat.com>
>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>> ---
>>  block/commit.c | 90 ++++++++++++++++++++++++++++++++--------------------------
>>  1 file changed, 49 insertions(+), 41 deletions(-)
>>
>> diff --git a/block/commit.c b/block/commit.c
>> index b6e8969877..eb3941e545 100644
>> --- a/block/commit.c
>> +++ b/block/commit.c
>> @@ -36,6 +36,7 @@ typedef struct CommitBlockJob {
>>      BlockDriverState *commit_top_bs;
>>      BlockBackend *top;
>>      BlockBackend *base;
>> +    BlockDriverState *base_bs;
>>      BlockdevOnError on_error;
>>      int base_flags;
>>      char *backing_file_str;
>> @@ -68,61 +69,65 @@ static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
>>      return 0;
>>  }
>>  
>> -static void commit_exit(Job *job)
>> +static int commit_prepare(Job *job)
>>  {
>>      CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
>> -    BlockJob *bjob = &s->common;
>> -    BlockDriverState *top = blk_bs(s->top);
>> -    BlockDriverState *base = blk_bs(s->base);
>> -    BlockDriverState *commit_top_bs = s->commit_top_bs;
>> -    bool remove_commit_top_bs = false;
>> -
>> -    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
>> -    bdrv_ref(top);
>> -    bdrv_ref(commit_top_bs);
>>  
>>      /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
>>       * the normal backing chain can be restored. */
>>      blk_unref(s->base);
>> +    s->base = NULL;
>>  
>> -    if (!job_is_cancelled(job) && job->ret == 0) {
>> -        /* success */
>> -        job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
>> -                                          s->backing_file_str);
>> -    } else {
>> -        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
>> -         * after the failed/cancelled commit job is gone? If we already wrote
>> -         * something to base, the intermediate images aren't valid any more. */
>> -        remove_commit_top_bs = true;
>> +    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
>> +                                  s->backing_file_str);
>> +}
> 
> If we can go from prepare->abort->clean, then that means to me that every
> failure case of .prepare() can be resolved without permanent changes / data
> loss.  Is this necessarily the case?
> 

That'd be a requisite to make the job a transaction, but commit, mirror
and stream are not currently transactionable.

The way commit already works, for example, can leave the base and
intermediate images as unusable as standalone images. This refactoring
will not change that alone.

So it's not necessarily a problem, but it's something that would need to
be fixed if we ever wanted transaction support.

However, in talking on IRC we did realize that this patch does change
behavior...

Before:

If bdrv_drop_intermediate fails, we store the retcode but continue
cleaning up as if it didn't fail. i.e., we don't remove the commit job's
installed top_bs node.

After:

If bdrv_drop_intermediate fails, we return the failure retcode and
.abort gets called as a result, i.e. we will remove the commit job's
installed top_bs node in favor of the original top_bs node.

I think this behavior is an improvement, however it raises a question
about the nature of failures in bdrv_drop_intermediate.

If this function fails without making any changes, the new commit
behavior is good. If it succeeds, we're also good. The problem is with
intermediate or partial successes.

If top has multiple parents (I think under normal circumstances it
won't, but I'm not absolutely sure) and it fails to update their backing
file references, it might partially succeed.

I think commit's usage here is correct, but I think we might need to
update bdrv_drop_intermediate to make it roll back changes if it
experiences a partial failure to give all-or-nothing semantics.

Thoughts?

> From bdrv_drop_intermediate():
> 
>     QLIST_FOREACH_SAFE(c, &top->parents, next_parent, next) {
>         /* Check whether we are allowed to switch c from top to base */
>         GSList *ignore_children = g_slist_prepend(NULL, c);
>         bdrv_check_update_perm(base, NULL, c->perm, c->shared_perm,
>                                ignore_children, &local_err);
>         g_slist_free(ignore_children);
>         if (local_err) {
>             ret = -EPERM;
>             error_report_err(local_err);
>             goto exit;
>         }
> 
>         /* If so, update the backing file path in the image file */
>         if (c->role->update_filename) {
>             ret = c->role->update_filename(c, base, backing_file_str,
>                                            &local_err);
>             if (ret < 0) {
>                 bdrv_abort_perm_update(base);
>                 error_report_err(local_err);
>                 goto exit;
>             }
>         }
> 
>         [...]
>      }
> 
> We could fail this but still have modified an image file backing filenames,
> right?
> 
> Or am I incorrect about the intention here, that abort() can always be clean?
> 
> -Jeff
> 
>> +
>> +static void commit_abort(Job *job)
>> +{
>> +    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
>> +    BlockDriverState *top_bs = blk_bs(s->top);
>> +
>> +    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
>> +    bdrv_ref(top_bs);
>> +    bdrv_ref(s->commit_top_bs);
>> +
>> +    if (s->base) {
>> +        blk_unref(s->base);
>>      }
>>  
>> +    /* free the blockers on the intermediate nodes so that bdrv_replace_nodes
>> +     * can succeed */
>> +    block_job_remove_all_bdrv(&s->common);
>> +
>> +    /* If bdrv_drop_intermediate() failed (or was not invoked), remove the
>> +     * commit filter driver from the backing chain now. Do this as the final
>> +     * step so that the 'consistent read' permission can be granted.
>> +     *
>> +     * XXX Can (or should) we somehow keep 'consistent read' blocked even
>> +     * after the failed/cancelled commit job is gone? If we already wrote
>> +     * something to base, the intermediate images aren't valid any more. */
>> +    bdrv_child_try_set_perm(s->commit_top_bs->backing, 0, BLK_PERM_ALL,
>> +                            &error_abort);
>> +    bdrv_replace_node(s->commit_top_bs, backing_bs(s->commit_top_bs),
>> +                      &error_abort);
>> +
>> +    bdrv_unref(s->commit_top_bs);
>> +    bdrv_unref(top_bs);
>> +}
>> +
>> +static void commit_clean(Job *job)
>> +{
>> +    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
>> +
>>      /* restore base open flags here if appropriate (e.g., change the base back
>>       * to r/o). These reopens do not need to be atomic, since we won't abort
>>       * even on failure here */
>> -    if (s->base_flags != bdrv_get_flags(base)) {
>> -        bdrv_reopen(base, s->base_flags, NULL);
>> +    if (s->base_flags != bdrv_get_flags(s->base_bs)) {
>> +        bdrv_reopen(s->base_bs, s->base_flags, NULL);
>>      }
>> +
>>      g_free(s->backing_file_str);
>>      blk_unref(s->top);
>> -
>> -    /* If there is more than one reference to the job (e.g. if called from
>> -     * job_finish_sync()), job_completed() won't free it and therefore the
>> -     * blockers on the intermediate nodes remain. This would cause
>> -     * bdrv_set_backing_hd() to fail. */
>> -    block_job_remove_all_bdrv(bjob);
>> -
>> -    /* If bdrv_drop_intermediate() didn't already do that, remove the commit
>> -     * filter driver from the backing chain. Do this as the final step so that
>> -     * the 'consistent read' permission can be granted.  */
>> -    if (remove_commit_top_bs) {
>> -        bdrv_child_try_set_perm(commit_top_bs->backing, 0, BLK_PERM_ALL,
>> -                                &error_abort);
>> -        bdrv_replace_node(commit_top_bs, backing_bs(commit_top_bs),
>> -                          &error_abort);
>> -    }
>> -
>> -    bdrv_unref(commit_top_bs);
>> -    bdrv_unref(top);
>>  }
>>  
>>  static int coroutine_fn commit_run(Job *job, Error **errp)
>> @@ -211,7 +216,9 @@ static const BlockJobDriver commit_job_driver = {
>>          .user_resume   = block_job_user_resume,
>>          .drain         = block_job_drain,
>>          .run           = commit_run,
>> -        .exit          = commit_exit,
>> +        .prepare       = commit_prepare,
>> +        .abort         = commit_abort,
>> +        .clean         = commit_clean
>>      },
>>  };
>>  
>> @@ -345,6 +352,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
>>      if (ret < 0) {
>>          goto fail;
>>      }
>> +    s->base_bs = base;
>>  
>>      /* Required permissions are already taken with block_job_add_bdrv() */
>>      s->top = blk_new(0, BLK_PERM_ALL);
>> -- 
>> 2.14.4
>>
Max Reitz Sept. 5, 2018, 10:27 a.m. UTC | #3
On 2018-09-04 22:32, John Snow wrote:
> 
> 
> On 09/04/2018 02:46 PM, Jeff Cody wrote:
>> On Tue, Sep 04, 2018 at 01:09:19PM -0400, John Snow wrote:
>>> Use the component callbacks; prepare, abort, and clean.
>>>
>>> NB: prepare is only called when the job has not yet failed;
>>> and abort can be called after prepare.
>>>
>>> complete -> prepare -> abort -> clean
>>> complete -> abort -> clean
>>>
>>> Signed-off-by: John Snow <jsnow@redhat.com>
>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>> ---
>>>  block/commit.c | 90 ++++++++++++++++++++++++++++++++--------------------------
>>>  1 file changed, 49 insertions(+), 41 deletions(-)
>>>
>>> diff --git a/block/commit.c b/block/commit.c
>>> index b6e8969877..eb3941e545 100644
>>> --- a/block/commit.c
>>> +++ b/block/commit.c
>>> @@ -36,6 +36,7 @@ typedef struct CommitBlockJob {
>>>      BlockDriverState *commit_top_bs;
>>>      BlockBackend *top;
>>>      BlockBackend *base;
>>> +    BlockDriverState *base_bs;
>>>      BlockdevOnError on_error;
>>>      int base_flags;
>>>      char *backing_file_str;
>>> @@ -68,61 +69,65 @@ static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
>>>      return 0;
>>>  }
>>>  
>>> -static void commit_exit(Job *job)
>>> +static int commit_prepare(Job *job)
>>>  {
>>>      CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
>>> -    BlockJob *bjob = &s->common;
>>> -    BlockDriverState *top = blk_bs(s->top);
>>> -    BlockDriverState *base = blk_bs(s->base);
>>> -    BlockDriverState *commit_top_bs = s->commit_top_bs;
>>> -    bool remove_commit_top_bs = false;
>>> -
>>> -    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
>>> -    bdrv_ref(top);
>>> -    bdrv_ref(commit_top_bs);
>>>  
>>>      /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
>>>       * the normal backing chain can be restored. */
>>>      blk_unref(s->base);
>>> +    s->base = NULL;
>>>  
>>> -    if (!job_is_cancelled(job) && job->ret == 0) {
>>> -        /* success */
>>> -        job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
>>> -                                          s->backing_file_str);
>>> -    } else {
>>> -        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
>>> -         * after the failed/cancelled commit job is gone? If we already wrote
>>> -         * something to base, the intermediate images aren't valid any more. */
>>> -        remove_commit_top_bs = true;
>>> +    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
>>> +                                  s->backing_file_str);
>>> +}
>>
>> If we can go from prepare->abort->clean, then that means to me that every
>> failure case of .prepare() can be resolved without permanent changes / data
>> loss.  Is this necessarily the case?
>>
> 
> That'd be a requisite to make the job a transaction, but commit, mirror
> and stream are not currently transactionable.

Is that already documented anywhere?

(Otherwise I'd be afraid of us forgetting in like a year, asking "Why
isn't this a transaction already?", just making it one, and then
remembering half a year later.)

> The way commit already works, for example, can leave the base and
> intermediate images as unusable as standalone images. This refactoring
> will not change that alone.
> 
> So it's not necessarily a problem, but it's something that would need to
> be fixed if we ever wanted transaction support.
> 
> However, in talking on IRC we did realize that this patch does change
> behavior...
> 
> Before:
> 
> If bdrv_drop_intermediate fails, we store the retcode but continue
> cleaning up as if it didn't fail. i.e., we don't remove the commit job's
> installed top_bs node.
> 
> After:
> 
> If bdrv_drop_intermediate fails, we return the failure retcode and
> .abort gets called as a result, i.e. we will remove the commit job's
> installed top_bs node in favor of the original top_bs node.
> 
> I think this behavior is an improvement,

I agree.

> however it raises a question
> about the nature of failures in bdrv_drop_intermediate.
> 
> If this function fails without making any changes, the new commit
> behavior is good. If it succeeds, we're also good. The problem is with
> intermediate or partial successes.
> 
> If top has multiple parents (I think under normal circumstances it
> won't, but I'm not absolutely sure) and it fails to update their backing
> file references, it might partially succeed.
> 
> I think commit's usage here is correct, but I think we might need to
> update bdrv_drop_intermediate to make it roll back changes if it
> experiences a partial failure to give all-or-nothing semantics.

Sure, that would be good.

> Thoughts?

We could start by calling bdrv_check_update_perm() on all parents before
doing any changes.  Then the roll back would consist only of invoking
bdrv_abort_perm_update() and in theory reverting the
c->update_filename() changes.

In practice...  How do we want to revert c->update_filename()?  There
currently is no way of getting the old value.  (And just using the old
child's filename may well be wrong, because the old child might not be
the one referenced by the image header.)

I have three ideas:
1) We could introduce a way of getting the old filename the parent has,
so we can restore it.

2) We could make .update_filename() kind of transactionable (seems like
overkill, but it would be easier in practice, I think).

3) We basically ignore .update_filename() errors.  We'd still return
them, but we don't abort the graph change operation.  So after
bdrv_drop_intermediate() is done, the graph has been changed
successfully, or it hasn't changed at all -- whether the filename updates
all went through, that's a different story.

#3 would be the simplest solution.  It's a bit stupid, but it would work
for most problems, I think; at least the callers would know that the
graph is in exactly one of two well-defined states.

Max
Kevin Wolf Sept. 5, 2018, 10:49 a.m. UTC | #4
Am 05.09.2018 um 12:27 hat Max Reitz geschrieben:
> On 2018-09-04 22:32, John Snow wrote:
> > 
> > 
> > On 09/04/2018 02:46 PM, Jeff Cody wrote:
> >> On Tue, Sep 04, 2018 at 01:09:19PM -0400, John Snow wrote:
> >>> Use the component callbacks; prepare, abort, and clean.
> >>>
> >>> NB: prepare is only called when the job has not yet failed;
> >>> and abort can be called after prepare.
> >>>
> >>> complete -> prepare -> abort -> clean
> >>> complete -> abort -> clean
> >>>
> >>> Signed-off-by: John Snow <jsnow@redhat.com>
> >>> Reviewed-by: Max Reitz <mreitz@redhat.com>
> >>> ---
> >>>  block/commit.c | 90 ++++++++++++++++++++++++++++++++--------------------------
> >>>  1 file changed, 49 insertions(+), 41 deletions(-)
> >>>
> >>> diff --git a/block/commit.c b/block/commit.c
> >>> index b6e8969877..eb3941e545 100644
> >>> --- a/block/commit.c
> >>> +++ b/block/commit.c
> >>> @@ -36,6 +36,7 @@ typedef struct CommitBlockJob {
> >>>      BlockDriverState *commit_top_bs;
> >>>      BlockBackend *top;
> >>>      BlockBackend *base;
> >>> +    BlockDriverState *base_bs;
> >>>      BlockdevOnError on_error;
> >>>      int base_flags;
> >>>      char *backing_file_str;
> >>> @@ -68,61 +69,65 @@ static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
> >>>      return 0;
> >>>  }
> >>>  
> >>> -static void commit_exit(Job *job)
> >>> +static int commit_prepare(Job *job)
> >>>  {
> >>>      CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
> >>> -    BlockJob *bjob = &s->common;
> >>> -    BlockDriverState *top = blk_bs(s->top);
> >>> -    BlockDriverState *base = blk_bs(s->base);
> >>> -    BlockDriverState *commit_top_bs = s->commit_top_bs;
> >>> -    bool remove_commit_top_bs = false;
> >>> -
> >>> -    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
> >>> -    bdrv_ref(top);
> >>> -    bdrv_ref(commit_top_bs);
> >>>  
> >>>      /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
> >>>       * the normal backing chain can be restored. */
> >>>      blk_unref(s->base);
> >>> +    s->base = NULL;
> >>>  
> >>> -    if (!job_is_cancelled(job) && job->ret == 0) {
> >>> -        /* success */
> >>> -        job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
> >>> -                                          s->backing_file_str);
> >>> -    } else {
> >>> -        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
> >>> -         * after the failed/cancelled commit job is gone? If we already wrote
> >>> -         * something to base, the intermediate images aren't valid any more. */
> >>> -        remove_commit_top_bs = true;
> >>> +    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
> >>> +                                  s->backing_file_str);
> >>> +}
> >>
> >> If we can go from prepare->abort->clean, then that means to me that every
> >> failure case of .prepare() can be resolved without permanent changes / data
> >> loss.  Is this necessarily the case?
> >>
> > 
> > That'd be a requisite to make the job a transaction, but commit, mirror
> > and stream are not currently transactionable.
> 
> Is that already documented anywhere?
> 
> (Otherwise I'd be afraid of us forgetting in like a year, asking "Why
> isn't this a transaction already?", just making it one, and then
> remembering half a year later.)
> 
> > The way commit already works, for example, can leave the base and
> > intermediate images as unusable as standalone images. This refactoring
> > will not change that alone.
> > 
> > So it's not necessarily a problem, but it's something that would need to
> > be fixed if we ever wanted transaction support.
> > 
> > However, in talking on IRC we did realize that this patch does change
> > behavior...
> > 
> > Before:
> > 
> > If bdrv_drop_intermediate fails, we store the retcode but continue
> > cleaning up as if it didn't fail. i.e., we don't remove the commit job's
> > installed top_bs node.
> > 
> > After:
> > 
> > If bdrv_drop_intermediate fails, we return the failure retcode and
> > .abort gets called as a result, i.e. we will remove the commit job's
> > installed top_bs node in favor of the original top_bs node.
> > 
> > I think this behavior is an improvement,
> 
> I agree.
> 
> > however it raises a question
> > about the nature of failures in bdrv_drop_intermediate.
> > 
> > If this function fails without making any changes, the new commit
> > behavior is good. If it succeeds, we're also good. The problem is with
> > intermediate or partial successes.
> > 
> > If top has multiple parents (I think under normal circumstances it
> > won't, but I'm not absolutely sure) and it fails to update their backing
> > file references, it might partially succeed.
> > 
> > I think commit's usage here is correct, but I think we might need to
> > update bdrv_drop_intermediate to make it roll back changes if it
> > experiences a partial failure to give all-or-nothing semantics.
> 
> Sure, that would be good.
> 
> > Thoughts?
> 
> We could start by calling bdrv_check_update_perm() on all parents before
> doing any changes.  Then the roll back would consist only of invoking
> bdrv_abort_perm_update() and in theory reverting the
> c->update_filename() changes.
> 
> In practice...  How do we want to revert c->update_filename()?  There
> currently is no way of getting the old value.  (And just using the old
> child's filename may well be wrong, because the old child might not be
> the one referenced by the image header.)
> 
> I have three ideas:
> 1) We could introduce a way of getting the old filename the parent has,
> so we can restore it.
> 
> 2) We could make .update_filename() kind of transactionable (seems like
> overkill, but it would be easier in practice, I think).
> 
> 3) We basically ignore .update_filename() errors.  We'd still return
> them, but we don't abort the graph change operation.  So after
> bdrv_drop_intermediate() is done, the graph has been changed
> successfully, or it hasn't changed at all -- whether the filename updates
> all went through, that's a different story.
> 
> #3 would be the simplest solution.  It's a bit stupid, but it would work
> for most problems, I think; at least the callers would know that the
> graph is in exactly one of two well-defined states.

Option 2 sounds nice in theory, but how do you make things
transactionable when they require writing to an image? Either you make
the change or you don't. If you make it and later notice that you
shouldn't have done so, you can try to write the old value back, but
that can fail, too.

So making .update_filename() transactionable is probably not feasible.
The choice that is left is whether we update it in .prepare (and
continue with .abort if it fails) or in .commit (and ignore errors).

Kevin
Max Reitz Sept. 5, 2018, 11:37 a.m. UTC | #5
On 2018-09-05 12:49, Kevin Wolf wrote:
> Am 05.09.2018 um 12:27 hat Max Reitz geschrieben:
>> On 2018-09-04 22:32, John Snow wrote:
>>>
>>>
>>> On 09/04/2018 02:46 PM, Jeff Cody wrote:
>>>> On Tue, Sep 04, 2018 at 01:09:19PM -0400, John Snow wrote:
>>>>> Use the component callbacks; prepare, abort, and clean.
>>>>>
>>>>> NB: prepare is only called when the job has not yet failed;
>>>>> and abort can be called after prepare.
>>>>>
>>>>> complete -> prepare -> abort -> clean
>>>>> complete -> abort -> clean
>>>>>
>>>>> Signed-off-by: John Snow <jsnow@redhat.com>
>>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>>>> ---
>>>>>  block/commit.c | 90 ++++++++++++++++++++++++++++++++--------------------------
>>>>>  1 file changed, 49 insertions(+), 41 deletions(-)
>>>>>
>>>>> diff --git a/block/commit.c b/block/commit.c
>>>>> index b6e8969877..eb3941e545 100644
>>>>> --- a/block/commit.c
>>>>> +++ b/block/commit.c
>>>>> @@ -36,6 +36,7 @@ typedef struct CommitBlockJob {
>>>>>      BlockDriverState *commit_top_bs;
>>>>>      BlockBackend *top;
>>>>>      BlockBackend *base;
>>>>> +    BlockDriverState *base_bs;
>>>>>      BlockdevOnError on_error;
>>>>>      int base_flags;
>>>>>      char *backing_file_str;
>>>>> @@ -68,61 +69,65 @@ static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
>>>>>      return 0;
>>>>>  }
>>>>>  
>>>>> -static void commit_exit(Job *job)
>>>>> +static int commit_prepare(Job *job)
>>>>>  {
>>>>>      CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
>>>>> -    BlockJob *bjob = &s->common;
>>>>> -    BlockDriverState *top = blk_bs(s->top);
>>>>> -    BlockDriverState *base = blk_bs(s->base);
>>>>> -    BlockDriverState *commit_top_bs = s->commit_top_bs;
>>>>> -    bool remove_commit_top_bs = false;
>>>>> -
>>>>> -    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
>>>>> -    bdrv_ref(top);
>>>>> -    bdrv_ref(commit_top_bs);
>>>>>  
>>>>>      /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
>>>>>       * the normal backing chain can be restored. */
>>>>>      blk_unref(s->base);
>>>>> +    s->base = NULL;
>>>>>  
>>>>> -    if (!job_is_cancelled(job) && job->ret == 0) {
>>>>> -        /* success */
>>>>> -        job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
>>>>> -                                          s->backing_file_str);
>>>>> -    } else {
>>>>> -        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
>>>>> -         * after the failed/cancelled commit job is gone? If we already wrote
>>>>> -         * something to base, the intermediate images aren't valid any more. */
>>>>> -        remove_commit_top_bs = true;
>>>>> +    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
>>>>> +                                  s->backing_file_str);
>>>>> +}
>>>>
>>>> If we can go from prepare->abort->clean, then that means to me that every
>>>> failure case of .prepare() can be resolved without permanent changes / data
>>>> loss.  Is this necessarily the case?
>>>>
>>>
>>> That'd be a requisite to make the job a transaction, but commit, mirror
>>> and stream are not currently transactionable.
>>
>> Is that already documented anywhere?
>>
>> (Otherwise I'd be afraid of us forgetting in like a year, asking "Why
>> isn't this a transaction already?", just making it one, and then
>> remembering half a year later.)
>>
>>> The way commit already works, for example, can leave the base and
>>> intermediate images as unusable as standalone images. This refactoring
>>> will not change that alone.
>>>
>>> So it's not necessarily a problem, but it's something that would need to
>>> be fixed if we ever wanted transaction support.
>>>
>>> However, in talking on IRC we did realize that this patch does change
>>> behavior...
>>>
>>> Before:
>>>
>>> If bdrv_drop_intermediate fails, we store the retcode but continue
>>> cleaning up as if it didn't fail. i.e., we don't remove the commit job's
>>> installed top_bs node.
>>>
>>> After:
>>>
>>> if bdrv_drop_intermediate fails, we return the failure retcode and
>>> .abort gets called as a result, i.e. we will remove the commit job's
>>> installed top_bs node in favor of the original top_bs node.
>>>
>>> I think this behavior is an improvement,
>>
>> I agree.
>>
>>> however it raises a question
>>> about the nature of failures in bdrv_drop_intermediate.
>>>
>>> If this function fails without making any changes, the new commit
>>> behavior is good. If it succeeds, we're also good. The problem is with
>>> intermediate or partial successes.
>>>
>>> If top has multiple parents (I think under normal circumstances it
>>> won't, but I'm not absolutely sure) and it fails to update their backing
>>> file references, it might partially succeed.
>>>
>>> I think commit's usage here is correct, but I think we might need to
>>> update bdrv_drop_intermediate to make it roll back changes if it
>>> experiences a partial failure to give all-or-nothing semantics.
>>
>> Sure, that would be good.
>>
>>> Thoughts?
>>
>> We could start by calling bdrv_check_update_perm() on all parents before
>> doing any changes.  Then the roll back would consist only of invoking
>> bdrv_abort_perm_update() and in theory reverting the
>> c->update_filename() changes.
>>
>> In practice...  How do we want to revert c->update_filename()?  There
>> currently is no way of getting the old value.  (And just using the old
>> child's filename may well be wrong, because the old child might not be
>> the one referenced by the image header.)
>>
>> I have three ideas:
>> 1) We could introduce a way of getting the old filename the parent has,
>> so we can restore it.
>>
>> 2) We could make .update_filename() kind of transactionable (seems like
>> overkill, but it would be easier in practice, I think).
>>
>> 3) We basically ignore .update_filename() errors.  We'd still return
>> them, but we don't abort the graph change operation.  So after
>> bdrv_drop_intermediate() is done, the graph has been changed
>> successfully, or it hasn't changed at all -- whether the filename updates
>> all went through, that's a different story.
>>
>> #3 would be the simplest solution.  It's a bit stupid, but it would work
>> for most problems, I think; at least the callers would know that the
>> graph is in exactly one of two well-defined states.
> 
> Option 2 sounds nice in theory, but how do you make things
> transactionable when they require writing to an image? Either you make
> the change or you don't. If you make it and later notice that you
> shouldn't have done so, you can try to write the old value back, but
> that can fail, too.

Yes, that's what I mean by "kind of".  It's just #1 with the code moved
somewhere else.

> So making .update_filename() transactionable is probably not feasible.
> The choice that is left is whether we update it in .prepare (and
> continue with .abort if it fails) or in .commit (and ignore errors).

My idea was "if we could update it one time, we can probably update it a
second time", so reverting in .abort() would usually work.  I know that
a "usually" is not a "definitely", so that's the "kind of" again.

So the abort could still throw an error which we would have to handle
like in #3 anyway, I suppose.

Max
Kevin Wolf Sept. 5, 2018, 11:53 a.m. UTC | #6
Am 05.09.2018 um 13:37 hat Max Reitz geschrieben:
> On 2018-09-05 12:49, Kevin Wolf wrote:
> > Am 05.09.2018 um 12:27 hat Max Reitz geschrieben:
> >> On 2018-09-04 22:32, John Snow wrote:
> >>>
> >>>
> >>> On 09/04/2018 02:46 PM, Jeff Cody wrote:
> >>>> On Tue, Sep 04, 2018 at 01:09:19PM -0400, John Snow wrote:
> >>>>> Use the component callbacks; prepare, abort, and clean.
> >>>>>
> >>>>> NB: prepare is only called when the job has not yet failed;
> >>>>> and abort can be called after prepare.
> >>>>>
> >>>>> complete -> prepare -> abort -> clean
> >>>>> complete -> abort -> clean
> >>>>>
> >>>>> Signed-off-by: John Snow <jsnow@redhat.com>
> >>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
> >>>>> ---
> >>>>>  block/commit.c | 90 ++++++++++++++++++++++++++++++++--------------------------
> >>>>>  1 file changed, 49 insertions(+), 41 deletions(-)
> >>>>>
> >>>>> diff --git a/block/commit.c b/block/commit.c
> >>>>> index b6e8969877..eb3941e545 100644
> >>>>> --- a/block/commit.c
> >>>>> +++ b/block/commit.c
> >>>>> @@ -36,6 +36,7 @@ typedef struct CommitBlockJob {
> >>>>>      BlockDriverState *commit_top_bs;
> >>>>>      BlockBackend *top;
> >>>>>      BlockBackend *base;
> >>>>> +    BlockDriverState *base_bs;
> >>>>>      BlockdevOnError on_error;
> >>>>>      int base_flags;
> >>>>>      char *backing_file_str;
> >>>>> @@ -68,61 +69,65 @@ static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
> >>>>>      return 0;
> >>>>>  }
> >>>>>  
> >>>>> -static void commit_exit(Job *job)
> >>>>> +static int commit_prepare(Job *job)
> >>>>>  {
> >>>>>      CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
> >>>>> -    BlockJob *bjob = &s->common;
> >>>>> -    BlockDriverState *top = blk_bs(s->top);
> >>>>> -    BlockDriverState *base = blk_bs(s->base);
> >>>>> -    BlockDriverState *commit_top_bs = s->commit_top_bs;
> >>>>> -    bool remove_commit_top_bs = false;
> >>>>> -
> >>>>> -    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
> >>>>> -    bdrv_ref(top);
> >>>>> -    bdrv_ref(commit_top_bs);
> >>>>>  
> >>>>>      /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
> >>>>>       * the normal backing chain can be restored. */
> >>>>>      blk_unref(s->base);
> >>>>> +    s->base = NULL;
> >>>>>  
> >>>>> -    if (!job_is_cancelled(job) && job->ret == 0) {
> >>>>> -        /* success */
> >>>>> -        job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
> >>>>> -                                          s->backing_file_str);
> >>>>> -    } else {
> >>>>> -        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
> >>>>> -         * after the failed/cancelled commit job is gone? If we already wrote
> >>>>> -         * something to base, the intermediate images aren't valid any more. */
> >>>>> -        remove_commit_top_bs = true;
> >>>>> +    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
> >>>>> +                                  s->backing_file_str);
> >>>>> +}
> >>>>
> >>>> If we can go from prepare->abort->clean, then that means to me that every
> >>>> failure case of .prepare() can be resolved without permanent changes / data
> >>>> loss.  Is this necessarily the case?
> >>>>
> >>>
> >>> That'd be a requisite to make the job a transaction, but commit, mirror
> >>> and stream are not currently transactionable.
> >>
> >> Is that already documented anywhere?
> >>
> >> (Otherwise I'd be afraid of us forgetting in like a year, asking "Why
> >> isn't this a transaction already?", just making it one, and then
> >> remembering half a year later.)
> >>
> >>> The way commit already works, for example, can leave the base and
> >>> intermediate images as unusable as standalone images. This refactoring
> >>> will not change that alone.
> >>>
> >>> So it's not necessarily a problem, but it's something that would need to
> >>> be fixed if we ever wanted transaction support.
> >>>
> >>> However, in talking on IRC we did realize that this patch does change
> >>> behavior...
> >>>
> >>> Before:
> >>>
> >>> If bdrv_drop_intermediate fails, we store the retcode but continue
> >>> cleaning up as if it didn't fail. i.e., we don't remove the commit job's
> >>> installed top_bs node.
> >>>
> >>> After:
> >>>
> >>> if bdrv_drop_intermediate fails, we return the failure retcode and
> >>> .abort gets called as a result, i.e. we will remove the commit job's
> >>> installed top_bs node in favor of the original top_bs node.
> >>>
> >>> I think this behavior is an improvement,
> >>
> >> I agree.
> >>
> >>> however it raises a question
> >>> about the nature of failures in bdrv_drop_intermediate.
> >>>
> >>> If this function fails without making any changes, the new commit
> >>> behavior is good. If it succeeds, we're also good. The problem is with
> >>> intermediate or partial successes.
> >>>
> >>> If top has multiple parents (I think under normal circumstances it
> >>> won't, but I'm not absolutely sure) and it fails to update their backing
> >>> file references, it might partially succeed.
> >>>
> >>> I think commit's usage here is correct, but I think we might need to
> >>> update bdrv_drop_intermediate to make it roll back changes if it
> >>> experiences a partial failure to give all-or-nothing semantics.
> >>
> >> Sure, that would be good.
> >>
> >>> Thoughts?
> >>
> >> We could start by calling bdrv_check_update_perm() on all parents before
> >> doing any changes.  Then the roll back would consist only of invoking
> >> bdrv_abort_perm_update() and in theory reverting the
> >> c->update_filename() changes.
> >>
> >> In practice...  How do we want to revert c->update_filename()?  There
> >> currently is no way of getting the old value.  (And just using the old
> >> child's filename may well be wrong, because the old child might not be
> >> the one referenced by the image header.)
> >>
> >> I have three ideas:
> >> 1) We could introduce a way of getting the old filename the parent has,
> >> so we can restore it.
> >>
> >> 2) We could make .update_filename() kind of transactionable (seems like
> >> overkill, but it would be easier in practice, I think).
> >>
> >> 3) We basically ignore .update_filename() errors.  We'd still return
> >> them, but we don't abort the graph change operation.  So after
> >> bdrv_drop_intermediate() is done, the graph has been changed
> >> successfully, or it hasn't changed at all -- whether the filename updates
> >> all went through, that's a different story.
> >>
> >> #3 would be the simplest solution.  It's a bit stupid, but it would work
> >> for most problems, I think; at least the callers would know that the
> >> graph is in exactly one of two well-defined states.
> > 
> > Option 2 sounds nice in theory, but how do you make things
> > transactionable when they require writing to an image? Either you make
> > the change or you don't. If you make it and later notice that you
> > shouldn't have done so, you can try to write the old value back, but
> > that can fail, too.
> 
> Yes, that's what I mean by "kind of".  It's just #1 with the code moved
> somewhere else.
> 
> > So making .update_filename() transactionable is probably not feasible.
> > The choice that is left is whether we update it in .prepare (and
> > continue with .abort if it fails) or in .commit (and ignore errors).
> 
> My idea was "if we could update it one time, we can probably update it a
> second time", so reverting in .abort() would usually work.  I know that
> a "usually" is not a "definitely", so that's the "kind of" again.
> 
> So the abort could still throw an error which we would have to handle
> like in #3 anyway, I suppose.

The question is whether it's better to have the new filename on-disk
when the transaction aborted and the graph stays as it was (if reverting
in .abort fails), or to have the old filename there when the transaction
succeeded.

Both are confusing, but confusing error cases are a bit more reasonable
than confusing success cases, so I tend to agree that updating in
.prepare and reverting in .abort is probably the best we can do.

Kevin
Max Reitz Sept. 5, 2018, 12:25 p.m. UTC | #7
On 2018-09-05 13:53, Kevin Wolf wrote:
> Am 05.09.2018 um 13:37 hat Max Reitz geschrieben:
>> On 2018-09-05 12:49, Kevin Wolf wrote:
>>> Am 05.09.2018 um 12:27 hat Max Reitz geschrieben:
>>>> On 2018-09-04 22:32, John Snow wrote:
>>>>>
>>>>>
>>>>> On 09/04/2018 02:46 PM, Jeff Cody wrote:
>>>>>> On Tue, Sep 04, 2018 at 01:09:19PM -0400, John Snow wrote:
>>>>>>> Use the component callbacks; prepare, abort, and clean.
>>>>>>>
>>>>>>> NB: prepare is only called when the job has not yet failed;
>>>>>>> and abort can be called after prepare.
>>>>>>>
>>>>>>> complete -> prepare -> abort -> clean
>>>>>>> complete -> abort -> clean
>>>>>>>
>>>>>>> Signed-off-by: John Snow <jsnow@redhat.com>
>>>>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>>>>>> ---
>>>>>>>  block/commit.c | 90 ++++++++++++++++++++++++++++++++--------------------------
>>>>>>>  1 file changed, 49 insertions(+), 41 deletions(-)
>>>>>>>
>>>>>>> diff --git a/block/commit.c b/block/commit.c
>>>>>>> index b6e8969877..eb3941e545 100644
>>>>>>> --- a/block/commit.c
>>>>>>> +++ b/block/commit.c
>>>>>>> @@ -36,6 +36,7 @@ typedef struct CommitBlockJob {
>>>>>>>      BlockDriverState *commit_top_bs;
>>>>>>>      BlockBackend *top;
>>>>>>>      BlockBackend *base;
>>>>>>> +    BlockDriverState *base_bs;
>>>>>>>      BlockdevOnError on_error;
>>>>>>>      int base_flags;
>>>>>>>      char *backing_file_str;
>>>>>>> @@ -68,61 +69,65 @@ static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
>>>>>>>      return 0;
>>>>>>>  }
>>>>>>>  
>>>>>>> -static void commit_exit(Job *job)
>>>>>>> +static int commit_prepare(Job *job)
>>>>>>>  {
>>>>>>>      CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
>>>>>>> -    BlockJob *bjob = &s->common;
>>>>>>> -    BlockDriverState *top = blk_bs(s->top);
>>>>>>> -    BlockDriverState *base = blk_bs(s->base);
>>>>>>> -    BlockDriverState *commit_top_bs = s->commit_top_bs;
>>>>>>> -    bool remove_commit_top_bs = false;
>>>>>>> -
>>>>>>> -    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
>>>>>>> -    bdrv_ref(top);
>>>>>>> -    bdrv_ref(commit_top_bs);
>>>>>>>  
>>>>>>>      /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
>>>>>>>       * the normal backing chain can be restored. */
>>>>>>>      blk_unref(s->base);
>>>>>>> +    s->base = NULL;
>>>>>>>  
>>>>>>> -    if (!job_is_cancelled(job) && job->ret == 0) {
>>>>>>> -        /* success */
>>>>>>> -        job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
>>>>>>> -                                          s->backing_file_str);
>>>>>>> -    } else {
>>>>>>> -        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
>>>>>>> -         * after the failed/cancelled commit job is gone? If we already wrote
>>>>>>> -         * something to base, the intermediate images aren't valid any more. */
>>>>>>> -        remove_commit_top_bs = true;
>>>>>>> +    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
>>>>>>> +                                  s->backing_file_str);
>>>>>>> +}
>>>>>>
>>>>>> If we can go from prepare->abort->clean, then that means to me that every
>>>>>> failure case of .prepare() can be resolved without permanent changes / data
>>>>>> loss.  Is this necessarily the case?
>>>>>>
>>>>>
>>>>> That'd be a requisite to make the job a transaction, but commit, mirror
>>>>> and stream are not currently transactionable.
>>>>
>>>> Is that already documented anywhere?
>>>>
>>>> (Otherwise I'd be afraid of us forgetting in like a year, asking "Why
>>>> isn't this a transaction already?", just making it one, and then
>>>> remembering half a year later.)
>>>>
>>>>> The way commit already works, for example, can leave the base and
>>>>> intermediate images as unusable as standalone images. This refactoring
>>>>> will not change that alone.
>>>>>
>>>>> So it's not necessarily a problem, but it's something that would need to
>>>>> be fixed if we ever wanted transaction support.
>>>>>
>>>>> However, in talking on IRC we did realize that this patch does change
>>>>> behavior...
>>>>>
>>>>> Before:
>>>>>
>>>>> If bdrv_drop_intermediate fails, we store the retcode but continue
>>>>> cleaning up as if it didn't fail. i.e., we don't remove the commit job's
>>>>> installed top_bs node.
>>>>>
>>>>> After:
>>>>>
>>>>> if bdrv_drop_intermediate fails, we return the failure retcode and
>>>>> .abort gets called as a result, i.e. we will remove the commit job's
>>>>> installed top_bs node in favor of the original top_bs node.
>>>>>
>>>>> I think this behavior is an improvement,
>>>>
>>>> I agree.
>>>>
>>>>> however it raises a question
>>>>> about the nature of failures in bdrv_drop_intermediate.
>>>>>
>>>>> If this function fails without making any changes, the new commit
>>>>> behavior is good. If it succeeds, we're also good. The problem is with
>>>>> intermediate or partial successes.
>>>>>
>>>>> If top has multiple parents (I think under normal circumstances it
>>>>> won't, but I'm not absolutely sure) and it fails to update their backing
>>>>> file references, it might partially succeed.
>>>>>
>>>>> I think commit's usage here is correct, but I think we might need to
>>>>> update bdrv_drop_intermediate to make it roll back changes if it
>>>>> experiences a partial failure to give all-or-nothing semantics.
>>>>
>>>> Sure, that would be good.
>>>>
>>>>> Thoughts?
>>>>
>>>> We could start by calling bdrv_check_update_perm() on all parents before
>>>> doing any changes.  Then the roll back would consist only of invoking
>>>> bdrv_abort_perm_update() and in theory reverting the
>>>> c->update_filename() changes.
>>>>
>>>> In practice...  How do we want to revert c->update_filename()?  There
>>>> currently is no way of getting the old value.  (And just using the old
>>>> child's filename may well be wrong, because the old child might not be
>>>> the one referenced by the image header.)
>>>>
>>>> I have three ideas:
>>>> 1) We could introduce a way of getting the old filename the parent has,
>>>> so we can restore it.
>>>>
>>>> 2) We could make .update_filename() kind of transactionable (seems like
>>>> overkill, but it would be easier in practice, I think).
>>>>
>>>> 3) We basically ignore .update_filename() errors.  We'd still return
>>>> them, but we don't abort the graph change operation.  So after
>>>> bdrv_drop_intermediate() is done, the graph has been changed
>>>> successfully, or it hasn't changed at all -- whether the filename updates
>>>> all went through, that's a different story.
>>>>
>>>> #3 would be the simplest solution.  It's a bit stupid, but it would work
>>>> for most problems, I think; at least the callers would know that the
>>>> graph is in exactly one of two well-defined states.
>>>
>>> Option 2 sounds nice in theory, but how do you make things
>>> transactionable when they require writing to an image? Either you make
>>> the change or you don't. If you make it and later notice that you
>>> shouldn't have done so, you can try to write the old value back, but
>>> that can fail, too.
>>
>> Yes, that's what I mean by "kind of".  It's just #1 with the code moved
>> somewhere else.
>>
>>> So making .update_filename() transactionable is probably not feasible.
>>> The choice that is left is whether we update it in .prepare (and
>>> continue with .abort if it fails) or in .commit (and ignore errors).
>>
>> My idea was "if we could update it one time, we can probably update it a
>> second time", so reverting in .abort() would usually work.  I know that
>> a "usually" is not a "definitely", so that's the "kind of" again.
>>
>> So the abort could still throw an error which we would have to handle
>> like in #3 anyway, I suppose.
> 
> The question is whether it's better to have the new filename on-disk
> when the transaction aborted and the graph stays as it was (if reverting
> in .abort fails), or to have the old filename there when the transaction
> succeeded.

Hm.  Thanks for expressing it this way.  Both cases are bad.

In the former case, the user gets a pleasant surprise when restarting the
VM and seeing that the commit actually worked, despite having gotten an
error before (and that the graph stayed unchanged).
But the bad thing is that if any more graph manipulations are done on
the basis that the commit failed, things may break when the VM is
restarted.  Like, maybe the user decides to write into the overlay's
backing file (I don't know) and expects to see that data through the
overlay; all of it will be effectively lost when the VM is restarted.

In the latter case, we have basically the same problem.  When the VM is
restarted and all the images we had wanted to discard suddenly reappear,
that may break things in much the same way.
The VM restart surprise here is that the commit seemed to work, but then
you see that it actually kind of didn't?  (Because all the images
reappear.  Maybe you deleted them in the meantime, and then the VM won't
even want to restart.)

So especially the last part seems to swing the pendulum in the direction
of me preferring having the new filename when the transaction failed.

But still, I don't know whether that's really the only question.

I don't quite see how you want to do the update only in .commit(), and
still call it a transaction.  Do you want to do the bdrv_reopen() (for
bdrv_backing_update_filename()) in prepare, and then invoke
bdrv_backing_update_filename() in commit?  A bit better than nothing,
but it's not really a transaction.

As I said, the benefit of doing it in .prepare() and then rolling back
in .abort() would be the idea of "if I can update it once, I can
probably update it twice -- especially if I write a value to it that it
already had before".

Max

> Both are confusing, but confusing error cases are a bit more reasonable
> than confusing success cases, so I tend to agree that updating in
> .prepare and reverting in .abort is probably the best we can do.
> 
> Kevin
John Snow Sept. 5, 2018, 7:05 p.m. UTC | #8
On 09/05/2018 06:27 AM, Max Reitz wrote:
> On 2018-09-04 22:32, John Snow wrote:
>>
>>
>> On 09/04/2018 02:46 PM, Jeff Cody wrote:
>>> On Tue, Sep 04, 2018 at 01:09:19PM -0400, John Snow wrote:
>>>> Use the component callbacks; prepare, abort, and clean.
>>>>
>>>> NB: prepare is only called when the job has not yet failed;
>>>> and abort can be called after prepare.
>>>>
>>>> complete -> prepare -> abort -> clean
>>>> complete -> abort -> clean
>>>>
>>>> Signed-off-by: John Snow <jsnow@redhat.com>
>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>>> ---
>>>>  block/commit.c | 90 ++++++++++++++++++++++++++++++++--------------------------
>>>>  1 file changed, 49 insertions(+), 41 deletions(-)
>>>>
>>>> diff --git a/block/commit.c b/block/commit.c
>>>> index b6e8969877..eb3941e545 100644
>>>> --- a/block/commit.c
>>>> +++ b/block/commit.c
>>>> @@ -36,6 +36,7 @@ typedef struct CommitBlockJob {
>>>>      BlockDriverState *commit_top_bs;
>>>>      BlockBackend *top;
>>>>      BlockBackend *base;
>>>> +    BlockDriverState *base_bs;
>>>>      BlockdevOnError on_error;
>>>>      int base_flags;
>>>>      char *backing_file_str;
>>>> @@ -68,61 +69,65 @@ static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
>>>>      return 0;
>>>>  }
>>>>  
>>>> -static void commit_exit(Job *job)
>>>> +static int commit_prepare(Job *job)
>>>>  {
>>>>      CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
>>>> -    BlockJob *bjob = &s->common;
>>>> -    BlockDriverState *top = blk_bs(s->top);
>>>> -    BlockDriverState *base = blk_bs(s->base);
>>>> -    BlockDriverState *commit_top_bs = s->commit_top_bs;
>>>> -    bool remove_commit_top_bs = false;
>>>> -
>>>> -    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
>>>> -    bdrv_ref(top);
>>>> -    bdrv_ref(commit_top_bs);
>>>>  
>>>>      /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
>>>>       * the normal backing chain can be restored. */
>>>>      blk_unref(s->base);
>>>> +    s->base = NULL;
>>>>  
>>>> -    if (!job_is_cancelled(job) && job->ret == 0) {
>>>> -        /* success */
>>>> -        job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
>>>> -                                          s->backing_file_str);
>>>> -    } else {
>>>> -        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
>>>> -         * after the failed/cancelled commit job is gone? If we already wrote
>>>> -         * something to base, the intermediate images aren't valid any more. */
>>>> -        remove_commit_top_bs = true;
>>>> +    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
>>>> +                                  s->backing_file_str);
>>>> +}
>>>
>>> If we can go from prepare->abort->clean, then that means to me that every
>>> failure case of .prepare() can be resolved without permanent changes / data
>>> loss.  Is this necessarily the case?
>>>
>>
>> That'd be a requisite to make the job a transaction, but commit, mirror
>> and stream are not currently transactionable.
> 
> Is that already documented anywhere?
> 

Hm, no, not really.

I'm most inclined to document it near the action table because it would
be hard to miss if you went to add it.

I'll add this in an extra patch at the end in case you want to debate
the wording and/or location.

> (Otherwise I'd be afraid of us forgetting in like a year, asking "Why
> isn't this a transaction already?", just making it one, and then
> remembering half a year later.)
> 

No, it's a good point.

>> The way commit already works, for example, can leave the base and
>> intermediate images as unusable as standalone images. This refactoring
>> will not change that alone.
>>
>> So it's not necessarily a problem, but it's something that would need to
>> be fixed if we ever wanted transaction support.
>>
>> However, in talking on IRC we did realize that this patch does change
>> behavior...
>>
>> Before:
>>
>> If bdrv_drop_intermediate fails, we store the retcode but continue
>> cleaning up as if it didn't fail. i.e., we don't remove the commit job's
>> installed top_bs node.
>>
>> After:
>>
>> if bdrv_drop_intermediate fails, we return the failure retcode and
>> .abort gets called as a result, i.e. we will remove the commit job's
>> installed top_bs node in favor of the original top_bs node.
>>
>> I think this behavior is an improvement,
> 
> I agree.
> 

Based on this I will leave the stickier fix to a future patch... I will
add a FIXME note detailing the shortfall in this patch, but I am
asserting that the current behavior is /not worse/ than the old
behavior, while there is still a bug that we might need to fix in the
future.

>> however it raises a question
>> about the nature of failures in bdrv_drop_intermediate.
>>
>> If this function fails without making any changes, the new commit
>> behavior is good. If it succeeds, we're also good. The problem is with
>> intermediate or partial successes.
>>
>> If top has multiple parents (I think under normal circumstances it
>> won't, but I'm not absolutely sure) and it fails to update their backing
>> file references, it might partially succeed.
>>
>> I think commit's usage here is correct, but I think we might need to
>> update bdrv_drop_intermediate to make it roll back changes if it
>> experiences a partial failure to give all-or-nothing semantics.
> 
> Sure, that would be good.
> 
>> Thoughts?
> 
> We could start by calling bdrv_check_update_perm() on all parents before
> doing any changes.  Then the roll back would consist only of invoking
> bdrv_abort_perm_update() and in theory reverting the
> c->update_filename() changes.
> 
> In practice...  How do we want to revert c->update_filename()?  There
> currently is no way of getting the old value.  (And just using the old
> child's filename may well be wrong, because the old child might not be
> the one referenced by the image header.)
> 
> I have three ideas:
> 1) We could introduce a way of getting the old filename the parent has,
> so we can restore it.
> 
> 2) We could make .update_filename() kind of transactionable (seems like
> overkill, but it would be easier in practice, I think).
> 
> 3) We basically ignore .update_filename() errors.  We'd still return
> them, but we don't abort the graph change operation.  So after
> bdrv_drop_intermediate() is done, the graph has been changed
> successfully, or it hasn't changed at all -- whether the filename updates
> all went through, that's a different story.
> 
> #3 would be the simplest solution.  It's a bit stupid, but it would work
> for most problems, I think; at least the callers would know that the
> graph is in exactly one of two well-defined states.
> 
> Max
> 

I suppose another option would be to just update the function to return
which kind of error it had.

If it couldn't update *anything*, we can treat this as a hard failure.
If it managed to update *some things*, we can ignore the error for the
purposes of cleanup, but report the error. "Hey, something... happened.
The commit worked but the graph change failed. Please investigate."
And, of course, success is success.

I don't really need or want "transaction" semantics here, just the
ability to have well-defined ending states. The trinary outcome might be
enough -- it's perhaps not the end of the world to ask for manual
intervention after a failure. It seems conservatively the safest option.
diff mbox series

Patch

diff --git a/block/commit.c b/block/commit.c
index b6e8969877..eb3941e545 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -36,6 +36,7 @@  typedef struct CommitBlockJob {
     BlockDriverState *commit_top_bs;
     BlockBackend *top;
     BlockBackend *base;
+    BlockDriverState *base_bs;
     BlockdevOnError on_error;
     int base_flags;
     char *backing_file_str;
@@ -68,61 +69,65 @@  static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
     return 0;
 }
 
-static void commit_exit(Job *job)
+static int commit_prepare(Job *job)
 {
     CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
-    BlockJob *bjob = &s->common;
-    BlockDriverState *top = blk_bs(s->top);
-    BlockDriverState *base = blk_bs(s->base);
-    BlockDriverState *commit_top_bs = s->commit_top_bs;
-    bool remove_commit_top_bs = false;
-
-    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
-    bdrv_ref(top);
-    bdrv_ref(commit_top_bs);
 
     /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
      * the normal backing chain can be restored. */
     blk_unref(s->base);
+    s->base = NULL;
 
-    if (!job_is_cancelled(job) && job->ret == 0) {
-        /* success */
-        job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
-                                          s->backing_file_str);
-    } else {
-        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
-         * after the failed/cancelled commit job is gone? If we already wrote
-         * something to base, the intermediate images aren't valid any more. */
-        remove_commit_top_bs = true;
+    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
+                                  s->backing_file_str);
+}
+
+static void commit_abort(Job *job)
+{
+    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
+    BlockDriverState *top_bs = blk_bs(s->top);
+
+    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
+    bdrv_ref(top_bs);
+    bdrv_ref(s->commit_top_bs);
+
+    if (s->base) {
+        blk_unref(s->base);
     }
 
+    /* free the blockers on the intermediate nodes so that bdrv_replace_nodes
+     * can succeed */
+    block_job_remove_all_bdrv(&s->common);
+
+    /* If bdrv_drop_intermediate() failed (or was not invoked), remove the
+     * commit filter driver from the backing chain now. Do this as the final
+     * step so that the 'consistent read' permission can be granted.
+     *
+     * XXX Can (or should) we somehow keep 'consistent read' blocked even
+     * after the failed/cancelled commit job is gone? If we already wrote
+     * something to base, the intermediate images aren't valid any more. */
+    bdrv_child_try_set_perm(s->commit_top_bs->backing, 0, BLK_PERM_ALL,
+                            &error_abort);
+    bdrv_replace_node(s->commit_top_bs, backing_bs(s->commit_top_bs),
+                      &error_abort);
+
+    bdrv_unref(s->commit_top_bs);
+    bdrv_unref(top_bs);
+}
+
+static void commit_clean(Job *job)
+{
+    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
+
     /* restore base open flags here if appropriate (e.g., change the base back
      * to r/o). These reopens do not need to be atomic, since we won't abort
      * even on failure here */
-    if (s->base_flags != bdrv_get_flags(base)) {
-        bdrv_reopen(base, s->base_flags, NULL);
+    if (s->base_flags != bdrv_get_flags(s->base_bs)) {
+        bdrv_reopen(s->base_bs, s->base_flags, NULL);
     }
+
     g_free(s->backing_file_str);
     blk_unref(s->top);
-
-    /* If there is more than one reference to the job (e.g. if called from
-     * job_finish_sync()), job_completed() won't free it and therefore the
-     * blockers on the intermediate nodes remain. This would cause
-     * bdrv_set_backing_hd() to fail. */
-    block_job_remove_all_bdrv(bjob);
-
-    /* If bdrv_drop_intermediate() didn't already do that, remove the commit
-     * filter driver from the backing chain. Do this as the final step so that
-     * the 'consistent read' permission can be granted.  */
-    if (remove_commit_top_bs) {
-        bdrv_child_try_set_perm(commit_top_bs->backing, 0, BLK_PERM_ALL,
-                                &error_abort);
-        bdrv_replace_node(commit_top_bs, backing_bs(commit_top_bs),
-                          &error_abort);
-    }
-
-    bdrv_unref(commit_top_bs);
-    bdrv_unref(top);
 }
 
 static int coroutine_fn commit_run(Job *job, Error **errp)
@@ -211,7 +216,9 @@  static const BlockJobDriver commit_job_driver = {
         .user_resume   = block_job_user_resume,
         .drain         = block_job_drain,
         .run           = commit_run,
-        .exit          = commit_exit,
+        .prepare       = commit_prepare,
+        .abort         = commit_abort,
+        .clean         = commit_clean
     },
 };
 
@@ -345,6 +352,7 @@  void commit_start(const char *job_id, BlockDriverState *bs,
     if (ret < 0) {
         goto fail;
     }
+    s->base_bs = base;
 
     /* Required permissions are already taken with block_job_add_bdrv() */
     s->top = blk_new(0, BLK_PERM_ALL);