diff mbox series

[for-6.1?,v2,5/7] job: Add job_cancel_requested()

Message ID 20210726144613.954844-6-mreitz@redhat.com
State New
Headers show
Series mirror: Handle errors after READY cancel | expand

Commit Message

Max Reitz July 26, 2021, 2:46 p.m. UTC
Most callers of job_is_cancelled() actually want to know whether the job
is on its way to immediate termination.  For example, we refuse to pause
jobs that are cancelled; but this only makes sense for jobs that are
really actually cancelled.

A mirror job that is cancelled during READY with force=false should
absolutely be allowed to pause.  This "cancellation" (which is actually
a kind of completion) may take an indefinite amount of time, and so
should behave like any job during normal operation.  For example, with
on-target-error=stop, the job should stop on write errors.  (In
contrast, force-cancelled jobs should not get write errors, as they
should just terminate and not do further I/O.)

Therefore, redefine job_is_cancelled() to only return true for jobs that
are force-cancelled (which as of HEAD^ means any job that interprets the
cancellation request as a request for immediate termination), and add
job_cancel_requested() as the general variant, which returns true for any
jobs which have been requested to be cancelled, whether it be
immediately or after an arbitrarily long completion phase.

Buglink: https://gitlab.com/qemu-project/qemu/-/issues/462
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 include/qemu/job.h |  8 +++++++-
 block/mirror.c     | 10 ++++------
 job.c              |  7 ++++++-
 3 files changed, 17 insertions(+), 8 deletions(-)

Comments

Vladimir Sementsov-Ogievskiy July 27, 2021, 1:04 p.m. UTC | #1
26.07.2021 17:46, Max Reitz wrote:
> Most callers of job_is_cancelled() actually want to know whether the job
> is on its way to immediate termination.  For example, we refuse to pause
> jobs that are cancelled; but this only makes sense for jobs that are
> really actually cancelled.
> 
> A mirror job that is cancelled during READY with force=false should
> absolutely be allowed to pause.  This "cancellation" (which is actually
> a kind of completion) may take an indefinite amount of time, and so
> should behave like any job during normal operation.  For example, with
> on-target-error=stop, the job should stop on write errors.  (In
> contrast, force-cancelled jobs should not get write errors, as they
> should just terminate and not do further I/O.)
> 
> Therefore, redefine job_is_cancelled() to only return true for jobs that
> are force-cancelled (which as of HEAD^ means any job that interprets the
> cancellation request as a request for immediate termination), and add
> job_cancel_request() as the general variant, which returns true for any

job_cancel_requested()

> jobs which have been requested to be cancelled, whether it be
> immediately or after an arbitrarily long completion phase.
> 
> Buglink: https://gitlab.com/qemu-project/qemu/-/issues/462
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>   include/qemu/job.h |  8 +++++++-
>   block/mirror.c     | 10 ++++------
>   job.c              |  7 ++++++-
>   3 files changed, 17 insertions(+), 8 deletions(-)
> 
> diff --git a/include/qemu/job.h b/include/qemu/job.h
> index 8aa90f7395..032edf3c5f 100644
> --- a/include/qemu/job.h
> +++ b/include/qemu/job.h
> @@ -436,9 +436,15 @@ const char *job_type_str(const Job *job);
>   /** Returns true if the job should not be visible to the management layer. */
>   bool job_is_internal(Job *job);
>   
> -/** Returns whether the job is scheduled for cancellation. */
> +/** Returns whether the job is being cancelled. */
>   bool job_is_cancelled(Job *job);
>   
> +/**
> + * Returns whether the job is scheduled for cancellation (at an
> + * indefinite point).
> + */
> +bool job_cancel_requested(Job *job);
> +
>   /** Returns whether the job is in a completed state. */
>   bool job_is_completed(Job *job);
>   
> diff --git a/block/mirror.c b/block/mirror.c
> index e93631a9f6..72e02fa34e 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -936,7 +936,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
>           /* Transition to the READY state and wait for complete. */
>           job_transition_to_ready(&s->common.job);
>           s->actively_synced = true;
> -        while (!job_is_cancelled(&s->common.job) && !s->should_complete) {
> +        while (!job_cancel_requested(&s->common.job) && !s->should_complete) {
>               job_yield(&s->common.job);
>           }
>           s->common.job.cancelled = false;
> @@ -1043,7 +1043,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
>               }
>   
>               should_complete = s->should_complete ||
> -                job_is_cancelled(&s->common.job);
> +                job_cancel_requested(&s->common.job);
>               cnt = bdrv_get_dirty_count(s->dirty_bitmap);
>           }
>   
> @@ -1087,7 +1087,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
>           trace_mirror_before_sleep(s, cnt, job_is_ready(&s->common.job),
>                                     delay_ns);
>           job_sleep_ns(&s->common.job, delay_ns);
> -        if (job_is_cancelled(&s->common.job) && s->common.job.force_cancel) {
> +        if (job_is_cancelled(&s->common.job)) {
>               break;
>           }
>           s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> @@ -1099,9 +1099,7 @@ immediate_exit:
>            * or it was cancelled prematurely so that we do not guarantee that
>            * the target is a copy of the source.
>            */
> -        assert(ret < 0 ||
> -               (s->common.job.force_cancel &&
> -                job_is_cancelled(&s->common.job)));
> +        assert(ret < 0 || job_is_cancelled(&s->common.job));
>           assert(need_drain);
>           mirror_wait_for_all_io(s);
>       }
> diff --git a/job.c b/job.c
> index e78d893a9c..dba17a680f 100644
> --- a/job.c
> +++ b/job.c
> @@ -216,6 +216,11 @@ const char *job_type_str(const Job *job)
>   }
>   
>   bool job_is_cancelled(Job *job)
> +{
> +    return job->cancelled && job->force_cancel;

Can job->cancelled be false when job->force_cancel is true? I think not, and it is worth an assertion here. Something like

if (job->force_cancel) {
    assert(job->cancelled);
    return true;
}

return false;

> +}
> +
> +bool job_cancel_requested(Job *job)
>   {
>       return job->cancelled;
>   }
> @@ -1015,7 +1020,7 @@ void job_complete(Job *job, Error **errp)
>       if (job_apply_verb(job, JOB_VERB_COMPLETE, errp)) {
>           return;
>       }
> -    if (job_is_cancelled(job) || !job->driver->complete) {
> +    if (job_cancel_requested(job) || !job->driver->complete) {
>           error_setg(errp, "The active block job '%s' cannot be completed",
>                      job->id);
>           return;
> 

I think it's a correct change, although there may be unexpected side effects: it's hard to foresee all the consequences of changing the semantics of job_is_cancelled(), which is called in several places in job.c.

Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Max Reitz July 27, 2021, 3:39 p.m. UTC | #2
On 27.07.21 15:04, Vladimir Sementsov-Ogievskiy wrote:
> 26.07.2021 17:46, Max Reitz wrote:
>> Most callers of job_is_cancelled() actually want to know whether the job
>> is on its way to immediate termination.  For example, we refuse to pause
>> jobs that are cancelled; but this only makes sense for jobs that are
>> really actually cancelled.
>>
>> A mirror job that is cancelled during READY with force=false should
>> absolutely be allowed to pause.  This "cancellation" (which is actually
>> a kind of completion) may take an indefinite amount of time, and so
>> should behave like any job during normal operation.  For example, with
>> on-target-error=stop, the job should stop on write errors.  (In
>> contrast, force-cancelled jobs should not get write errors, as they
>> should just terminate and not do further I/O.)
>>
>> Therefore, redefine job_is_cancelled() to only return true for jobs that
>> are force-cancelled (which as of HEAD^ means any job that interprets the
>> cancellation request as a request for immediate termination), and add
>> job_cancel_request() as the general variant, which returns true for any
>
> job_cancel_requested()
>
>> jobs which have been requested to be cancelled, whether it be
>> immediately or after an arbitrarily long completion phase.
>>
>> Buglink: https://gitlab.com/qemu-project/qemu/-/issues/462
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>>   include/qemu/job.h |  8 +++++++-
>>   block/mirror.c     | 10 ++++------
>>   job.c              |  7 ++++++-
>>   3 files changed, 17 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/qemu/job.h b/include/qemu/job.h
>> index 8aa90f7395..032edf3c5f 100644
>> --- a/include/qemu/job.h
>> +++ b/include/qemu/job.h
>> @@ -436,9 +436,15 @@ const char *job_type_str(const Job *job);
>>   /** Returns true if the job should not be visible to the management 
>> layer. */
>>   bool job_is_internal(Job *job);
>>   -/** Returns whether the job is scheduled for cancellation. */
>> +/** Returns whether the job is being cancelled. */
>>   bool job_is_cancelled(Job *job);
>>   +/**
>> + * Returns whether the job is scheduled for cancellation (at an
>> + * indefinite point).
>> + */
>> +bool job_cancel_requested(Job *job);
>> +
>>   /** Returns whether the job is in a completed state. */
>>   bool job_is_completed(Job *job);
>>   diff --git a/block/mirror.c b/block/mirror.c
>> index e93631a9f6..72e02fa34e 100644
>> --- a/block/mirror.c
>> +++ b/block/mirror.c
>> @@ -936,7 +936,7 @@ static int coroutine_fn mirror_run(Job *job, 
>> Error **errp)
>>           /* Transition to the READY state and wait for complete. */
>>           job_transition_to_ready(&s->common.job);
>>           s->actively_synced = true;
>> -        while (!job_is_cancelled(&s->common.job) && 
>> !s->should_complete) {
>> +        while (!job_cancel_requested(&s->common.job) && 
>> !s->should_complete) {
>>               job_yield(&s->common.job);
>>           }
>>           s->common.job.cancelled = false;
>> @@ -1043,7 +1043,7 @@ static int coroutine_fn mirror_run(Job *job, 
>> Error **errp)
>>               }
>>                 should_complete = s->should_complete ||
>> -                job_is_cancelled(&s->common.job);
>> +                job_cancel_requested(&s->common.job);
>>               cnt = bdrv_get_dirty_count(s->dirty_bitmap);
>>           }
>>   @@ -1087,7 +1087,7 @@ static int coroutine_fn mirror_run(Job *job, 
>> Error **errp)
>>           trace_mirror_before_sleep(s, cnt, 
>> job_is_ready(&s->common.job),
>>                                     delay_ns);
>>           job_sleep_ns(&s->common.job, delay_ns);
>> -        if (job_is_cancelled(&s->common.job) && 
>> s->common.job.force_cancel) {
>> +        if (job_is_cancelled(&s->common.job)) {
>>               break;
>>           }
>>           s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>> @@ -1099,9 +1099,7 @@ immediate_exit:
>>            * or it was cancelled prematurely so that we do not 
>> guarantee that
>>            * the target is a copy of the source.
>>            */
>> -        assert(ret < 0 ||
>> -               (s->common.job.force_cancel &&
>> -                job_is_cancelled(&s->common.job)));
>> +        assert(ret < 0 || job_is_cancelled(&s->common.job));

(As a note, I hope this does the job regarding your suggestions for 
patch 4. :))

>>           assert(need_drain);
>>           mirror_wait_for_all_io(s);
>>       }
>> diff --git a/job.c b/job.c
>> index e78d893a9c..dba17a680f 100644
>> --- a/job.c
>> +++ b/job.c
>> @@ -216,6 +216,11 @@ const char *job_type_str(const Job *job)
>>   }
>>     bool job_is_cancelled(Job *job)
>> +{
>> +    return job->cancelled && job->force_cancel;
>
> can job->cancelled be false when job->force_cancel is true ? I think 
> not and worth an assertion here. Something like
>
> if (job->force_cancel) {
>    assert(job->cancelled);
>    return true;
> }
>
> return false;

Sounds good, why not.

>
>> +}
>> +
>> +bool job_cancel_requested(Job *job)
>>   {
>>       return job->cancelled;
>>   }
>> @@ -1015,7 +1020,7 @@ void job_complete(Job *job, Error **errp)
>>       if (job_apply_verb(job, JOB_VERB_COMPLETE, errp)) {
>>           return;
>>       }
>> -    if (job_is_cancelled(job) || !job->driver->complete) {
>> +    if (job_cancel_requested(job) || !job->driver->complete) {
>>           error_setg(errp, "The active block job '%s' cannot be 
>> completed",
>>                      job->id);
>>           return;
>>
>
> I think it's a correct change, although there may be unexpected 
> side-effects, it's hard to imagine all consequences of changing 
> job_is_cancelled() semantics called in several places in job.c.

Yeah.  Targeting 6.2, I don’t have a bad feeling about it, though.

> Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>

Thanks for the review, by the way!

Max
Vladimir Sementsov-Ogievskiy July 27, 2021, 3:47 p.m. UTC | #3
27.07.2021 18:39, Max Reitz wrote:
> On 27.07.21 15:04, Vladimir Sementsov-Ogievskiy wrote:
>> 26.07.2021 17:46, Max Reitz wrote:
>>> Most callers of job_is_cancelled() actually want to know whether the job
>>> is on its way to immediate termination.  For example, we refuse to pause
>>> jobs that are cancelled; but this only makes sense for jobs that are
>>> really actually cancelled.
>>>
>>> A mirror job that is cancelled during READY with force=false should
>>> absolutely be allowed to pause.  This "cancellation" (which is actually
>>> a kind of completion) may take an indefinite amount of time, and so
>>> should behave like any job during normal operation.  For example, with
>>> on-target-error=stop, the job should stop on write errors.  (In
>>> contrast, force-cancelled jobs should not get write errors, as they
>>> should just terminate and not do further I/O.)
>>>
>>> Therefore, redefine job_is_cancelled() to only return true for jobs that
>>> are force-cancelled (which as of HEAD^ means any job that interprets the
>>> cancellation request as a request for immediate termination), and add
>>> job_cancel_request() as the general variant, which returns true for any
>>
>> job_cancel_requested()
>>
>>> jobs which have been requested to be cancelled, whether it be
>>> immediately or after an arbitrarily long completion phase.
>>>
>>> Buglink: https://gitlab.com/qemu-project/qemu/-/issues/462
>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>>> ---
>>>   include/qemu/job.h |  8 +++++++-
>>>   block/mirror.c     | 10 ++++------
>>>   job.c              |  7 ++++++-
>>>   3 files changed, 17 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/include/qemu/job.h b/include/qemu/job.h
>>> index 8aa90f7395..032edf3c5f 100644
>>> --- a/include/qemu/job.h
>>> +++ b/include/qemu/job.h
>>> @@ -436,9 +436,15 @@ const char *job_type_str(const Job *job);
>>>   /** Returns true if the job should not be visible to the management layer. */
>>>   bool job_is_internal(Job *job);
>>>   -/** Returns whether the job is scheduled for cancellation. */
>>> +/** Returns whether the job is being cancelled. */
>>>   bool job_is_cancelled(Job *job);
>>>   +/**
>>> + * Returns whether the job is scheduled for cancellation (at an
>>> + * indefinite point).
>>> + */
>>> +bool job_cancel_requested(Job *job);
>>> +
>>>   /** Returns whether the job is in a completed state. */
>>>   bool job_is_completed(Job *job);
>>>   diff --git a/block/mirror.c b/block/mirror.c
>>> index e93631a9f6..72e02fa34e 100644
>>> --- a/block/mirror.c
>>> +++ b/block/mirror.c
>>> @@ -936,7 +936,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
>>>           /* Transition to the READY state and wait for complete. */
>>>           job_transition_to_ready(&s->common.job);
>>>           s->actively_synced = true;
>>> -        while (!job_is_cancelled(&s->common.job) && !s->should_complete) {
>>> +        while (!job_cancel_requested(&s->common.job) && !s->should_complete) {
>>>               job_yield(&s->common.job);
>>>           }
>>>           s->common.job.cancelled = false;
>>> @@ -1043,7 +1043,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
>>>               }
>>>                 should_complete = s->should_complete ||
>>> -                job_is_cancelled(&s->common.job);
>>> +                job_cancel_requested(&s->common.job);
>>>               cnt = bdrv_get_dirty_count(s->dirty_bitmap);
>>>           }
>>>   @@ -1087,7 +1087,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
>>>           trace_mirror_before_sleep(s, cnt, job_is_ready(&s->common.job),
>>>                                     delay_ns);
>>>           job_sleep_ns(&s->common.job, delay_ns);
>>> -        if (job_is_cancelled(&s->common.job) && s->common.job.force_cancel) {
>>> +        if (job_is_cancelled(&s->common.job)) {
>>>               break;
>>>           }
>>>           s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>>> @@ -1099,9 +1099,7 @@ immediate_exit:
>>>            * or it was cancelled prematurely so that we do not guarantee that
>>>            * the target is a copy of the source.
>>>            */
>>> -        assert(ret < 0 ||
>>> -               (s->common.job.force_cancel &&
>>> -                job_is_cancelled(&s->common.job)));
>>> +        assert(ret < 0 || job_is_cancelled(&s->common.job));
> 
> (As a note, I hope this does the job regarding your suggestions for patch 4. :))
> 
>>>           assert(need_drain);
>>>           mirror_wait_for_all_io(s);
>>>       }
>>> diff --git a/job.c b/job.c
>>> index e78d893a9c..dba17a680f 100644
>>> --- a/job.c
>>> +++ b/job.c
>>> @@ -216,6 +216,11 @@ const char *job_type_str(const Job *job)
>>>   }
>>>     bool job_is_cancelled(Job *job)
>>> +{
>>> +    return job->cancelled && job->force_cancel;
>>
>> can job->cancelled be false when job->force_cancel is true ? I think not and worth an assertion here. Something like
>>
>> if (job->force_cancel) {
>>    assert(job->cancelled);
>>    return true;
>> }
>>
>> return false;
> 
> Sounds good, why not.
> 
>>
>>> +}
>>> +
>>> +bool job_cancel_requested(Job *job)
>>>   {
>>>       return job->cancelled;
>>>   }
>>> @@ -1015,7 +1020,7 @@ void job_complete(Job *job, Error **errp)
>>>       if (job_apply_verb(job, JOB_VERB_COMPLETE, errp)) {
>>>           return;
>>>       }
>>> -    if (job_is_cancelled(job) || !job->driver->complete) {
>>> +    if (job_cancel_requested(job) || !job->driver->complete) {
>>>           error_setg(errp, "The active block job '%s' cannot be completed",
>>>                      job->id);
>>>           return;
>>>
>>
>> I think it's a correct change, although there may be unexpected side-effects, it's hard to imagine all consequences of changing job_is_cancelled() semantics called in several places in job.c.

For example: we now don't set -ECANCELED in job_update_rc for soft-cancel.

This means that job_finalize_single() will call job_commit instead of job_abort, and job_commit may make some graph changes, which shouldn't happen for soft-cancel.

> 
> Yeah.  Targeting 6.2, I don’t have a bad feeling about it, though.
> 
>> Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> 
> Thanks for the review, by the way!
> 
> Max
>
Max Reitz Aug. 2, 2021, 10:23 a.m. UTC | #4
On 27.07.21 17:47, Vladimir Sementsov-Ogievskiy wrote:
> 27.07.2021 18:39, Max Reitz wrote:
>> On 27.07.21 15:04, Vladimir Sementsov-Ogievskiy wrote:
>>> 26.07.2021 17:46, Max Reitz wrote:
>>>> Most callers of job_is_cancelled() actually want to know whether 
>>>> the job
>>>> is on its way to immediate termination.  For example, we refuse to 
>>>> pause
>>>> jobs that are cancelled; but this only makes sense for jobs that are
>>>> really actually cancelled.
>>>>
>>>> A mirror job that is cancelled during READY with force=false should
>>>> absolutely be allowed to pause.  This "cancellation" (which is 
>>>> actually
>>>> a kind of completion) may take an indefinite amount of time, and so
>>>> should behave like any job during normal operation.  For example, with
>>>> on-target-error=stop, the job should stop on write errors. (In
>>>> contrast, force-cancelled jobs should not get write errors, as they
>>>> should just terminate and not do further I/O.)
>>>>
>>>> Therefore, redefine job_is_cancelled() to only return true for jobs 
>>>> that
>>>> are force-cancelled (which as of HEAD^ means any job that 
>>>> interprets the
>>>> cancellation request as a request for immediate termination), and add
>>>> job_cancel_request() as the general variant, which returns true for 
>>>> any
>>>
>>> job_cancel_requested()
>>>
>>>> jobs which have been requested to be cancelled, whether it be
>>>> immediately or after an arbitrarily long completion phase.
>>>>
>>>> Buglink: https://gitlab.com/qemu-project/qemu/-/issues/462
>>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>>>> ---
>>>>   include/qemu/job.h |  8 +++++++-
>>>>   block/mirror.c     | 10 ++++------
>>>>   job.c              |  7 ++++++-
>>>>   3 files changed, 17 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/include/qemu/job.h b/include/qemu/job.h
>>>> index 8aa90f7395..032edf3c5f 100644
>>>> --- a/include/qemu/job.h
>>>> +++ b/include/qemu/job.h
>>>> @@ -436,9 +436,15 @@ const char *job_type_str(const Job *job);
>>>>   /** Returns true if the job should not be visible to the 
>>>> management layer. */
>>>>   bool job_is_internal(Job *job);
>>>>   -/** Returns whether the job is scheduled for cancellation. */
>>>> +/** Returns whether the job is being cancelled. */
>>>>   bool job_is_cancelled(Job *job);
>>>>   +/**
>>>> + * Returns whether the job is scheduled for cancellation (at an
>>>> + * indefinite point).
>>>> + */
>>>> +bool job_cancel_requested(Job *job);
>>>> +
>>>>   /** Returns whether the job is in a completed state. */
>>>>   bool job_is_completed(Job *job);
>>>>   diff --git a/block/mirror.c b/block/mirror.c
>>>> index e93631a9f6..72e02fa34e 100644
>>>> --- a/block/mirror.c
>>>> +++ b/block/mirror.c
>>>> @@ -936,7 +936,7 @@ static int coroutine_fn mirror_run(Job *job, 
>>>> Error **errp)
>>>>           /* Transition to the READY state and wait for complete. */
>>>>           job_transition_to_ready(&s->common.job);
>>>>           s->actively_synced = true;
>>>> -        while (!job_is_cancelled(&s->common.job) && 
>>>> !s->should_complete) {
>>>> +        while (!job_cancel_requested(&s->common.job) && 
>>>> !s->should_complete) {
>>>>               job_yield(&s->common.job);
>>>>           }
>>>>           s->common.job.cancelled = false;
>>>> @@ -1043,7 +1043,7 @@ static int coroutine_fn mirror_run(Job *job, 
>>>> Error **errp)
>>>>               }
>>>>                 should_complete = s->should_complete ||
>>>> -                job_is_cancelled(&s->common.job);
>>>> +                job_cancel_requested(&s->common.job);
>>>>               cnt = bdrv_get_dirty_count(s->dirty_bitmap);
>>>>           }
>>>>   @@ -1087,7 +1087,7 @@ static int coroutine_fn mirror_run(Job 
>>>> *job, Error **errp)
>>>>           trace_mirror_before_sleep(s, cnt, 
>>>> job_is_ready(&s->common.job),
>>>>                                     delay_ns);
>>>>           job_sleep_ns(&s->common.job, delay_ns);
>>>> -        if (job_is_cancelled(&s->common.job) && 
>>>> s->common.job.force_cancel) {
>>>> +        if (job_is_cancelled(&s->common.job)) {
>>>>               break;
>>>>           }
>>>>           s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>>>> @@ -1099,9 +1099,7 @@ immediate_exit:
>>>>            * or it was cancelled prematurely so that we do not 
>>>> guarantee that
>>>>            * the target is a copy of the source.
>>>>            */
>>>> -        assert(ret < 0 ||
>>>> -               (s->common.job.force_cancel &&
>>>> -                job_is_cancelled(&s->common.job)));
>>>> +        assert(ret < 0 || job_is_cancelled(&s->common.job));
>>
>> (As a note, I hope this does the job regarding your suggestions for 
>> patch 4. :))
>>
>>>>           assert(need_drain);
>>>>           mirror_wait_for_all_io(s);
>>>>       }
>>>> diff --git a/job.c b/job.c
>>>> index e78d893a9c..dba17a680f 100644
>>>> --- a/job.c
>>>> +++ b/job.c
>>>> @@ -216,6 +216,11 @@ const char *job_type_str(const Job *job)
>>>>   }
>>>>     bool job_is_cancelled(Job *job)
>>>> +{
>>>> +    return job->cancelled && job->force_cancel;
>>>
>>> can job->cancelled be false when job->force_cancel is true ? I think 
>>> not and worth an assertion here. Something like
>>>
>>> if (job->force_cancel) {
>>>    assert(job->cancelled);
>>>    return true;
>>> }
>>>
>>> return false;
>>
>> Sounds good, why not.
>>
>>>
>>>> +}
>>>> +
>>>> +bool job_cancel_requested(Job *job)
>>>>   {
>>>>       return job->cancelled;
>>>>   }
>>>> @@ -1015,7 +1020,7 @@ void job_complete(Job *job, Error **errp)
>>>>       if (job_apply_verb(job, JOB_VERB_COMPLETE, errp)) {
>>>>           return;
>>>>       }
>>>> -    if (job_is_cancelled(job) || !job->driver->complete) {
>>>> +    if (job_cancel_requested(job) || !job->driver->complete) {
>>>>           error_setg(errp, "The active block job '%s' cannot be 
>>>> completed",
>>>>                      job->id);
>>>>           return;
>>>>
>>>
>>> I think it's a correct change, although there may be unexpected 
>>> side-effects, it's hard to imagine all consequences of changing 
>>> job_is_cancelled() semantics called in several places in job.c.
>
> For example: so we now don't set -ECANCELED in job_update_rc for 
> soft-cancel..
>
> This mean that job_finalize_single() will call job_commit instead of 
> job_abort, and job_commit may do some graph changes, which shouldn't 
> happen for soft-cancel

So the question is when these two conditions come into play.

There are two places that set job->ret to ECANCELED if the job is 
cancelled, namely job_update_rc(), and job_finish_sync().

job_finish_sync() will do so only after the job has been completed, 
which requires the job to either have been aborted (i.e. ret is non-zero 
anyway) or job_completed() to have been called. job_completed() is 
called by job_exit(), which is run after the job’s main loop has 
exited.  If mirror is soft-cancelled, mirror_run() will clear 
s->common.job.cancelled before returning, so job_finish_sync() will not 
see the job as cancelled.

job_update_rc() is called from three places:

job_finalize_single(): Asserts that job_is_completed(), so the same 
reasoning as for job_finish_sync() applies.

job_prepare(): Called by job_do_finalize(), which can only happen when 
the job is completed.  (JobVerbTable only allows finalization when the 
job is PENDING, which is a state where job_is_completed() is true, i.e. 
after mirror_run().)

job_completed(): Same reasoning as for job_finish_sync().


So it looks to me like these places that set job->ret to ECANCELED if 
the job has been cancelled do not consider a soft-cancelled mirror job 
to have been cancelled, which makes using job_is_cancelled() instead of 
job_cancel_requested() correct there. (And most likely, we can drop the 
`.cancelled = false` statements from the mirror job in turn.)

Max
Vladimir Sementsov-Ogievskiy Aug. 3, 2021, 12:35 p.m. UTC | #5
02.08.2021 13:23, Max Reitz wrote:
> On 27.07.21 17:47, Vladimir Sementsov-Ogievskiy wrote:
>> 27.07.2021 18:39, Max Reitz wrote:
>>> On 27.07.21 15:04, Vladimir Sementsov-Ogievskiy wrote:
>>>> 26.07.2021 17:46, Max Reitz wrote:
>>>>> Most callers of job_is_cancelled() actually want to know whether the job
>>>>> is on its way to immediate termination.  For example, we refuse to pause
>>>>> jobs that are cancelled; but this only makes sense for jobs that are
>>>>> really actually cancelled.
>>>>>
>>>>> A mirror job that is cancelled during READY with force=false should
>>>>> absolutely be allowed to pause.  This "cancellation" (which is actually
>>>>> a kind of completion) may take an indefinite amount of time, and so
>>>>> should behave like any job during normal operation.  For example, with
>>>>> on-target-error=stop, the job should stop on write errors. (In
>>>>> contrast, force-cancelled jobs should not get write errors, as they
>>>>> should just terminate and not do further I/O.)
>>>>>
>>>>> Therefore, redefine job_is_cancelled() to only return true for jobs that
>>>>> are force-cancelled (which as of HEAD^ means any job that interprets the
>>>>> cancellation request as a request for immediate termination), and add
>>>>> job_cancel_request() as the general variant, which returns true for any
>>>>
>>>> job_cancel_requested()
>>>>
>>>>> jobs which have been requested to be cancelled, whether it be
>>>>> immediately or after an arbitrarily long completion phase.
>>>>>
>>>>> Buglink: https://gitlab.com/qemu-project/qemu/-/issues/462
>>>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>>>>> ---
>>>>>   include/qemu/job.h |  8 +++++++-
>>>>>   block/mirror.c     | 10 ++++------
>>>>>   job.c              |  7 ++++++-
>>>>>   3 files changed, 17 insertions(+), 8 deletions(-)
>>>>>
>>>>> diff --git a/include/qemu/job.h b/include/qemu/job.h
>>>>> index 8aa90f7395..032edf3c5f 100644
>>>>> --- a/include/qemu/job.h
>>>>> +++ b/include/qemu/job.h
>>>>> @@ -436,9 +436,15 @@ const char *job_type_str(const Job *job);
>>>>>   /** Returns true if the job should not be visible to the management layer. */
>>>>>   bool job_is_internal(Job *job);
>>>>>   -/** Returns whether the job is scheduled for cancellation. */
>>>>> +/** Returns whether the job is being cancelled. */
>>>>>   bool job_is_cancelled(Job *job);
>>>>>   +/**
>>>>> + * Returns whether the job is scheduled for cancellation (at an
>>>>> + * indefinite point).
>>>>> + */
>>>>> +bool job_cancel_requested(Job *job);
>>>>> +
>>>>>   /** Returns whether the job is in a completed state. */
>>>>>   bool job_is_completed(Job *job);
>>>>>   diff --git a/block/mirror.c b/block/mirror.c
>>>>> index e93631a9f6..72e02fa34e 100644
>>>>> --- a/block/mirror.c
>>>>> +++ b/block/mirror.c
>>>>> @@ -936,7 +936,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
>>>>>           /* Transition to the READY state and wait for complete. */
>>>>>           job_transition_to_ready(&s->common.job);
>>>>>           s->actively_synced = true;
>>>>> -        while (!job_is_cancelled(&s->common.job) && !s->should_complete) {
>>>>> +        while (!job_cancel_requested(&s->common.job) && !s->should_complete) {
>>>>>               job_yield(&s->common.job);
>>>>>           }
>>>>>           s->common.job.cancelled = false;
>>>>> @@ -1043,7 +1043,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
>>>>>               }
>>>>>                 should_complete = s->should_complete ||
>>>>> -                job_is_cancelled(&s->common.job);
>>>>> +                job_cancel_requested(&s->common.job);
>>>>>               cnt = bdrv_get_dirty_count(s->dirty_bitmap);
>>>>>           }
>>>>>   @@ -1087,7 +1087,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
>>>>>           trace_mirror_before_sleep(s, cnt, job_is_ready(&s->common.job),
>>>>>                                     delay_ns);
>>>>>           job_sleep_ns(&s->common.job, delay_ns);
>>>>> -        if (job_is_cancelled(&s->common.job) && s->common.job.force_cancel) {
>>>>> +        if (job_is_cancelled(&s->common.job)) {
>>>>>               break;
>>>>>           }
>>>>>           s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>>>>> @@ -1099,9 +1099,7 @@ immediate_exit:
>>>>>            * or it was cancelled prematurely so that we do not guarantee that
>>>>>            * the target is a copy of the source.
>>>>>            */
>>>>> -        assert(ret < 0 ||
>>>>> -               (s->common.job.force_cancel &&
>>>>> -                job_is_cancelled(&s->common.job)));
>>>>> +        assert(ret < 0 || job_is_cancelled(&s->common.job));
>>>
>>> (As a note, I hope this does the job regarding your suggestions for patch 4. :))
>>>
>>>>>           assert(need_drain);
>>>>>           mirror_wait_for_all_io(s);
>>>>>       }
>>>>> diff --git a/job.c b/job.c
>>>>> index e78d893a9c..dba17a680f 100644
>>>>> --- a/job.c
>>>>> +++ b/job.c
>>>>> @@ -216,6 +216,11 @@ const char *job_type_str(const Job *job)
>>>>>   }
>>>>>     bool job_is_cancelled(Job *job)
>>>>> +{
>>>>> +    return job->cancelled && job->force_cancel;
>>>>
>>>> Can job->cancelled be false when job->force_cancel is true? I think not, and that is worth an assertion here. Something like:
>>>>
>>>> if (job->force_cancel) {
>>>>    assert(job->cancelled);
>>>>    return true;
>>>> }
>>>>
>>>> return false;
>>>
>>> Sounds good, why not.
>>>
>>>>
>>>>> +}
>>>>> +
>>>>> +bool job_cancel_requested(Job *job)
>>>>>   {
>>>>>       return job->cancelled;
>>>>>   }
>>>>> @@ -1015,7 +1020,7 @@ void job_complete(Job *job, Error **errp)
>>>>>       if (job_apply_verb(job, JOB_VERB_COMPLETE, errp)) {
>>>>>           return;
>>>>>       }
>>>>> -    if (job_is_cancelled(job) || !job->driver->complete) {
>>>>> +    if (job_cancel_requested(job) || !job->driver->complete) {
>>>>>           error_setg(errp, "The active block job '%s' cannot be completed",
>>>>>                      job->id);
>>>>>           return;
>>>>>
>>>>
>>>> I think it's a correct change, although there may be unexpected side effects; it's hard to foresee all the consequences of changing the semantics of job_is_cancelled(), which is called in several places in job.c.
>>
>> For example: we now don't set -ECANCELED in job_update_rc() for soft-cancel.
>>
>> This means that job_finalize_single() will call job_commit() instead of job_abort(), and job_commit() may do some graph changes, which shouldn't happen for soft-cancel.
> 
> So the question is when these two conditions come into play.
> 
> There are two places that set job->ret to ECANCELED if the job is cancelled, namely job_update_rc(), and job_finish_sync().
> 
> job_finish_sync() will do so only after the job has been completed, which requires the job to either have been aborted (i.e. ret is non-zero anyway) or job_completed() to have been called. job_completed() is called by job_exit(), which is run after the job’s main loop has exited.  If mirror is soft-cancelled, mirror_run() will clear s->common.job.cancelled before returning, so job_finish_sync() will not see the job as cancelled.
> 
> job_update_rc() is called from three places:
> 
> job_finalize_single(): Asserts that job_is_completed(), so the same reasoning as for job_finish_sync() applies.
> 
> job_prepare(): Called by job_do_finalize(), which can only happen when the job is completed.  (JobVerbTable only allows finalization when the job is PENDING, which is a state where job_is_completed() is true, i.e. after mirror_run().)
> 
> job_completed(): Same reasoning as for job_finish_sync().
> 
> 
> So it looks to me like these places that set job->ret to ECANCELED if the job has been cancelled do not consider a soft-cancelled mirror job to have been cancelled, which makes using job_is_cancelled() instead of job_cancel_requested() correct there. (And most likely, we can drop the `.cancelled = false` statements from the mirror job in turn.)
> 


Hm, reasonable. OK then, thanks for the explanation.
Kevin Wolf Aug. 3, 2021, 2:25 p.m. UTC | #6
Am 26.07.2021 um 16:46 hat Max Reitz geschrieben:
> Most callers of job_is_cancelled() actually want to know whether the job
> is on its way to immediate termination.  For example, we refuse to pause
> jobs that are cancelled; but this only makes sense for jobs that are
> really actually cancelled.
> 
> A mirror job that is cancelled during READY with force=false should
> absolutely be allowed to pause.  This "cancellation" (which is actually
> a kind of completion) may take an indefinite amount of time, and so
> should behave like any job during normal operation.  For example, with
> on-target-error=stop, the job should stop on write errors.  (In
> contrast, force-cancelled jobs should not get write errors, as they
> should just terminate and not do further I/O.)
> 
> Therefore, redefine job_is_cancelled() to only return true for jobs that
> are force-cancelled (which as of HEAD^ means any job that interprets the
> cancellation request as a request for immediate termination), and add
> job_cancel_requested() as the general variant, which returns true for any
> jobs which have been requested to be cancelled, whether it be
> immediately or after an arbitrarily long completion phase.
> 
> Buglink: https://gitlab.com/qemu-project/qemu/-/issues/462
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  include/qemu/job.h |  8 +++++++-
>  block/mirror.c     | 10 ++++------
>  job.c              |  7 ++++++-
>  3 files changed, 17 insertions(+), 8 deletions(-)
> 
> diff --git a/include/qemu/job.h b/include/qemu/job.h
> index 8aa90f7395..032edf3c5f 100644
> --- a/include/qemu/job.h
> +++ b/include/qemu/job.h
> @@ -436,9 +436,15 @@ const char *job_type_str(const Job *job);
>  /** Returns true if the job should not be visible to the management layer. */
>  bool job_is_internal(Job *job);
>  
> -/** Returns whether the job is scheduled for cancellation. */
> +/** Returns whether the job is being cancelled. */
>  bool job_is_cancelled(Job *job);
>  
> +/**
> + * Returns whether the job is scheduled for cancellation (at an
> + * indefinite point).
> + */
> +bool job_cancel_requested(Job *job);

I don't think non-force block-job-cancel for mirror should actually be
considered cancellation, so what is the question that this function
answers?

"Is this a cancelled job, or a mirror block job that is supposed to
complete soon, but only if it doesn't switch over the users to the
target on completion"?

Is this ever a reasonable question to ask, except maybe inside the
mirror implementation itself?

job_complete() is the only function outside of mirror that seems to use
it. But even there, it feels wrong to make a difference. Either we
accept redundant completion requests, or we don't. It doesn't really
matter how the job reconfigures the graph on completion. (Also, I feel
this should really have been part of the state machine, but I'm not sure
if we want to touch it now...)

Kevin
Max Reitz Aug. 4, 2021, 8:07 a.m. UTC | #7
On 03.08.21 16:25, Kevin Wolf wrote:
> Am 26.07.2021 um 16:46 hat Max Reitz geschrieben:
>> Most callers of job_is_cancelled() actually want to know whether the job
>> is on its way to immediate termination.  For example, we refuse to pause
>> jobs that are cancelled; but this only makes sense for jobs that are
>> really actually cancelled.
>>
>> A mirror job that is cancelled during READY with force=false should
>> absolutely be allowed to pause.  This "cancellation" (which is actually
>> a kind of completion) may take an indefinite amount of time, and so
>> should behave like any job during normal operation.  For example, with
>> on-target-error=stop, the job should stop on write errors.  (In
>> contrast, force-cancelled jobs should not get write errors, as they
>> should just terminate and not do further I/O.)
>>
>> Therefore, redefine job_is_cancelled() to only return true for jobs that
>> are force-cancelled (which as of HEAD^ means any job that interprets the
>> cancellation request as a request for immediate termination), and add
>> job_cancel_requested() as the general variant, which returns true for any
>> jobs which have been requested to be cancelled, whether it be
>> immediately or after an arbitrarily long completion phase.
>>
>> Buglink: https://gitlab.com/qemu-project/qemu/-/issues/462
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>>   include/qemu/job.h |  8 +++++++-
>>   block/mirror.c     | 10 ++++------
>>   job.c              |  7 ++++++-
>>   3 files changed, 17 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/qemu/job.h b/include/qemu/job.h
>> index 8aa90f7395..032edf3c5f 100644
>> --- a/include/qemu/job.h
>> +++ b/include/qemu/job.h
>> @@ -436,9 +436,15 @@ const char *job_type_str(const Job *job);
>>   /** Returns true if the job should not be visible to the management layer. */
>>   bool job_is_internal(Job *job);
>>   
>> -/** Returns whether the job is scheduled for cancellation. */
>> +/** Returns whether the job is being cancelled. */
>>   bool job_is_cancelled(Job *job);
>>   
>> +/**
>> + * Returns whether the job is scheduled for cancellation (at an
>> + * indefinite point).
>> + */
>> +bool job_cancel_requested(Job *job);
> I don't think non-force block-job-cancel for mirror should actually be
> considered cancellation, so what is the question that this function
> answers?
>
> "Is this a cancelled job, or a mirror block job that is supposed to
> complete soon, but only if it doesn't switch over the users to the
> target on completion"?

Well, technically yes, but it was more intended as “Has the user ever 
invoked (block-)job-cancel on this job?”.

> Is this ever a reasonable question to ask, except maybe inside the
> mirror implementation itself?

I asked myself the same for v3, but found two places in job.c where I 
would like to keep it:

First, there’s an assertion in job_completed_txn_abort().  All jobs 
other than @job have been force-cancelled, and so job_is_cancelled() 
would be true for them.  As for @job itself, the function is mostly 
called when the job’s return value is not 0, but a soft-cancelled mirror 
does have a return value of 0 and so would not end up in that function.
But job_cancel() invokes job_completed_txn_abort() if the job has been 
deferred to the main loop, which mostly correlates with the job having 
been completed (in which case the assertion is skipped), but not 100 % 
(there’s a small window between setting deferred_to_main_loop and the 
job changing to a completed state).
So I’d prefer to keep the assertion as-is functionally, i.e. to only 
check job->cancelled.

Second, job_complete() refuses to let a job complete that has been 
cancelled.  This function is basically only invoked by the user (through 
qmp_block_job_complete()/qmp_job_complete(), or job_complete_sync(), 
which comes from qemu-img), so I believe that it should correspond to 
the external interface we have right now; i.e., if the user has invoked 
(block-)job-cancel at one point, job_complete() should generally return 
an error.

> job_complete() is the only function outside of mirror that seems to use
> it. But even there, it feels wrong to make a difference. Either we
> accept redundant completion requests, or we don't. It doesn't really
> matter how the job reconfigures the graph on completion. (Also, I feel
> this should really have been part of the state machine, but I'm not sure
> if we want to touch it now...)

Well, yes, I don’t think it makes a difference because I don’t think 
anyone will first tell the job via block-job-cancel to complete without 
pivoting, and then change their mind and call block-job-complete after 
all.  (Not least because that’s an error pre-series.)

Also, I’m not even sure whether completing after a soft cancel request 
works.  I don’t think any of our code accounts for such a case, so I’d 
rather avoid allowing it if there’s no need to allow it anyway.

Max
Kevin Wolf Aug. 4, 2021, 10:34 a.m. UTC | #8
[ Peter, the question for you is at the end. ]

Am 04.08.2021 um 10:07 hat Max Reitz geschrieben:
> On 03.08.21 16:25, Kevin Wolf wrote:
> > Am 26.07.2021 um 16:46 hat Max Reitz geschrieben:
> > > Most callers of job_is_cancelled() actually want to know whether the job
> > > is on its way to immediate termination.  For example, we refuse to pause
> > > jobs that are cancelled; but this only makes sense for jobs that are
> > > really actually cancelled.
> > > 
> > > A mirror job that is cancelled during READY with force=false should
> > > absolutely be allowed to pause.  This "cancellation" (which is actually
> > > a kind of completion) may take an indefinite amount of time, and so
> > > should behave like any job during normal operation.  For example, with
> > > on-target-error=stop, the job should stop on write errors.  (In
> > > contrast, force-cancelled jobs should not get write errors, as they
> > > should just terminate and not do further I/O.)
> > > 
> > > Therefore, redefine job_is_cancelled() to only return true for jobs that
> > > are force-cancelled (which as of HEAD^ means any job that interprets the
> > > cancellation request as a request for immediate termination), and add
> > > job_cancel_requested() as the general variant, which returns true for any
> > > jobs which have been requested to be cancelled, whether it be
> > > immediately or after an arbitrarily long completion phase.
> > > 
> > > Buglink: https://gitlab.com/qemu-project/qemu/-/issues/462
> > > Signed-off-by: Max Reitz <mreitz@redhat.com>
> > > ---
> > >   include/qemu/job.h |  8 +++++++-
> > >   block/mirror.c     | 10 ++++------
> > >   job.c              |  7 ++++++-
> > >   3 files changed, 17 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/include/qemu/job.h b/include/qemu/job.h
> > > index 8aa90f7395..032edf3c5f 100644
> > > --- a/include/qemu/job.h
> > > +++ b/include/qemu/job.h
> > > @@ -436,9 +436,15 @@ const char *job_type_str(const Job *job);
> > >   /** Returns true if the job should not be visible to the management layer. */
> > >   bool job_is_internal(Job *job);
> > > -/** Returns whether the job is scheduled for cancellation. */
> > > +/** Returns whether the job is being cancelled. */
> > >   bool job_is_cancelled(Job *job);
> > > +/**
> > > + * Returns whether the job is scheduled for cancellation (at an
> > > + * indefinite point).
> > > + */
> > > +bool job_cancel_requested(Job *job);
> > I don't think non-force block-job-cancel for mirror should actually be
> > considered cancellation, so what is the question that this function
> > answers?
> > 
> > "Is this a cancelled job, or a mirror block job that is supposed to
> > complete soon, but only if it doesn't switch over the users to the
> > target on completion"?
> 
> Well, technically yes, but it was more intended as “Has the user ever
> invoked (block-)job-cancel on this job?”.

I understand this, but is this much more useful to know than "Has the
user ever called HMP 'change'?", if you know what I mean?

> > Is this ever a reasonable question to ask, except maybe inside the
> > mirror implementation itself?
> 
> I asked myself the same for v3, but found two places in job.c where I
> would like to keep it:
> 
> First, there’s an assertion in job_completed_txn_abort().  All jobs
> other than @job have been force-cancelled, and so job_is_cancelled()
> would be true for them.  As for @job itself, the function is mostly
> called when the job’s return value is not 0, but a soft-cancelled
> mirror does have a return value of 0 and so would not end up in that
> function.
> But job_cancel() invokes job_completed_txn_abort() if the job has been
> deferred to the main loop, which mostly correlates with the job having
> been completed (in which case the assertion is skipped), but not 100 %
> (there’s a small window between setting deferred_to_main_loop and the
> job changing to a completed state).
> So I’d prefer to keep the assertion as-is functionally, i.e. to only
> check job->cancelled.

Well, you don't. It's still job_is_cancelled() after this patch.

So the scenario you're concerned about is a job that has just finished
successfully (job->ret = 0) and then gets a cancel request?

With force=false, I'm pretty sure the code is wrong anyway because
calling job_completed_txn_abort() is not the right response. It should
return an error because you're trying to complete twice, possibly with
conflicting completion modes. Second best is just ignoring the cancel
request because we obviously already fulfilled the request of completing
the job (the completion mode might be different, though).

With force=true, arguably still letting the job fail is correct.
However, letting it fail involves more than just letting the transaction
fail. We would have to call job_update_rc() as well so that instead of
reporting success for the job, ECANCELED is returned and the job
transitions to JOB_STATUS_ABORTING (after which job_is_completed()
returns true).

So, just bugs to be fixed.

After this, I think we could even assert(job->ret != 0 ||
job->status == JOB_STATUS_PENDING) in job_completed_txn_abort().
ret == 0 can only happen when called from job_do_finalize(), when the
job is only failing because other jobs in the same transaction have
failed in their .prepare callback.

> Second, job_complete() refuses to let a job complete that has been
> cancelled.  This function is basically only invoked by the user
> (through qmp_block_job_complete()/qmp_job_complete(), or
> job_complete_sync(), which comes from qemu-img), so I believe that it
> should correspond to the external interface we have right now; i.e.,
> if the user has invoked (block-)job-cancel at one point,
> job_complete() should generally return an error.

True. But it should also return an error if the user has invoked
job-complete at some point. The distinction between complete and
non-force cancel doesn't make sense there.

And cancelling with force=false should fail, too, when either job-cancel
or job-complete was called for the job before.

> > job_complete() is the only function outside of mirror that seems to use
> > it. But even there, it feels wrong to make a difference. Either we
> > accept redundant completion requests, or we don't. It doesn't really
> > matter how the job reconfigures the graph on completion. (Also, I feel
> > this should really have been part of the state machine, but I'm not sure
> > if we want to touch it now...)
> 
> Well, yes, I don’t think it makes a difference because I don’t think
> anyone will first tell the job via block-job-cancel to complete
> without pivoting, and then change their mind and call
> block-job-complete after all.  (Not least because that’s an error
> pre-series.)

Right, I'm just arguing that we shouldn't allow the opposite order
either. Currently I think we do, and it's buggy, as explained above.

> Also, I’m not even sure whether completing after a soft cancel request
> works.  I don’t think any of our code accounts for such a case, so I’d
> rather avoid allowing it if there’s no need to allow it anyway.

Yes, definitely avoid it. We should allow only one completion request
(be it with job-complete or block-job-cancel) and return an error for
all future completion requests for the same job.

We could in theory keep allowing redundant completion requests when the
completion mode doesn't conflict, but I don't see the point of that.

Unless libvirt can actually issue multiple completion requests (again,
this includes both (block-)job-complete and non-force block-job-cancel
for mirror) for the same block job - Peter, I hope it doesn't?

Kevin
Peter Krempa Aug. 4, 2021, 11 a.m. UTC | #9
On Wed, Aug 04, 2021 at 12:34:31 +0200, Kevin Wolf wrote:
> We could in theory keep allowing redundant completion requests when the
> completion mode doesn't conflict, but I don't see the point of that.

I don't see it either. Especially since ...



> Unless libvirt can actually issue multiple completion requests (again,
> this includes both (block-)job-complete and non-force block-job-cancel
> for mirror) for the same block job - Peter, I hope it doesn't?

... the regular job completion code in libvirt which is meant for user
interaction (qemuDomainBlockJobAbort) has the following interlock:

    if (job->state == QEMU_BLOCKJOB_STATE_ABORTING ||
        job->state == QEMU_BLOCKJOB_STATE_PIVOTING) {
        virReportError(VIR_ERR_OPERATION_INVALID,
                       _("block job on disk '%s' is still being ended"),
                       disk->dst);
        goto endjob;
    }

... the other two uses of block jobs are internal: handling migration
with non-shared storage, where we also issue exactly one cancel
request, and backup jobs, where we too make sure to cancel just once.

As such, it's okay to forbid the case you are mentioning.
Max Reitz Aug. 4, 2021, 2:15 p.m. UTC | #10
On 04.08.21 12:34, Kevin Wolf wrote:
> [ Peter, the question for you is at the end. ]
>
> Am 04.08.2021 um 10:07 hat Max Reitz geschrieben:
>> On 03.08.21 16:25, Kevin Wolf wrote:
>>> Am 26.07.2021 um 16:46 hat Max Reitz geschrieben:
>>>> Most callers of job_is_cancelled() actually want to know whether the job
>>>> is on its way to immediate termination.  For example, we refuse to pause
>>>> jobs that are cancelled; but this only makes sense for jobs that are
>>>> really actually cancelled.
>>>>
>>>> A mirror job that is cancelled during READY with force=false should
>>>> absolutely be allowed to pause.  This "cancellation" (which is actually
>>>> a kind of completion) may take an indefinite amount of time, and so
>>>> should behave like any job during normal operation.  For example, with
>>>> on-target-error=stop, the job should stop on write errors.  (In
>>>> contrast, force-cancelled jobs should not get write errors, as they
>>>> should just terminate and not do further I/O.)
>>>>
>>>> Therefore, redefine job_is_cancelled() to only return true for jobs that
>>>> are force-cancelled (which as of HEAD^ means any job that interprets the
>>>> cancellation request as a request for immediate termination), and add
>>>> job_cancel_requested() as the general variant, which returns true for any
>>>> jobs which have been requested to be cancelled, whether it be
>>>> immediately or after an arbitrarily long completion phase.
>>>>
>>>> Buglink: https://gitlab.com/qemu-project/qemu/-/issues/462
>>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>>>> ---
>>>>    include/qemu/job.h |  8 +++++++-
>>>>    block/mirror.c     | 10 ++++------
>>>>    job.c              |  7 ++++++-
>>>>    3 files changed, 17 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/include/qemu/job.h b/include/qemu/job.h
>>>> index 8aa90f7395..032edf3c5f 100644
>>>> --- a/include/qemu/job.h
>>>> +++ b/include/qemu/job.h
>>>> @@ -436,9 +436,15 @@ const char *job_type_str(const Job *job);
>>>>    /** Returns true if the job should not be visible to the management layer. */
>>>>    bool job_is_internal(Job *job);
>>>> -/** Returns whether the job is scheduled for cancellation. */
>>>> +/** Returns whether the job is being cancelled. */
>>>>    bool job_is_cancelled(Job *job);
>>>> +/**
>>>> + * Returns whether the job is scheduled for cancellation (at an
>>>> + * indefinite point).
>>>> + */
>>>> +bool job_cancel_requested(Job *job);
>>> I don't think non-force block-job-cancel for mirror should actually be
>>> considered cancellation, so what is the question that this function
>>> answers?
>>>
>>> "Is this a cancelled job, or a mirror block job that is supposed to
>>> complete soon, but only if it doesn't switch over the users to the
>>> target on completion"?
>> Well, technically yes, but it was more intended as “Has the user ever
>> invoked (block-)job-cancel on this job?”.
> I understand this, but is this much more useful to know than "Has the
> user ever called HMP 'change'?", if you know what I mean?

Hm.  Not really.  It’s still a crutch that shouldn’t be there ideally.

But I like this crutch for this series so I can get this batch done, and 
then worry about all the other bugs that keep popping up (and where 
job_cancel_requested() is a nice sign that something’s off).

>>> Is this ever a reasonable question to ask, except maybe inside the
>>> mirror implementation itself?
>> I asked myself the same for v3, but found two places in job.c where I
>> would like to keep it:
>>
>> First, there’s an assertion in job_completed_txn_abort().  All jobs
>> other than @job have been force-cancelled, and so job_is_cancelled()
>> would be true for them.  As for @job itself, the function is mostly
>> called when the job’s return value is not 0, but a soft-cancelled
>> mirror does have a return value of 0 and so would not end up in that
>> function.
>> But job_cancel() invokes job_completed_txn_abort() if the job has been
>> deferred to the main loop, which mostly correlates with the job having
>> been completed (in which case the assertion is skipped), but not 100 %
>> (there’s a small window between setting deferred_to_main_loop and the
>> job changing to a completed state).
>> So I’d prefer to keep the assertion as-is functionally, i.e. to only
>> check job->cancelled.
> Well, you don't. It's still job_is_cancelled() after this patch.

No: I didn’t. O:)

For v3, I had absolutely planned to use job_cancel_requested(), and I 
wanted to put the above explanation into the commit message.

> So the scenario you're concerned about is a job that has just finished
> successfully (job->ret = 0) and then gets a cancel request?

Yes.

> With force=false, I'm pretty sure the code is wrong anyway because
> calling job_completed_txn_abort() is not the right response.

Absolutely possible, I just didn’t want to deal with this, too… :/

> It should
> return an error because you're trying to complete twice, possibly with
> conflicting completion modes. Second best is just ignoring the cancel
> request because we obviously already fulfilled the request of completing
> the job (the completion mode might be different, though).
>
> With force=true, arguably still letting the job fail is correct.
> However, letting it fail involves more than just letting the transaction
> fail. We would have to call job_update_rc() as well so that instead of
> reporting success for the job, ECANCELED is returned and the job
> transitions to JOB_STATUS_ABORTING (after which job_is_completed()
> returns true).
>
> So, just bugs to be fixed.

Yep.  The question is, when/where; in this series or later?

> After this, I think we could even assert(job->ret != 0 ||
> job->status == JOB_STATUS_PENDING) in job_completed_txn_abort().
> ret == 0 can only happen when called from job_do_finalize(), when the
> job is only failing because other jobs in the same transaction have
> failed in their .prepare callback.

Sounds right.

>> Second, job_complete() refuses to let a job complete that has been
>> cancelled.  This function is basically only invoked by the user
>> (through qmp_block_job_complete()/qmp_job_complete(), or
>> job_complete_sync(), which comes from qemu-img), so I believe that it
>> should correspond to the external interface we have right now; i.e.,
>> if the user has invoked (block-)job-cancel at one point,
>> job_complete() should generally return an error.
> True. But it should also return an error if the user has invoked
> job-complete at some point. The distinction between complete and
> non-force cancel doesn't make sense there.

Yes, that’s true, it’s just that having double complete be a failure 
would be a change in behavior, and it would require another patch. Which 
is why I didn’t do it.

> And cancelling with force=false should fail, too, when either job-cancel
> or job-complete was called for the job before.

Yes.  At least after a force=true cancel, a subsequent force=false
cancel is just a no-op, I believe.  (.force_cancel is never reset to
false.)

I’d like to defer this for the design series that Vladimir is planning 
to write, though.

>>> job_complete() is the only function outside of mirror that seems to use
>>> it. But even there, it feels wrong to make a difference. Either we
>>> accept redundant completion requests, or we don't. It doesn't really
>>> matter how the job reconfigures the graph on completion. (Also, I feel
>>> this should really have been part of the state machine, but I'm not sure
>>> if we want to touch it now...)
>> Well, yes, I don’t think it makes a difference because I don’t think
>> anyone will first tell the job via block-job-cancel to complete
>> without pivoting, and then change their mind and call
>> block-job-complete after all.  (Not least because that’s an error
>> pre-series.)
> Right, I'm just arguing that we shouldn't allow the opposite order
> either. Currently I think we do, and it's buggy, as explained above.

I agree.

>> Also, I’m not even sure whether completing after a soft cancel request
>> works.  I don’t think any of our code accounts for such a case, so I’d
>> rather avoid allowing it if there’s no need to allow it anyway.
> Yes, definitely avoid it. We should allow only one completion request
> (be it with job-complete or block-job-cancel) and return an error for
> all future completion requests for the same job.
>
> We could in theory keep allowing redundant completion requests when the
> completion mode doesn't conflict, but I don't see the point of that.

OK.  I personally think this need not be part of this series, though, so 
I’d like to defer it for now. O:)

(And since Vladimir is planning on turning soft cancel into a completion 
mode, I think we’re unlikely to forget about the problem.)

> Unless libvirt can actually issue multiple completion requests (again,
> this includes both (block-)job-complete and non-force block-job-cancel
> for mirror) for the same block job - Peter, I hope it doesn't?
>
> Kevin
>
diff mbox series

Patch

diff --git a/include/qemu/job.h b/include/qemu/job.h
index 8aa90f7395..032edf3c5f 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -436,9 +436,15 @@  const char *job_type_str(const Job *job);
 /** Returns true if the job should not be visible to the management layer. */
 bool job_is_internal(Job *job);
 
-/** Returns whether the job is scheduled for cancellation. */
+/** Returns whether the job is being cancelled. */
 bool job_is_cancelled(Job *job);
 
+/**
+ * Returns whether the job is scheduled for cancellation (at an
+ * indefinite point).
+ */
+bool job_cancel_requested(Job *job);
+
 /** Returns whether the job is in a completed state. */
 bool job_is_completed(Job *job);
 
diff --git a/block/mirror.c b/block/mirror.c
index e93631a9f6..72e02fa34e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -936,7 +936,7 @@  static int coroutine_fn mirror_run(Job *job, Error **errp)
         /* Transition to the READY state and wait for complete. */
         job_transition_to_ready(&s->common.job);
         s->actively_synced = true;
-        while (!job_is_cancelled(&s->common.job) && !s->should_complete) {
+        while (!job_cancel_requested(&s->common.job) && !s->should_complete) {
             job_yield(&s->common.job);
         }
         s->common.job.cancelled = false;
@@ -1043,7 +1043,7 @@  static int coroutine_fn mirror_run(Job *job, Error **errp)
             }
 
             should_complete = s->should_complete ||
-                job_is_cancelled(&s->common.job);
+                job_cancel_requested(&s->common.job);
             cnt = bdrv_get_dirty_count(s->dirty_bitmap);
         }
 
@@ -1087,7 +1087,7 @@  static int coroutine_fn mirror_run(Job *job, Error **errp)
         trace_mirror_before_sleep(s, cnt, job_is_ready(&s->common.job),
                                   delay_ns);
         job_sleep_ns(&s->common.job, delay_ns);
-        if (job_is_cancelled(&s->common.job) && s->common.job.force_cancel) {
+        if (job_is_cancelled(&s->common.job)) {
             break;
         }
         s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
@@ -1099,9 +1099,7 @@  immediate_exit:
          * or it was cancelled prematurely so that we do not guarantee that
          * the target is a copy of the source.
          */
-        assert(ret < 0 ||
-               (s->common.job.force_cancel &&
-                job_is_cancelled(&s->common.job)));
+        assert(ret < 0 || job_is_cancelled(&s->common.job));
         assert(need_drain);
         mirror_wait_for_all_io(s);
     }
diff --git a/job.c b/job.c
index e78d893a9c..dba17a680f 100644
--- a/job.c
+++ b/job.c
@@ -216,6 +216,11 @@  const char *job_type_str(const Job *job)
 }
 
 bool job_is_cancelled(Job *job)
+{
+    return job->cancelled && job->force_cancel;
+}
+
+bool job_cancel_requested(Job *job)
 {
     return job->cancelled;
 }
@@ -1015,7 +1020,7 @@  void job_complete(Job *job, Error **errp)
     if (job_apply_verb(job, JOB_VERB_COMPLETE, errp)) {
         return;
     }
-    if (job_is_cancelled(job) || !job->driver->complete) {
+    if (job_cancel_requested(job) || !job->driver->complete) {
         error_setg(errp, "The active block job '%s' cannot be completed",
                    job->id);
         return;