diff mbox series

[v3] blockcommit: Reopen base image as RO after abort

Message ID 20240130091440.2346274-1-alexander.ivanov@virtuozzo.com
State New
Headers show
Series [v3] blockcommit: Reopen base image as RO after abort | expand

Commit Message

Alexander Ivanov Jan. 30, 2024, 9:14 a.m. UTC
If a blockcommit is aborted the base image remains in RW mode, that leads
to a fail of subsequent live migration.

How to reproduce:
  $ virsh snapshot-create-as vm snp1 --disk-only

  *** write something to the disk inside the guest ***

  $ virsh blockcommit vm vda --active --shallow && virsh blockjob vm vda --abort
  $ lsof /vzt/vm.qcow2
  COMMAND      PID USER   FD   TYPE DEVICE   SIZE/OFF NODE NAME
  qemu-syst 433203 root   45u   REG  253,0 1724776448  133 /vzt/vm.qcow2
  $ cat /proc/433203/fdinfo/45
  pos:    0
  flags:  02140002 <==== The last 2 means RW mode

If the base image is in RW mode at the end of blockcommit and was in RO
mode before blockcommit, check if src BDS has refcnt > 1. If so, the BDS
will not be removed after blockcommit, and we should make the base image
RO. Otherwise check recursively if there is a parent BDS of src BDS and
reopen the base BDS in RO in this case.

Signed-off-by: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
---
 block/mirror.c | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

Comments

Alexander Ivanov Feb. 9, 2024, 12:29 p.m. UTC | #1
Could you please review the patch?

On 1/30/24 10:14, Alexander Ivanov wrote:
> If a blockcommit is aborted the base image remains in RW mode, that leads
> to a fail of subsequent live migration.
>
> How to reproduce:
>    $ virsh snapshot-create-as vm snp1 --disk-only
>
>    *** write something to the disk inside the guest ***
>
>    $ virsh blockcommit vm vda --active --shallow && virsh blockjob vm vda --abort
>    $ lsof /vzt/vm.qcow2
>    COMMAND      PID USER   FD   TYPE DEVICE   SIZE/OFF NODE NAME
>    qemu-syst 433203 root   45u   REG  253,0 1724776448  133 /vzt/vm.qcow2
>    $ cat /proc/433203/fdinfo/45
>    pos:    0
>    flags:  02140002 <==== The last 2 means RW mode
>
> If the base image is in RW mode at the end of blockcommit and was in RO
> mode before blockcommit, check if src BDS has refcnt > 1. If so, the BDS
> will not be removed after blockcommit, and we should make the base image
> RO. Otherwise check recursively if there is a parent BDS of src BDS and
> reopen the base BDS in RO in this case.
>
> Signed-off-by: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
> ---
>   block/mirror.c | 38 ++++++++++++++++++++++++++++++++++++--
>   1 file changed, 36 insertions(+), 2 deletions(-)
>
> diff --git a/block/mirror.c b/block/mirror.c
> index 5145eb53e1..52a7fee75e 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -93,6 +93,7 @@ typedef struct MirrorBlockJob {
>       int64_t active_write_bytes_in_flight;
>       bool prepared;
>       bool in_drain;
> +    bool base_ro;
>   } MirrorBlockJob;
>   
>   typedef struct MirrorBDSOpaque {
> @@ -652,6 +653,32 @@ static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s)
>       }
>   }
>   
> +/*
> + * Check recursively if there is a parent BDS referenced more than
> + * min_refcnt times. This argument is needed because at the first
> + * call there is a bds referenced in blockcommit.
> + */
> +static bool bdrv_chain_has_significant_parent(BlockDriverState *bs)
> +{
> +    BdrvChild *parent;
> +    BlockDriverState *parent_bs;
> +
> +    QLIST_FOREACH(parent, &bs->parents, next) {
> +        if (!(parent->klass->parent_is_bds)) {
> +            continue;
> +        }
> +        parent_bs = parent->opaque;
> +        if (parent_bs->drv && !parent_bs->drv->is_filter) {
> +            return true;
> +        }
> +        if (bdrv_chain_has_significant_parent(parent_bs)) {
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
>   /**
>    * mirror_exit_common: handle both abort() and prepare() cases.
>    * for .prepare, returns 0 on success and -errno on failure.
> @@ -793,6 +820,11 @@ static int mirror_exit_common(Job *job)
>       bdrv_drained_end(target_bs);
>       bdrv_unref(target_bs);
>   
> +    if (s->base_ro && !bdrv_is_read_only(target_bs) &&
> +        (src->refcnt > 1 || bdrv_chain_has_significant_parent(src))) {
> +        bdrv_reopen_set_read_only(target_bs, true, NULL);
> +    }
> +
>       bs_opaque->job = NULL;
>   
>       bdrv_drained_end(src);
> @@ -1715,6 +1747,7 @@ static BlockJob *mirror_start_job(
>                                bool is_none_mode, BlockDriverState *base,
>                                bool auto_complete, const char *filter_node_name,
>                                bool is_mirror, MirrorCopyMode copy_mode,
> +                             bool base_ro,
>                                Error **errp)
>   {
>       MirrorBlockJob *s;
> @@ -1798,6 +1831,7 @@ static BlockJob *mirror_start_job(
>       bdrv_unref(mirror_top_bs);
>   
>       s->mirror_top_bs = mirror_top_bs;
> +    s->base_ro = base_ro;
>   
>       /* No resize for the target either; while the mirror is still running, a
>        * consistent read isn't necessarily possible. We could possibly allow
> @@ -2027,7 +2061,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
>                        speed, granularity, buf_size, backing_mode, zero_target,
>                        on_source_error, on_target_error, unmap, NULL, NULL,
>                        &mirror_job_driver, is_none_mode, base, false,
> -                     filter_node_name, true, copy_mode, errp);
> +                     filter_node_name, true, copy_mode, false, errp);
>   }
>   
>   BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
> @@ -2056,7 +2090,7 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
>                        on_error, on_error, true, cb, opaque,
>                        &commit_active_job_driver, false, base, auto_complete,
>                        filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
> -                     errp);
> +                     base_read_only, errp);
>       if (!job) {
>           goto error_restore_flags;
>       }
Vladimir Sementsov-Ogievskiy Feb. 28, 2024, 4:48 p.m. UTC | #2
On 09.02.24 15:29, Alexander Ivanov wrote:
> Could you please review the patch?

Sorry for long delay.

Honestly, I don't like refcnt in block-driver. It violate incapsulation, refcnt is interal thing of common block layer. And actually, you can't make any assumptions from value of refcnt, as you don't know which additional parents were created and why, and when they are going unref their children.

What was wrong with v2?

> 
> On 1/30/24 10:14, Alexander Ivanov wrote:
>> If a blockcommit is aborted the base image remains in RW mode, that leads
>> to a fail of subsequent live migration.
>>
>> How to reproduce:
>>    $ virsh snapshot-create-as vm snp1 --disk-only
>>
>>    *** write something to the disk inside the guest ***
>>
>>    $ virsh blockcommit vm vda --active --shallow && virsh blockjob vm vda --abort
>>    $ lsof /vzt/vm.qcow2
>>    COMMAND      PID USER   FD   TYPE DEVICE   SIZE/OFF NODE NAME
>>    qemu-syst 433203 root   45u   REG  253,0 1724776448  133 /vzt/vm.qcow2
>>    $ cat /proc/433203/fdinfo/45
>>    pos:    0
>>    flags:  02140002 <==== The last 2 means RW mode
>>
>> If the base image is in RW mode at the end of blockcommit and was in RO
>> mode before blockcommit, check if src BDS has refcnt > 1. If so, the BDS
>> will not be removed after blockcommit, and we should make the base image
>> RO. Otherwise check recursively if there is a parent BDS of src BDS and
>> reopen the base BDS in RO in this case.
>>
>> Signed-off-by: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
>> ---
>>   block/mirror.c | 38 ++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 36 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/mirror.c b/block/mirror.c
>> index 5145eb53e1..52a7fee75e 100644
>> --- a/block/mirror.c
>> +++ b/block/mirror.c
>> @@ -93,6 +93,7 @@ typedef struct MirrorBlockJob {
>>       int64_t active_write_bytes_in_flight;
>>       bool prepared;
>>       bool in_drain;
>> +    bool base_ro;
>>   } MirrorBlockJob;
>>   typedef struct MirrorBDSOpaque {
>> @@ -652,6 +653,32 @@ static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s)
>>       }
>>   }
>> +/*
>> + * Check recursively if there is a parent BDS referenced more than
>> + * min_refcnt times. This argument is needed because at the first
>> + * call there is a bds referenced in blockcommit.
>> + */
>> +static bool bdrv_chain_has_significant_parent(BlockDriverState *bs)
>> +{
>> +    BdrvChild *parent;
>> +    BlockDriverState *parent_bs;
>> +
>> +    QLIST_FOREACH(parent, &bs->parents, next) {
>> +        if (!(parent->klass->parent_is_bds)) {
>> +            continue;
>> +        }
>> +        parent_bs = parent->opaque;
>> +        if (parent_bs->drv && !parent_bs->drv->is_filter) {
>> +            return true;
>> +        }
>> +        if (bdrv_chain_has_significant_parent(parent_bs)) {
>> +            return true;
>> +        }
>> +    }
>> +
>> +    return false;
>> +}
>> +
>>   /**
>>    * mirror_exit_common: handle both abort() and prepare() cases.
>>    * for .prepare, returns 0 on success and -errno on failure.
>> @@ -793,6 +820,11 @@ static int mirror_exit_common(Job *job)
>>       bdrv_drained_end(target_bs);
>>       bdrv_unref(target_bs);
>> +    if (s->base_ro && !bdrv_is_read_only(target_bs) &&
>> +        (src->refcnt > 1 || bdrv_chain_has_significant_parent(src))) {
>> +        bdrv_reopen_set_read_only(target_bs, true, NULL);
>> +    }
>> +
>>       bs_opaque->job = NULL;
>>       bdrv_drained_end(src);
>> @@ -1715,6 +1747,7 @@ static BlockJob *mirror_start_job(
>>                                bool is_none_mode, BlockDriverState *base,
>>                                bool auto_complete, const char *filter_node_name,
>>                                bool is_mirror, MirrorCopyMode copy_mode,
>> +                             bool base_ro,
>>                                Error **errp)
>>   {
>>       MirrorBlockJob *s;
>> @@ -1798,6 +1831,7 @@ static BlockJob *mirror_start_job(
>>       bdrv_unref(mirror_top_bs);
>>       s->mirror_top_bs = mirror_top_bs;
>> +    s->base_ro = base_ro;
>>       /* No resize for the target either; while the mirror is still running, a
>>        * consistent read isn't necessarily possible. We could possibly allow
>> @@ -2027,7 +2061,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
>>                        speed, granularity, buf_size, backing_mode, zero_target,
>>                        on_source_error, on_target_error, unmap, NULL, NULL,
>>                        &mirror_job_driver, is_none_mode, base, false,
>> -                     filter_node_name, true, copy_mode, errp);
>> +                     filter_node_name, true, copy_mode, false, errp);
>>   }
>>   BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
>> @@ -2056,7 +2090,7 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
>>                        on_error, on_error, true, cb, opaque,
>>                        &commit_active_job_driver, false, base, auto_complete,
>>                        filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
>> -                     errp);
>> +                     base_read_only, errp);
>>       if (!job) {
>>           goto error_restore_flags;
>>       }
>
Alexander Ivanov March 15, 2024, 9:55 a.m. UTC | #3
On 2/28/24 17:48, Vladimir Sementsov-Ogievskiy wrote:
> On 09.02.24 15:29, Alexander Ivanov wrote:
>> Could you please review the patch?
>
> Sorry for long delay.
>
> Honestly, I don't like refcnt in block-driver. It violate 
> incapsulation, refcnt is interal thing of common block layer. And 
> actually, you can't make any assumptions from value of refcnt, as you 
> don't know which additional parents were created and why, and when 
> they are going unref their children.
Hmmm... Maybe I can just exclude refcnt check from the condition, can't 
I. If BDS will be removed it doesn't matter if we make it RO. What do 
you think?
>
> What was wrong with v2?
My bad, it seems, I didn't send v2 before I decided to change the patch.
>
>>
>> On 1/30/24 10:14, Alexander Ivanov wrote:
>>> If a blockcommit is aborted the base image remains in RW mode, that 
>>> leads
>>> to a fail of subsequent live migration.
>>>
>>> How to reproduce:
>>>    $ virsh snapshot-create-as vm snp1 --disk-only
>>>
>>>    *** write something to the disk inside the guest ***
>>>
>>>    $ virsh blockcommit vm vda --active --shallow && virsh blockjob 
>>> vm vda --abort
>>>    $ lsof /vzt/vm.qcow2
>>>    COMMAND      PID USER   FD   TYPE DEVICE   SIZE/OFF NODE NAME
>>>    qemu-syst 433203 root   45u   REG  253,0 1724776448  133 
>>> /vzt/vm.qcow2
>>>    $ cat /proc/433203/fdinfo/45
>>>    pos:    0
>>>    flags:  02140002 <==== The last 2 means RW mode
>>>
>>> If the base image is in RW mode at the end of blockcommit and was in RO
>>> mode before blockcommit, check if src BDS has refcnt > 1. If so, the 
>>> BDS
>>> will not be removed after blockcommit, and we should make the base 
>>> image
>>> RO. Otherwise check recursively if there is a parent BDS of src BDS and
>>> reopen the base BDS in RO in this case.
>>>
>>> Signed-off-by: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
>>> ---
>>>   block/mirror.c | 38 ++++++++++++++++++++++++++++++++++++--
>>>   1 file changed, 36 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/block/mirror.c b/block/mirror.c
>>> index 5145eb53e1..52a7fee75e 100644
>>> --- a/block/mirror.c
>>> +++ b/block/mirror.c
>>> @@ -93,6 +93,7 @@ typedef struct MirrorBlockJob {
>>>       int64_t active_write_bytes_in_flight;
>>>       bool prepared;
>>>       bool in_drain;
>>> +    bool base_ro;
>>>   } MirrorBlockJob;
>>>   typedef struct MirrorBDSOpaque {
>>> @@ -652,6 +653,32 @@ static void coroutine_fn 
>>> mirror_wait_for_all_io(MirrorBlockJob *s)
>>>       }
>>>   }
>>> +/*
>>> + * Check recursively if there is a parent BDS referenced more than
>>> + * min_refcnt times. This argument is needed because at the first
>>> + * call there is a bds referenced in blockcommit.
>>> + */
>>> +static bool bdrv_chain_has_significant_parent(BlockDriverState *bs)
>>> +{
>>> +    BdrvChild *parent;
>>> +    BlockDriverState *parent_bs;
>>> +
>>> +    QLIST_FOREACH(parent, &bs->parents, next) {
>>> +        if (!(parent->klass->parent_is_bds)) {
>>> +            continue;
>>> +        }
>>> +        parent_bs = parent->opaque;
>>> +        if (parent_bs->drv && !parent_bs->drv->is_filter) {
>>> +            return true;
>>> +        }
>>> +        if (bdrv_chain_has_significant_parent(parent_bs)) {
>>> +            return true;
>>> +        }
>>> +    }
>>> +
>>> +    return false;
>>> +}
>>> +
>>>   /**
>>>    * mirror_exit_common: handle both abort() and prepare() cases.
>>>    * for .prepare, returns 0 on success and -errno on failure.
>>> @@ -793,6 +820,11 @@ static int mirror_exit_common(Job *job)
>>>       bdrv_drained_end(target_bs);
>>>       bdrv_unref(target_bs);
>>> +    if (s->base_ro && !bdrv_is_read_only(target_bs) &&
>>> +        (src->refcnt > 1 || bdrv_chain_has_significant_parent(src))) {
>>> +        bdrv_reopen_set_read_only(target_bs, true, NULL);
>>> +    }
>>> +
>>>       bs_opaque->job = NULL;
>>>       bdrv_drained_end(src);
>>> @@ -1715,6 +1747,7 @@ static BlockJob *mirror_start_job(
>>>                                bool is_none_mode, BlockDriverState 
>>> *base,
>>>                                bool auto_complete, const char 
>>> *filter_node_name,
>>>                                bool is_mirror, MirrorCopyMode 
>>> copy_mode,
>>> +                             bool base_ro,
>>>                                Error **errp)
>>>   {
>>>       MirrorBlockJob *s;
>>> @@ -1798,6 +1831,7 @@ static BlockJob *mirror_start_job(
>>>       bdrv_unref(mirror_top_bs);
>>>       s->mirror_top_bs = mirror_top_bs;
>>> +    s->base_ro = base_ro;
>>>       /* No resize for the target either; while the mirror is still 
>>> running, a
>>>        * consistent read isn't necessarily possible. We could 
>>> possibly allow
>>> @@ -2027,7 +2061,7 @@ void mirror_start(const char *job_id, 
>>> BlockDriverState *bs,
>>>                        speed, granularity, buf_size, backing_mode, 
>>> zero_target,
>>>                        on_source_error, on_target_error, unmap, 
>>> NULL, NULL,
>>>                        &mirror_job_driver, is_none_mode, base, false,
>>> -                     filter_node_name, true, copy_mode, errp);
>>> +                     filter_node_name, true, copy_mode, false, errp);
>>>   }
>>>   BlockJob *commit_active_start(const char *job_id, BlockDriverState 
>>> *bs,
>>> @@ -2056,7 +2090,7 @@ BlockJob *commit_active_start(const char 
>>> *job_id, BlockDriverState *bs,
>>>                        on_error, on_error, true, cb, opaque,
>>>                        &commit_active_job_driver, false, base, 
>>> auto_complete,
>>>                        filter_node_name, false, 
>>> MIRROR_COPY_MODE_BACKGROUND,
>>> -                     errp);
>>> +                     base_read_only, errp);
>>>       if (!job) {
>>>           goto error_restore_flags;
>>>       }
>>
>
Vladimir Sementsov-Ogievskiy March 15, 2024, 1:48 p.m. UTC | #4
On 15.03.24 12:55, Alexander Ivanov wrote:
> 
> 
> On 2/28/24 17:48, Vladimir Sementsov-Ogievskiy wrote:
>> On 09.02.24 15:29, Alexander Ivanov wrote:
>>> Could you please review the patch?
>>
>> Sorry for long delay.
>>
>> Honestly, I don't like refcnt in block-driver. It violate incapsulation, refcnt is interal thing of common block layer. And actually, you can't make any assumptions from value of refcnt, as you don't know which additional parents were created and why, and when they are going unref their children.
> Hmmm... Maybe I can just exclude refcnt check from the condition, can't I. If BDS will be removed it doesn't matter if we make it RO. What do you think?

Sounds good. I even don't see, why you need bdrv_chain_has_significant_parent() check. We just roll-back ro->rw transition on failure case, isn't just always correct thing to do?

>>
>> What was wrong with v2?
> My bad, it seems, I didn't send v2 before I decided to change the patch.

Hmm, somehow, I don't have it in my mailbox, but here it is: https://patchew.org/QEMU/20240109093128.157460-1-alexander.ivanov@virtuozzo.com/


===

More: in commit message you say about failure case. And it seems OK to roll-back ro->rw transition on failure, if we did it. But mirror_exit_common() called on success path too. I think, on success patch, we should do any additional reopenings?
diff mbox series

Patch

diff --git a/block/mirror.c b/block/mirror.c
index 5145eb53e1..52a7fee75e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -93,6 +93,7 @@  typedef struct MirrorBlockJob {
     int64_t active_write_bytes_in_flight;
     bool prepared;
     bool in_drain;
+    bool base_ro;
 } MirrorBlockJob;
 
 typedef struct MirrorBDSOpaque {
@@ -652,6 +653,32 @@  static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s)
     }
 }
 
+/*
+ * Check recursively if there is a parent BDS referenced more than
+ * min_refcnt times. This argument is needed because at the first
+ * call there is a bds referenced in blockcommit.
+ */
+static bool bdrv_chain_has_significant_parent(BlockDriverState *bs)
+{
+    BdrvChild *parent;
+    BlockDriverState *parent_bs;
+
+    QLIST_FOREACH(parent, &bs->parents, next) {
+        if (!(parent->klass->parent_is_bds)) {
+            continue;
+        }
+        parent_bs = parent->opaque;
+        if (parent_bs->drv && !parent_bs->drv->is_filter) {
+            return true;
+        }
+        if (bdrv_chain_has_significant_parent(parent_bs)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
 /**
  * mirror_exit_common: handle both abort() and prepare() cases.
  * for .prepare, returns 0 on success and -errno on failure.
@@ -793,6 +820,11 @@  static int mirror_exit_common(Job *job)
     bdrv_drained_end(target_bs);
     bdrv_unref(target_bs);
 
+    if (s->base_ro && !bdrv_is_read_only(target_bs) &&
+        (src->refcnt > 1 || bdrv_chain_has_significant_parent(src))) {
+        bdrv_reopen_set_read_only(target_bs, true, NULL);
+    }
+
     bs_opaque->job = NULL;
 
     bdrv_drained_end(src);
@@ -1715,6 +1747,7 @@  static BlockJob *mirror_start_job(
                              bool is_none_mode, BlockDriverState *base,
                              bool auto_complete, const char *filter_node_name,
                              bool is_mirror, MirrorCopyMode copy_mode,
+                             bool base_ro,
                              Error **errp)
 {
     MirrorBlockJob *s;
@@ -1798,6 +1831,7 @@  static BlockJob *mirror_start_job(
     bdrv_unref(mirror_top_bs);
 
     s->mirror_top_bs = mirror_top_bs;
+    s->base_ro = base_ro;
 
     /* No resize for the target either; while the mirror is still running, a
      * consistent read isn't necessarily possible. We could possibly allow
@@ -2027,7 +2061,7 @@  void mirror_start(const char *job_id, BlockDriverState *bs,
                      speed, granularity, buf_size, backing_mode, zero_target,
                      on_source_error, on_target_error, unmap, NULL, NULL,
                      &mirror_job_driver, is_none_mode, base, false,
-                     filter_node_name, true, copy_mode, errp);
+                     filter_node_name, true, copy_mode, false, errp);
 }
 
 BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
@@ -2056,7 +2090,7 @@  BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
                      on_error, on_error, true, cb, opaque,
                      &commit_active_job_driver, false, base, auto_complete,
                      filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
-                     errp);
+                     base_read_only, errp);
     if (!job) {
         goto error_restore_flags;
     }