diff mbox

[v7,11/14] block/backup: support block job transactions

Message ID 1442889976-8733-12-git-send-email-famz@redhat.com
State New
Headers show

Commit Message

Fam Zheng Sept. 22, 2015, 2:46 a.m. UTC
From: Stefan Hajnoczi <stefanha@redhat.com>

Join the transaction when the 'transactional-cancel' QMP argument is
true.

This ensures that the sync bitmap is not thrown away if another block
job in the transaction is cancelled or fails.  This is critical so
incremental backup with multiple disks can be retried in case of
cancellation/failure.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
---
 block/backup.c            |  25 +++++++--
 blockdev.c                | 132 +++++++++++++++++++++++++++++++++++-----------
 include/block/block_int.h |   3 +-
 qapi-schema.json          |   4 +-
 qapi/block-core.json      |  27 +++++++++-
 5 files changed, 152 insertions(+), 39 deletions(-)

Comments

Eric Blake Sept. 22, 2015, 4:03 p.m. UTC | #1
On 09/21/2015 08:46 PM, Fam Zheng wrote:
> From: Stefan Hajnoczi <stefanha@redhat.com>
> 
> Join the transaction when the 'transactional-cancel' QMP argument is
> true.
> 
> This ensures that the sync bitmap is not thrown away if another block
> job in the transaction is cancelled or fails.  This is critical so
> incremental backup with multiple disks can be retried in case of
> cancellation/failure.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> Reviewed-by: Max Reitz <mreitz@redhat.com>
> Reviewed-by: Eric Blake <eblake@redhat.com>
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---

> +++ b/qapi/block-core.json
> @@ -735,7 +735,6 @@
>  # @on-target-error: #optional the action to take on an error on the target,
>  #                   default 'report' (no limitations, since this applies to
>  #                   a different block device than @device).
> -#
>  # Note that @on-source-error and @on-target-error only affect background I/O.
>  # If an error occurs during a guest write request, the device's rerror/werror
>  # actions will be used.

Spurious line deletion when addressing review comment. Maintainer can
touch it up for the pull request, so my R-b still stands.
John Snow Sept. 22, 2015, 9:08 p.m. UTC | #2
Eric, Markus: I've CC'd you for some crazy what-if QAPI questions after
my R-B on this patch.

On 09/21/2015 10:46 PM, Fam Zheng wrote:
> From: Stefan Hajnoczi <stefanha@redhat.com>
> 
> Join the transaction when the 'transactional-cancel' QMP argument is
> true.
> 
> This ensures that the sync bitmap is not thrown away if another block
> job in the transaction is cancelled or fails.  This is critical so
> incremental backup with multiple disks can be retried in case of
> cancellation/failure.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> Reviewed-by: Max Reitz <mreitz@redhat.com>
> Reviewed-by: Eric Blake <eblake@redhat.com>
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>  block/backup.c            |  25 +++++++--
>  blockdev.c                | 132 +++++++++++++++++++++++++++++++++++-----------
>  include/block/block_int.h |   3 +-
>  qapi-schema.json          |   4 +-
>  qapi/block-core.json      |  27 +++++++++-
>  5 files changed, 152 insertions(+), 39 deletions(-)
> 
> diff --git a/block/backup.c b/block/backup.c
> index 609b199..5880116 100644
> --- a/block/backup.c
> +++ b/block/backup.c
> @@ -227,11 +227,29 @@ static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
>      }
>  }
>  
> +static void backup_commit(BlockJob *job)
> +{
> +    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
> +    if (s->sync_bitmap) {
> +        backup_cleanup_sync_bitmap(s, 0);
> +    }
> +}
> +
> +static void backup_abort(BlockJob *job)
> +{
> +    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
> +    if (s->sync_bitmap) {
> +        backup_cleanup_sync_bitmap(s, -1);
> +    }
> +}
> +
>  static const BlockJobDriver backup_job_driver = {
>      .instance_size  = sizeof(BackupBlockJob),
>      .job_type       = BLOCK_JOB_TYPE_BACKUP,
>      .set_speed      = backup_set_speed,
>      .iostatus_reset = backup_iostatus_reset,
> +    .commit         = backup_commit,
> +    .abort          = backup_abort,
>  };
>  
>  static BlockErrorAction backup_error_action(BackupBlockJob *job,
> @@ -444,10 +462,6 @@ static void coroutine_fn backup_run(void *opaque)
>      /* wait until pending backup_do_cow() calls have completed */
>      qemu_co_rwlock_wrlock(&job->flush_rwlock);
>      qemu_co_rwlock_unlock(&job->flush_rwlock);
> -
> -    if (job->sync_bitmap) {
> -        backup_cleanup_sync_bitmap(job, ret);
> -    }
>      hbitmap_free(job->bitmap);
>  
>      bdrv_iostatus_disable(target);
> @@ -464,7 +478,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
>                    BlockdevOnError on_source_error,
>                    BlockdevOnError on_target_error,
>                    BlockCompletionFunc *cb, void *opaque,
> -                  Error **errp)
> +                  BlockJobTxn *txn, Error **errp)
>  {
>      int64_t len;
>  
> @@ -546,6 +560,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
>                         sync_bitmap : NULL;
>      job->common.len = len;
>      job->common.co = qemu_coroutine_create(backup_run);
> +    block_job_txn_add_job(txn, &job->common);
>      qemu_coroutine_enter(job->common.co, job);
>      return;
>  

So a backup job will join a transaction unconditionally if it is given one.

> diff --git a/blockdev.c b/blockdev.c
> index ed50cb4..45a9fe7 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -1586,16 +1586,31 @@ typedef struct DriveBackupState {
>      BlockJob *job;
>  } DriveBackupState;
>  
> +static void do_drive_backup(const char *device, const char *target,
> +                            bool has_format, const char *format,
> +                            enum MirrorSyncMode sync,
> +                            bool has_mode, enum NewImageMode mode,
> +                            bool has_speed, int64_t speed,
> +                            bool has_bitmap, const char *bitmap,
> +                            bool has_on_source_error,
> +                            BlockdevOnError on_source_error,
> +                            bool has_on_target_error,
> +                            BlockdevOnError on_target_error,
> +                            BlockJobTxn *txn, Error **errp);
> +
>  static void drive_backup_prepare(BlkActionState *common, Error **errp)
>  {
>      DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common);
>      BlockDriverState *bs;
>      BlockBackend *blk;
> +    DriveBackupTxn *backup_txn;
>      DriveBackup *backup;
> +    BlockJobTxn *txn = NULL;
>      Error *local_err = NULL;
>  
>      assert(common->action->kind == TRANSACTION_ACTION_KIND_DRIVE_BACKUP);
> -    backup = common->action->drive_backup;
> +    backup_txn = common->action->drive_backup;
> +    backup = backup_txn->base;
>  
>      blk = blk_by_name(backup->device);
>      if (!blk) {
> @@ -1609,15 +1624,20 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp)
>      state->aio_context = bdrv_get_aio_context(bs);
>      aio_context_acquire(state->aio_context);
>  
> -    qmp_drive_backup(backup->device, backup->target,
> -                     backup->has_format, backup->format,
> -                     backup->sync,
> -                     backup->has_mode, backup->mode,
> -                     backup->has_speed, backup->speed,
> -                     backup->has_bitmap, backup->bitmap,
> -                     backup->has_on_source_error, backup->on_source_error,
> -                     backup->has_on_target_error, backup->on_target_error,
> -                     &local_err);
> +    if (backup_txn->has_transactional_cancel &&
> +        backup_txn->transactional_cancel) {
> +        txn = common->block_job_txn;
> +    }
> +
> +    do_drive_backup(backup->device, backup->target,
> +                    backup->has_format, backup->format,
> +                    backup->sync,
> +                    backup->has_mode, backup->mode,
> +                    backup->has_speed, backup->speed,
> +                    backup->has_bitmap, backup->bitmap,
> +                    backup->has_on_source_error, backup->on_source_error,
> +                    backup->has_on_target_error, backup->on_target_error,
> +                    txn, &local_err);
>      if (local_err) {
>          error_propagate(errp, local_err);
>          return;

So far so good. "has_transactional_cancel" can be set per command,
so we have fine-grained control over which commands in a transaction
we want to fail or die together, which is nice.

> @@ -1654,16 +1674,28 @@ typedef struct BlockdevBackupState {
>      AioContext *aio_context;
>  } BlockdevBackupState;
>  
> +static void do_blockdev_backup(const char *device, const char *target,
> +                               enum MirrorSyncMode sync,
> +                               bool has_speed, int64_t speed,
> +                               bool has_on_source_error,
> +                               BlockdevOnError on_source_error,
> +                               bool has_on_target_error,
> +                               BlockdevOnError on_target_error,
> +                               BlockJobTxn *txn, Error **errp);
> +
>  static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
>  {
>      BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common);
> +    BlockdevBackupTxn *backup_txn;
>      BlockdevBackup *backup;
>      BlockDriverState *bs, *target;
>      BlockBackend *blk;
> +    BlockJobTxn *txn = NULL;
>      Error *local_err = NULL;
>  
>      assert(common->action->kind == TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP);
> -    backup = common->action->blockdev_backup;
> +    backup_txn = common->action->blockdev_backup;
> +    backup = backup_txn->base;
>  
>      blk = blk_by_name(backup->device);
>      if (!blk) {
> @@ -1688,12 +1720,17 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
>      }
>      aio_context_acquire(state->aio_context);
>  
> -    qmp_blockdev_backup(backup->device, backup->target,
> -                        backup->sync,
> -                        backup->has_speed, backup->speed,
> -                        backup->has_on_source_error, backup->on_source_error,
> -                        backup->has_on_target_error, backup->on_target_error,
> -                        &local_err);
> +    if (backup_txn->has_transactional_cancel &&
> +        backup_txn->transactional_cancel) {
> +        txn = common->block_job_txn;
> +    }
> +
> +    do_blockdev_backup(backup->device, backup->target,
> +                       backup->sync,
> +                       backup->has_speed, backup->speed,
> +                       backup->has_on_source_error, backup->on_source_error,
> +                       backup->has_on_target_error, backup->on_target_error,
> +                       txn, &local_err);

Might as well take care of blockdev-backup while we're at it, sure.

>      if (local_err) {
>          error_propagate(errp, local_err);
>          return;
> @@ -2573,15 +2610,17 @@ out:
>      aio_context_release(aio_context);
>  }
>  
> -void qmp_drive_backup(const char *device, const char *target,
> -                      bool has_format, const char *format,
> -                      enum MirrorSyncMode sync,
> -                      bool has_mode, enum NewImageMode mode,
> -                      bool has_speed, int64_t speed,
> -                      bool has_bitmap, const char *bitmap,
> -                      bool has_on_source_error, BlockdevOnError on_source_error,
> -                      bool has_on_target_error, BlockdevOnError on_target_error,
> -                      Error **errp)
> +static void do_drive_backup(const char *device, const char *target,
> +                            bool has_format, const char *format,
> +                            enum MirrorSyncMode sync,
> +                            bool has_mode, enum NewImageMode mode,
> +                            bool has_speed, int64_t speed,
> +                            bool has_bitmap, const char *bitmap,
> +                            bool has_on_source_error,
> +                            BlockdevOnError on_source_error,
> +                            bool has_on_target_error,
> +                            BlockdevOnError on_target_error,
> +                            BlockJobTxn *txn, Error **errp)
>  {
>      BlockBackend *blk;
>      BlockDriverState *bs;
> @@ -2696,7 +2735,7 @@ void qmp_drive_backup(const char *device, const char *target,
>  
>      backup_start(bs, target_bs, speed, sync, bmap,
>                   on_source_error, on_target_error,
> -                 block_job_cb, bs, &local_err);
> +                 block_job_cb, bs, txn, &local_err);
>      if (local_err != NULL) {
>          bdrv_unref(target_bs);
>          error_propagate(errp, local_err);
> @@ -2707,19 +2746,37 @@ out:
>      aio_context_release(aio_context);
>  }
>  
> +void qmp_drive_backup(const char *device, const char *target,
> +                      bool has_format, const char *format,
> +                      enum MirrorSyncMode sync,
> +                      bool has_mode, enum NewImageMode mode,
> +                      bool has_speed, int64_t speed,
> +                      bool has_bitmap, const char *bitmap,
> +                      bool has_on_source_error, BlockdevOnError on_source_error,
> +                      bool has_on_target_error, BlockdevOnError on_target_error,
> +                      Error **errp)
> +{
> +    return do_drive_backup(device, target, has_format, format, sync,
> +                           has_mode, mode, has_speed, speed,
> +                           has_bitmap, bitmap,
> +                           has_on_source_error, on_source_error,
> +                           has_on_target_error, on_target_error,
> +                           NULL, errp);
> +}
> +
>  BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp)
>  {
>      return bdrv_named_nodes_list(errp);
>  }
>  
> -void qmp_blockdev_backup(const char *device, const char *target,
> +void do_blockdev_backup(const char *device, const char *target,
>                           enum MirrorSyncMode sync,
>                           bool has_speed, int64_t speed,
>                           bool has_on_source_error,
>                           BlockdevOnError on_source_error,
>                           bool has_on_target_error,
>                           BlockdevOnError on_target_error,
> -                         Error **errp)
> +                         BlockJobTxn *txn, Error **errp)
>  {
>      BlockBackend *blk;
>      BlockDriverState *bs;
> @@ -2757,7 +2814,7 @@ void qmp_blockdev_backup(const char *device, const char *target,
>      bdrv_ref(target_bs);
>      bdrv_set_aio_context(target_bs, aio_context);
>      backup_start(bs, target_bs, speed, sync, NULL, on_source_error,
> -                 on_target_error, block_job_cb, bs, &local_err);
> +                 on_target_error, block_job_cb, bs, txn, &local_err);
>      if (local_err != NULL) {
>          bdrv_unref(target_bs);
>          error_propagate(errp, local_err);
> @@ -2766,6 +2823,21 @@ out:
>      aio_context_release(aio_context);
>  }
>  
> +void qmp_blockdev_backup(const char *device, const char *target,
> +                         enum MirrorSyncMode sync,
> +                         bool has_speed, int64_t speed,
> +                         bool has_on_source_error,
> +                         BlockdevOnError on_source_error,
> +                         bool has_on_target_error,
> +                         BlockdevOnError on_target_error,
> +                         Error **errp)
> +{
> +    do_blockdev_backup(device, target, sync, has_speed, speed,
> +                       has_on_source_error, on_source_error,
> +                       has_on_target_error, on_target_error,
> +                       NULL, errp);
> +}
> +
>  void qmp_drive_mirror(const char *device, const char *target,
>                        bool has_format, const char *format,
>                        bool has_node_name, const char *node_name,

All looks good so far.

> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 281d790..6fd07a2 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -643,6 +643,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
>   * @on_target_error: The action to take upon error writing to the target.
>   * @cb: Completion function for the job.
>   * @opaque: Opaque pointer value passed to @cb.
> + * @txn: Transaction that this job is part of (may be NULL).
>   *
>   * Start a backup operation on @bs.  Clusters in @bs are written to @target
>   * until the job is cancelled or manually completed.
> @@ -653,7 +654,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
>                    BlockdevOnError on_source_error,
>                    BlockdevOnError on_target_error,
>                    BlockCompletionFunc *cb, void *opaque,
> -                  Error **errp);
> +                  BlockJobTxn *txn, Error **errp);
>  
>  void blk_dev_change_media_cb(BlockBackend *blk, bool load);
>  bool blk_dev_has_removable_media(BlockBackend *blk);
> diff --git a/qapi-schema.json b/qapi-schema.json
> index 583e036..6641be3 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -1505,8 +1505,8 @@
>  { 'union': 'TransactionAction',
>    'data': {
>         'blockdev-snapshot-sync': 'BlockdevSnapshot',
> -       'drive-backup': 'DriveBackup',
> -       'blockdev-backup': 'BlockdevBackup',
> +       'drive-backup': 'DriveBackupTxn',
> +       'blockdev-backup': 'BlockdevBackupTxn',
>         'abort': 'Abort',
>         'blockdev-snapshot-internal-sync': 'BlockdevSnapshotInternal',
>         'block-dirty-bitmap-add': 'BlockDirtyBitmapAdd',
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index bb2189e..24db5c2 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -735,7 +735,6 @@
>  # @on-target-error: #optional the action to take on an error on the target,
>  #                   default 'report' (no limitations, since this applies to
>  #                   a different block device than @device).
> -#

(Is this intentional?)

>  # Note that @on-source-error and @on-target-error only affect background I/O.
>  # If an error occurs during a guest write request, the device's rerror/werror
>  # actions will be used.
> @@ -750,6 +749,19 @@
>              '*on-target-error': 'BlockdevOnError' } }
>  
>  ##
> +# @DriveBackupTxn
> +#
> +# @transactional-cancel: #optional whether failure or cancellation of other
> +#                        block jobs with @transactional-cancel true in the same
> +#                        transaction causes the whole group to cancel. Default
> +#                        is false.
> +#
> +# Since: 2.5
> +##
> +{ 'struct': 'DriveBackupTxn', 'base': 'DriveBackup',
> +  'data': {'*transactional-cancel': 'bool' } }
> +
> +##
>  # @BlockdevBackup
>  #
>  # @device: the name of the device which should be copied.
> @@ -785,6 +797,19 @@
>              '*on-target-error': 'BlockdevOnError' } }
>  
>  ##
> +# @BlockdevBackupTxn
> +#
> +# @transactional-cancel: #optional whether failure or cancellation of other
> +#                        block jobs with @transactional-cancel true in the same
> +#                        transaction causes the whole group to cancel. Default
> +#                        is false
> +#
> +# Since: 2.5
> +##
> +{ 'struct': 'BlockdevBackupTxn', 'base': 'BlockdevBackup',
> +  'data': {'*transactional-cancel': 'bool' } }
> +
> +##
>  # @blockdev-snapshot-sync
>  #
>  # Generates a synchronous snapshot of a block device.
> 

Going to go ahead and say "OK, looks good:"

Reviewed-by: John Snow <jsnow@redhat.com>

Good, that's out of the way. Let's take a deep breath now and read some
crazy what-if scenarios. I still have some idle questions before 2.5
freeze hits us... Eric, Markus:

The current design of this patch is such that the blockdev-backup and
drive-backup QMP commands are extended for use in the transaction
action. This means that as part of the arguments for the action, we can
specify "transactional-cancel" as on/off, defaulting to off. This
maintains backwards compatible behavior.


As an example of a lone command (for simplicity...)

{ "execute": "transaction",
  "arguments": {
    "actions": [
      {"type": "drive-backup",
       "data": {"device": "drive0",
                "target": "/path/to/new_full_backup.img",
                "sync": "full", "format": "qcow2",
		"transactional-cancel": true
               }
      }
    ]
  }
}

This means that this command should fail if any of the other block jobs
in the transaction (that have also set transactional-cancel(!)) fail.

This is nice because it allows us to specify per-action which actions
should live or die on the success of the transaction as a whole.

What I was wondering is if we wanted to pigeon-hole ourselves like this
by adding arguments per-action and instead opt for a more generic,
transaction-wide setting.

In my mind, the transactional cancel has nothing to do with each
/specific/ action, but has more to do with a property of transactions
and actions in general.


So I was thinking of two things:

(1) Transactional cancel, while false by default, could be toggled to
true by default for an entire transaction all-at-once

{ "execute": "transaction",
  "arguments": {
    "actions": [ ... ],
    "transactional-cancel": true
  }
}

This would toggle the default state for all actions to "fail if anything
else in the transaction does."

Of course, as of right now, not all actions actually support this
feature, but they might need to in the future -- and as more and more
actions use this feature, it might be nice to have the global setting.

If "transactional-cancel" is specified and true (i.e. not the
default) and actions are provided that do not support the argument
explicitly, the transaction command itself should fail up-front.

(2) Overridable per-action settings as a property of "action"

Instead of adding the argument to each action, bake it in as a property
of the action itself. The original idea behind this was to decouple it
from the QMP arguments definition, but this patch here also accomplishes
this -- at the expense of subclassing each QMP argument set manually.

Something like this is what I was originally thinking of doing:

{ "execute": "transaction",
  "arguments": {
    "actions": [
      { "type": "drive-backup",
        "data": { ... },
        "properties": { "transactional-cancel": true }
      }
    ]
  }
}

In light of how Fam and Eric solved the problem of "not polluting the
QMP arguments" for this patch, #2 is perhaps less pressing or interesting.

A benefit, though, is that we can define a set of generic transactional
action properties that all actions can inspect to adjust their behavior
without us needing to specify additional argument subclasses in the QAPI
every time.

Mostly, at this point, I want to ask if we are OK with adding the
"transactional-cancel" argument ad-hoc per-action forever, or if we'd
like to allow a global setting or move the property "up the stack" to
some extent.

Maybe the answer is "no," but I wanted to throw the idea out there
before we committed to an API.

Sorry for the wall'o'text :)

--js
Eric Blake Sept. 22, 2015, 10:34 p.m. UTC | #3
On 09/22/2015 03:08 PM, John Snow wrote:
> Eric, Markus: I've CC'd you for some crazy what-if QAPI questions after
> my R-B on this patch.
> 

> The current design of this patch is such that the blockdev-backup and
> drive-backup QMP commands are extended for use in the transaction
> action. This means that as part of the arguments for the action, we can
> specify "transactional-cancel" as on/off, defaulting to off. This
> maintains backwards compatible behavior.
> 
> 
> As an example of a lone command (for simplicity...)
> 
> { "execute": "transaction",
>   "arguments": {
>     "actions": [
>       {"type": "drive-backup",
>        "data": {"device": "drive0",
>                 "target": "/path/to/new_full_backup.img",
>                 "sync": "full", "format": "qcow2",
> 		"transactional-cancel": true
>                }
>       }
>     ]
>   }
> }
> 
> This means that this command should fail if any of the other block jobs
> in the transaction (that have also set transactional-cancel(!)) fail.

Just wondering - is there ever a case where someone would want to create
a transaction with multiple sub-groups?  That is, I want actions A, B,
C, D to all occur at the same point in time, but with grouping [A,B] and
[C, D] (so that if A fails B gets cancelled, but C and D can still
continue)?  Worse, is there a series of actions to accomplish something
that cannot happen unless it is interleaved across multiple such subgroups?

Here's hoping that, for design simplicity, we only ever need one
subgroup per 'transaction' (auto-cancel semantics applies to all
commands in the group that opted in, with no way to further refine into
sub-groups of commands).

> 
> This is nice because it allows us to specify per-action which actions
> should live or die on the success of the transaction as a whole.
> 
> What I was wondering is if we wanted to pigeon-hole ourselves like this
> by adding arguments per-action and instead opt for a more generic,
> transaction-wide setting.
> 
> In my mind, the transactional cancel has nothing to do with each
> /specific/ action, but has more to do with a property of transactions
> and actions in general.
> 
> 
> So I was thinking of two things:
> 
> (1) Transactional cancel, while false by default, could be toggled to
> true by default for an entire transaction all-at-once
> 
> { "execute": "transaction",
>   "arguments": {
>     "actions": [ ... ],
>     "transactional-cancel": true
>   }
> }
> 
> This would toggle the default state for all actions to "fail if anything
> else in the transaction does."

Or worded in another way - what is the use case for having a batch of
actions where some commands are grouped-cancel, but the remaining
actions are independent?  Is there ever a case where you would supply
"transactional-cancel":true to one action, but not all of them?  If not,
then this idea of hoisting the bool to the transaction arguments level
makes more sense (it's an all-or-none switch, not a per-action switch).

Qapi-wise, here's how you would do this:

{ 'command': 'transaction',
  'data': { 'actions': [ 'TransactionAction' ],
            '*transactional-cancel': 'bool' } }

> 
> Of course, as of right now, not all actions actually support this
> feature, but they might need to in the future -- and as more and more
> actions use this feature, it might be nice to have the global setting.
> 
> If "transactional-cancel" was specified and is true (i.e. not the
> default) and actions are provided that do not support the argument
> explicitly, the transaction command itself should fail up-front.

This actually makes sense to me, and might be simpler to implement.

> 
> (2) Overridable per-action settings as a property of "action"
> 
> Instead of adding the argument per-each action, bake it in as a property
> of the action itself. The original idea behind this was to decouple it
> from the QMP arguments definition, but this patch here also accomplishes
> this -- at the expense of subclassing each QMP argument set manually.
> 
> Something like this is what I was originally thinking of doing:
> 
> { "execute": "transaction",
>   "arguments": {
>     "actions": [
>       { "type": "drive-backup",
>         "data": { ... },
>         "properties": { "transactional-cancel": true }
>       }
>     ]
>   }
> }
> 
> In light of how Fam and Eric solved the problem of "not polluting the
> QMP arguments" for this patch, #2 is perhaps less pressing or interesting.
> 
> A benefit, though, is that we can define a set of generic transactional
> action properties that all actions can inspect to adjust their behavior
> without us needing to specify additional argument subclasses in the QAPI
> every time.

And here's how we'd do it in qapi.  We'd have to convert
TransactionAction from simple union into flat union (here, using syntax
that doesn't work on qemu.git mainline, but requires my v5 00/46
mega-patch of introspection followups - hmm, it's still verbose, so
maybe we want to invent yet more sugar to avoid having to declare all
those wrapper types):

{ 'enum': 'TransactionActionKind', 'data': [
  'blockdev-snapshot-sync', 'drive-backup', 'blockdev-backup',
  'abort', 'blockdev-snapshot-internal-sync' ] }
{ 'struct': 'TransactionProperties',
  'data': { '*transactional-cancel': 'bool' } }
{ 'struct': 'BlockdevSnapshotSyncWrapper',
  'data': { 'data': 'BlockdevSnapshotSync' } }
{ 'union': 'TransactionAction',
  'base': { 'type': 'TransactionActionKind',
            '*properties': 'TransactionProperties'},
  'discriminator': 'type',
  'data': { 'blockdev-snapshot-sync': 'BlockdevSnapshotSyncWrapper',
            ... } }

> 
> Mostly, at this point, I want to ask if we are OK with adding the
> "transactional-cancel" argument ad-hoc per-action forever, or if we'd
> like to allow a global setting or move the property "up the stack" to
> some extent.
> 
> Maybe the answer is "no," but I wanted to throw the idea out there
> before we committed to an API.

No, it's a good question. And adding it once at the 'transaction' layer
or in the 'TransactionAction' union may indeed make more sense from the
design perspective, rather than ad-hoc adding to each (growing) member
of the union.
John Snow Sept. 22, 2015, 11:27 p.m. UTC | #4
On 09/22/2015 06:34 PM, Eric Blake wrote:
> On 09/22/2015 03:08 PM, John Snow wrote:
>> Eric, Markus: I've CC'd you for some crazy what-if QAPI questions
>> after my R-B on this patch.
>> 
> 
>> The current design of this patch is such that the blockdev-backup
>> and drive-backup QMP commands are extended for use in the
>> transaction action. This means that as part of the arguments for
>> the action, we can specify "transactional-cancel" as on/off,
>> defaulting to off. This maintains backwards compatible behavior.
>> 
>> 
>> As an example of a lone command (for simplicity...)
>> 
>> { "execute": "transaction", "arguments": { "actions": [ {"type":
>> "drive-backup", "data": {"device": "drive0", "target":
>> "/path/to/new_full_backup.img", "sync": "full", "format":
>> "qcow2", "transactional-cancel": true } } ] } }
>> 
>> This means that this command should fail if any of the other
>> block jobs in the transaction (that have also set
>> transactional-cancel(!)) fail.
> 
> Just wondering - is there ever a case where someone would want to
> create a transaction with multiple sub-groups?  That is, I want
> actions A, B, C, D to all occur at the same point in time, but with
> grouping [A,B] and [C, D] (so that if A fails B gets cancelled, but
> C and D can still

Only two sub-groups:

(A) actions that live and die together
(B) actions that proceed no matter what.

The only reason we even allow these two is because the default
behavior has been B historically, so we need to allow that to continue on.

I don't think we need to architect multiple subgroups of "live and die
together" semantics.

> continue)?  Worse, is there a series of actions to accomplish
> something that cannot happen unless it is interleaved across
> multiple such subgroups?
> 

Not sure we'll need to address this.

> Here's hoping that, for design simplicity, we only ever need one 
> subgroup per 'transaction' (auto-cancel semantics applies to all 
> commands in the group that opted in, with no way to further refine
> into sub-groups of commands).
> 

I think this is correct.

>> 
>> This is nice because it allows us to specify per-action which
>> actions should live or die on the success of the transaction as a
>> whole.
>> 
>> What I was wondering is if we wanted to pigeonhole ourselves
>> like this by adding arguments per-action and instead opt for a
>> more generic, transaction-wide setting.
>> 
>> In my mind, the transactional cancel has nothing to do with each 
>> /specific/ action, but has more to do with a property of
>> transactions and actions in general.
>> 
>> 
>> So I was thinking of two things:
>> 
>> (1) Transactional cancel, while false by default, could be
>> toggled to true by default for an entire transaction all-at-once
>> 
>> { "execute": "transaction", "arguments": { "actions": [ ... ], 
>> "transactional-cancel": true } }
>> 
>> This would toggle the default state for all actions to "fail if
>> anything else in the transaction does."
> 
> Or worded in another way - what is the use case for having a batch
> of actions where some commands are grouped-cancel, but the
> remaining actions are independent?  Is there ever a case where you
> would supply "transactional-cancel":true to one action, but not all
> of them?  If not, then this idea of hoisting the bool to the
> transaction arguments level makes more sense (it's an all-or-none
> switch, not a per-action switch).
> 
> Qapi-wise, here's how you would do this:
> 
> { 'command': 'transaction', 'data': { 'actions': [
> 'TransactionAction' ], '*transactional-cancel': 'bool' } }
> 

Right. If there's no need for us to specify per-action settings at
all, this becomes the obvious "correct" solution.

If we do want to be able to specify this sub-grouping per-action (for
whatever reason), this might still be nice as a convenience feature.

>> 
>> Of course, as of right now, not all actions actually support
>> this feature, but they might need to in the future -- and as more
>> and more actions use this feature, it might be nice to have the
>> global setting.
>> 
>> If "transactional-cancel" was specified and is true (i.e. not
>> the default) and actions are provided that do not support the
>> argument explicitly, the transaction command itself should fail
>> up-front.
> 
> This actually makes sense to me, and might be simpler to
> implement.
> 

I'm not sure how to implement this, per se, but in my mind it's
something like:

- A property of the transaction gets set (transactional-cancel) in
qmp_transaction.
- Each action has to prove that it has retrieved this value
	- drive-backup of course actually uses it,
	- other commands might fetch it to intentionally ignore it
	  (i.e. if cancel y/n would have no effect)
- If an action does not fetch this property, the transactional setup
flags an error and aborts the transaction with an error
	- e.g. "Attempted to use property unsupported by action ..."

With perhaps an allowance that, as long as the property is set to
default, actions aren't required to check it.

>> 
>> (2) Overridable per-action settings as a property of "action"
>> 
>> Instead of adding the argument per-each action, bake it in as a
>> property of the action itself. The original idea behind this was
>> to decouple it from the QMP arguments definition, but this patch
>> here also accomplishes this -- at the expense of subclassing each
>> QMP argument set manually.
>> 
>> Something like this is what I was originally thinking of doing:
>> 
>> { "execute": "transaction", "arguments": { "actions": [ { "type":
>> "drive-backup", "data": { ... }, "properties": {
>> "transactional-cancel": true } } ] } }
>> 
>> In light of how Fam and Eric solved the problem of "not polluting
>> the QMP arguments" for this patch, #2 is perhaps less pressing or
>> interesting.
>> 
>> A benefit, though, is that we can define a set of generic
>> transactional action properties that all actions can inspect to
>> adjust their behavior without us needing to specify additional
>> argument subclasses in the QAPI every time.
> 
> And here's how we'd do it in qapi.  We'd have to convert 
> TransactionAction from simple union into flat union (here, using
> syntax that doesn't work on qemu.git mainline, but requires my v5
> 00/46 mega-patch of introspection followups - hmm, it's still
> verbose, so maybe we want to invent yet more sugar to avoid having
> to declare all those wrapper types):
> 
> { 'enum': 'TransactionActionKind', 'data': [ 
> 'blockdev-snapshot-sync', 'drive-backup', 'blockdev-backup', 
> 'abort', 'blockdev-snapshot-internal-sync' ] } { 'struct':
> 'TransactionProperties', 'data': { '*transactional-cancel': 'bool'
> } } { 'struct': 'BlockdevSnapshotSyncWrapper', 'data': { 'data':
> 'BlockdevSnapshotSync' } } { 'union': 'TransactionAction', 'base':
> { 'type': 'TransactionActionKind', '*properties':
> 'TransactionProperties'}, 'discriminator': 'type', 'data': {
> 'blockdev-snapshot-sync': 'BlockdevSnapshotSyncWrapper', ... } }
> 
>> 
>> Mostly, at this point, I want to ask if we are OK with adding
>> the "transactional-cancel" argument ad-hoc per-action forever, or
>> if we'd like to allow a global setting or move the property "up
>> the stack" to some extent.
>> 
>> Maybe the answer is "no," but I wanted to throw the idea out
>> there before we committed to an API.
> 
> No, it's a good question. And adding it once at the 'transaction'
> layer or in the 'TransactionAction' union may indeed make more
> sense from the design perspective, rather than ad-hoc adding to
> each (growing) member of the union.
> 

#2 was just another method of hoisting the QAPI arguments that are
transaction-specific away from the QMP arguments, and won't
necessarily be needed in conjunction with #1.

--js
Markus Armbruster Sept. 23, 2015, 11:09 a.m. UTC | #5
John, your MUA turned the QMP examples to mush.  You may want to teach
it manners.

John Snow <jsnow@redhat.com> writes:

> On 09/22/2015 06:34 PM, Eric Blake wrote:
>> On 09/22/2015 03:08 PM, John Snow wrote:
>>> Eric, Markus: I've CC'd you for some crazy what-if QAPI questions
>>> after my R-B on this patch.
>>>
>>
>>> The current design of this patch is such that the blockdev-backup
>>> and drive-backup QMP commands are extended for use in the
>>> transaction action. This means that as part of the arguments for
>>> the action, we can specify "transactional-cancel" as on/off,
>>> defaulting to off. This maintains backwards compatible behavior.
>>>
>>>
>>> As an example of a lone command (for simplicity...)
>>>
>>> { "execute": "transaction",
>>>   "arguments": {
>>>     "actions": [
>>>       {"type": "drive-backup",
>>>        "data": {"device": "drive0",
>>>                 "target": "/path/to/new_full_backup.img",
>>>                 "sync": "full", "format": "qcow2",
>>>                 "transactional-cancel": true
>>>                }
>>>       }
>>>     ]
>>>   }
>>> }
>>>
>>> This means that this command should fail if any of the other
>>> block jobs in the transaction (that have also set
>>> transactional-cancel(!)) fail.
>>
>> Just wondering - is there ever a case where someone would want to
>> create a transaction with multiple sub-groups?  That is, I want
>> actions A, B, C, D to all occur at the same point in time, but with
>> grouping [A,B] and [C, D] (so that if A fails B gets cancelled, but
>> C and D can still

You make my head hurt.

> Only two sub-groups:
>
> (A) actions that live and die together
> (B) actions that proceed no matter what.
>
> The only reason we even allow these two is because the default
> behavior has been B historically, so we need to allow that to continue on.
>
> I don't think we need to architect multiple subgroups of "live and die
> together" semantics.
>
>> continue)?  Worse, is there a series of actions to accomplish
>> something that cannot happen unless it is interleaved across
>> multiple such subgroups?
>>
>
> Not sure we'll need to address this.
>
>> Here's hoping that, for design simplicity, we only ever need one
>> subgroup per 'transaction' (auto-cancel semantics applies to all
>> commands in the group that opted in, with no way to further refine
>> into sub-groups of commands).
>>
>
> I think this is correct.

Puh!

>>>
>>> This is nice because it allows us to specify per-action which
>>> actions should live or die on the success of the transaction as a
>>> whole.
>>>
>>>> What I was wondering is if we wanted to pigeonhole ourselves
>>> like this by adding arguments per-action and instead opt for a
>>> more generic, transaction-wide setting.
>>>
>>> In my mind, the transactional cancel has nothing to do with each
>>> /specific/ action, but has more to do with a property of
>>> transactions and actions in general.
>>>
>>>
>>> So I was thinking of two things:
>>>
>>> (1) Transactional cancel, while false by default, could be
>>> toggled to true by default for an entire transaction all-at-once
>>>
>>> { "execute": "transaction",
>>>   "arguments": {
>>>     "actions": [ ... ],
>>>     "transactional-cancel": true
>>>   }
>>> }
>>>
>>> This would toggle the default state for all actions to "fail if
>>> anything else in the transaction does."
>>
>> Or worded in another way - what is the use case for having a batch
>> of actions where some commands are grouped-cancel, but the
>> remaining actions are independent?  Is there ever a case where you
>> would supply "transactional-cancel":true to one action, but not all
>> of them?  If not, then this idea of hoisting the bool to the
>> transaction arguments level makes more sense (it's an all-or-none
>> switch, not a per-action switch).
>>
>> Qapi-wise, here's how you would do this:
>>
>> { 'command': 'transaction',
>>   'data': { 'actions': [ 'TransactionAction' ],
>>             '*transactional-cancel': 'bool' } }
>>
>
> Right. If there's no need for us to specify per-action settings at
> all, this becomes the obvious "correct" solution.
>
> If we do want to be able to specify this sub-grouping per-action (for
> whatever reason), this might still be nice as a convenience feature.

A common flag is semantically simpler than a per-action flag.  As
always, the more complex solution needs to be justified with an actual
use case.

A common @transactional-cancel defaulting to false suffices for backward
compatibility.

We think users will generally want to set it to true for all actions.
Again, a common flag suffices.

Unless someone comes up with actual use case for a per-action flag,
let's stick to the simpler common flag.

>>> Of course, as of right now, not all actions actually support this
>>>feature, but they might need to in the future -- and as more and more
>>>actions use this feature, it might be nice to have the global
>>>setting.

Uh-oh, you mean the flag is currently per-action because only some kinds
of actions actually implement it being set?

>>> If "transactional-cancel" was specified and is true (i.e. not
>>> the default) and actions are provided that do not support the
>>> argument explicitly, the transaction command itself should fail
>>> up-front.
>>
>> This actually makes sense to me, and might be simpler to
>> implement.

Yes, that's the stupidest solution that could possibly work, and
therefore the one that should be tried first.

> I'm not sure how to implement this, per-se, but in my mind it's
> something like:
>
> - A property of the transaction gets set (transactional-cancel) in
> qmp_transaction.
> - Each action has to prove that it has retrieved this value
> 	- drive-backup of course actually uses it,
> 	- other commands might fetch it to intentionally ignore it
> 	  (i.e. if cancel y/n would have no effect)
> - If an action does not fetch this property, the transactional setup
> flags an error and aborts the transaction with an error
> 	- e.g. "Attempted to use property unsupported by action ..."
>
> With perhaps an allowance that, as long as the property is set to
> default, actions aren't required to check it.

Do we really need code to detect commands that don't know about the
flag?  As long as the number of transaction-capable commands is small,
> why not simply make them all aware of the flag?  Make the ones that
don't implement it fail, which nicely fails the whole transaction.

[...]
John Snow Sept. 23, 2015, 4:14 p.m. UTC | #6
On 09/23/2015 07:09 AM, Markus Armbruster wrote:
> John, your MUA turned the QMP examples to mush.  You may want to teach
> it manners.
> 

Ugh, sorry. I apparently can't trust

> John Snow <jsnow@redhat.com> writes:
> 
>> On 09/22/2015 06:34 PM, Eric Blake wrote:
>>> On 09/22/2015 03:08 PM, John Snow wrote:
>>>> Eric, Markus: I've CC'd you for some crazy what-if QAPI questions
>>>> after my R-B on this patch.
>>>>
>>>
>>>> The current design of this patch is such that the blockdev-backup
>>>> and drive-backup QMP commands are extended for use in the
>>>> transaction action. This means that as part of the arguments for
>>>> the action, we can specify "transactional-cancel" as on/off,
>>>> defaulting to off. This maintains backwards compatible behavior.
>>>>
>>>>
>>>> As an example of a lone command (for simplicity...)
>>>>
>>>> { "execute": "transaction",
>>>>   "arguments": {
>>>>     "actions": [
>>>>       {"type": "drive-backup",
>>>>        "data": {"device": "drive0",
>>>>                 "target": "/path/to/new_full_backup.img",
>>>>                 "sync": "full", "format": "qcow2",
>>>>                 "transactional-cancel": true
>>>>                }
>>>>       }
>>>>     ]
>>>>   }
>>>> }
>>>>
>>>> This means that this command should fail if any of the other
>>>> block jobs in the transaction (that have also set
>>>> transactional-cancel(!)) fail.
>>>
>>> Just wondering - is there ever a case where someone would want to
>>> create a transaction with multiple sub-groups?  That is, I want
>>> actions A, B, C, D to all occur at the same point in time, but with
>>> grouping [A,B] and [C, D] (so that if A fails B gets cancelled, but
>>> C and D can still
> 
> You make my head hurt.
> 
>> Only two sub-groups:
>>
>> (A) actions that live and die together
>> (B) actions that proceed no matter what.
>>
>> The only reason we even allow these two is because the default
>> behavior has been B historically, so we need to allow that to continue on.
>>
>> I don't think we need to architect multiple subgroups of "live and die
>> together" semantics.
>>
>>> continue)?  Worse, is there a series of actions to accomplish
>>> something that cannot happen unless it is interleaved across
>>> multiple such subgroups?
>>>
>>
>> Not sure we'll need to address this.
>>
>>> Here's hoping that, for design simplicity, we only ever need one
>>> subgroup per 'transaction' (auto-cancel semantics applies to all
>>> commands in the group that opted in, with no way to further refine
>>> into sub-groups of commands).
>>>
>>
>> I think this is correct.
> 
> Puh!
> 

"I think this is correct*"

"*If we do want to allow per-action specificity of this option."

>>>>
>>>> This is nice because it allows us to specify per-action which
>>>> actions should live or die on the success of the transaction as a
>>>> whole.
>>>>
>>>> What I was wondering is if we wanted to pidgeon-hole ourselves
>>>> like this by adding arguments per-action and instead opt for a
>>>> more generic, transaction-wide setting.
>>>>
>>>> In my mind, the transactional cancel has nothing to do with each
>>>> /specific/ action, but has more to do with a property of
>>>> transactions and actions in general.
>>>>
>>>>
>>>> So I was thinking of two things:
>>>>
>>>> (1) Transactional cancel, while false by default, could be
>>>> toggled to true by default for an entire transaction all-at-once
>>>>
>>>> { "execute": "transaction",
>>>>   "arguments": {
>>>>     "actions": [ ... ],
>>>>     "transactional-cancel": true
>>>>   }
>>>> }
>>>>
>>>> This would toggle the default state for all actions to "fail if
>>>> anything else in the transaction does."
>>>
>>> Or worded in another way - what is the use case for having a batch
>>> of actions where some commands are grouped-cancel, but the
>>> remaining actions are independent?  Is there ever a case where you
>>> would supply "transactional-cancel":true to one action, but not all
>>> of them?  If not, then this idea of hoisting the bool to the
>>> transaction arguments level makes more sense (it's an all-or-none
>>> switch, not a per-action switch).
>>>
>>> Qapi-wise, here's how you would do this:
>>>
>>> { 'command': 'transaction',
>>>   'data': { 'actions': [ 'TransactionAction' ],
>>>             '*transactional-cancel': 'bool' } }
>>>
>>
>> Right. If there's no need for us to specify per-action settings at
>> all, this becomes the obvious "correct" solution.
>>
>> If we do want to be able to specify this sub-grouping per-action (for
>> whatever reason), this might still be nice as a convenience feature.
> 
> A common flag is semantically simpler than a per-action flag.  As
> always, the more complex solution needs to be justified with an actual
> use case.
> 
> A common @transactional-cancel defaulting to false suffices for backward
> compatibility.
> 
> We think users will generally want to set it to true for all actions.
> Again, a common flag suffices.
> 
> Unless someone comes up with actual use case for a per-action flag,
> let's stick to the simpler common flag.
> 
>>>> Of course, as of right now, not all actions actually support this
>>>> feature, but they might need to in the future -- and as more and more
>>>> actions use this feature, it might be nice to have the global
>>>> setting.
> 
> Uh-oh, you mean the flag is currently per-action because only some kinds
> of actions actually implement it being set?
> 

More like: This is the first time we've ever needed it, and we've only
been considering how drive-backup will behave under this flag.

We are implementing this property for the first time for (essentially)
one command. Other commands don't support the behavior yet because we've
just added it.

I am intervening and asking how we want to express the property.

>>>> If "transactional-cancel" was specified and is true (i.e. not
>>>> the default) and actions are provided that do not support the
>>>> argument explicitly, the transaction command itself should fail
>>>> up-front.
>>>
>>> This actually makes sense to me, and might be simpler to
>>> implement.
> 
> Yes, that's the stupidest solution that could possibly work, and
> therefore the one that should be tried first.
> 
>> I'm not sure how to implement this, per se, but in my mind it's
>> something like:
>>
>> - A property of the transaction gets set (transactional-cancel) in
>> qmp_transaction.
>> - Each action has to prove that it has retrieved this value
>> 	- drive-backup of course actually uses it,
>> 	- other commands might fetch it to intentionally ignore it
>> 	  (i.e. if cancel y/n would have no effect)
>> - If an action does not fetch this property, the transactional setup
>> flags an error and aborts the transaction with an error
>> 	- e.g. "Attempted to use property unsupported by action ..."
>>
>> With perhaps an allowance that, as long as the property is set to
>> default, actions aren't required to check it.
> 
> Do we really need code to detect commands that don't know about the
> flag?  As long as the number of transaction-capable commands is small,
>>> why not simply make them all aware of the flag?  Make the ones that
> don't implement it fail, which nicely fails the whole transaction.
> 
> [...]
> 

Oh, I see: you're saying ...

each action fetches the property, and if it's true, fail (for 2.5, at
least) with e.g. a message saying "This property is not [yet?] supported
for this action"

That's simple. I always overcomplicate everything ...

--js
Markus Armbruster Sept. 24, 2015, 6:34 a.m. UTC | #7
John Snow <jsnow@redhat.com> writes:

[...]
> Oh, I see: you're saying ...
>
> each action fetches the property, and if it's true, fail (for 2.5, at
> least) with e.g. a message saying "This property is not [yet?] supported
> for this action"

Yes.

> That's simple. I always overcomplicate everything ...

Simple solutions become simple only once you've thought of them :)
John Snow Sept. 25, 2015, 4:05 p.m. UTC | #8
On 09/24/2015 02:34 AM, Markus Armbruster wrote:
> John Snow <jsnow@redhat.com> writes:
> 
> [...]
>> Oh, I see: you're saying ...
>>
>> each action fetches the property, and if it's true, fail (for 2.5, at
>> least) with e.g. a message saying "This property is not [yet?] supported
>> for this action"
> 
> Yes.
> 
>> That's simple. I always overcomplicate everything ...
> 
> Simple solutions become simple only once you've thought of them :)
> 

I authored a quick RFC to illustrate what this solution might look like.

--js
diff mbox

Patch

diff --git a/block/backup.c b/block/backup.c
index 609b199..5880116 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -227,11 +227,29 @@  static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
     }
 }
 
+static void backup_commit(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+    if (s->sync_bitmap) {
+        backup_cleanup_sync_bitmap(s, 0);
+    }
+}
+
+static void backup_abort(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+    if (s->sync_bitmap) {
+        backup_cleanup_sync_bitmap(s, -1);
+    }
+}
+
 static const BlockJobDriver backup_job_driver = {
     .instance_size  = sizeof(BackupBlockJob),
     .job_type       = BLOCK_JOB_TYPE_BACKUP,
     .set_speed      = backup_set_speed,
     .iostatus_reset = backup_iostatus_reset,
+    .commit         = backup_commit,
+    .abort          = backup_abort,
 };
 
 static BlockErrorAction backup_error_action(BackupBlockJob *job,
@@ -444,10 +462,6 @@  static void coroutine_fn backup_run(void *opaque)
     /* wait until pending backup_do_cow() calls have completed */
     qemu_co_rwlock_wrlock(&job->flush_rwlock);
     qemu_co_rwlock_unlock(&job->flush_rwlock);
-
-    if (job->sync_bitmap) {
-        backup_cleanup_sync_bitmap(job, ret);
-    }
     hbitmap_free(job->bitmap);
 
     bdrv_iostatus_disable(target);
@@ -464,7 +478,7 @@  void backup_start(BlockDriverState *bs, BlockDriverState *target,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockCompletionFunc *cb, void *opaque,
-                  Error **errp)
+                  BlockJobTxn *txn, Error **errp)
 {
     int64_t len;
 
@@ -546,6 +560,7 @@  void backup_start(BlockDriverState *bs, BlockDriverState *target,
                        sync_bitmap : NULL;
     job->common.len = len;
     job->common.co = qemu_coroutine_create(backup_run);
+    block_job_txn_add_job(txn, &job->common);
     qemu_coroutine_enter(job->common.co, job);
     return;
 
diff --git a/blockdev.c b/blockdev.c
index ed50cb4..45a9fe7 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1586,16 +1586,31 @@  typedef struct DriveBackupState {
     BlockJob *job;
 } DriveBackupState;
 
+static void do_drive_backup(const char *device, const char *target,
+                            bool has_format, const char *format,
+                            enum MirrorSyncMode sync,
+                            bool has_mode, enum NewImageMode mode,
+                            bool has_speed, int64_t speed,
+                            bool has_bitmap, const char *bitmap,
+                            bool has_on_source_error,
+                            BlockdevOnError on_source_error,
+                            bool has_on_target_error,
+                            BlockdevOnError on_target_error,
+                            BlockJobTxn *txn, Error **errp);
+
 static void drive_backup_prepare(BlkActionState *common, Error **errp)
 {
     DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common);
     BlockDriverState *bs;
     BlockBackend *blk;
+    DriveBackupTxn *backup_txn;
     DriveBackup *backup;
+    BlockJobTxn *txn = NULL;
     Error *local_err = NULL;
 
     assert(common->action->kind == TRANSACTION_ACTION_KIND_DRIVE_BACKUP);
-    backup = common->action->drive_backup;
+    backup_txn = common->action->drive_backup;
+    backup = backup_txn->base;
 
     blk = blk_by_name(backup->device);
     if (!blk) {
@@ -1609,15 +1624,20 @@  static void drive_backup_prepare(BlkActionState *common, Error **errp)
     state->aio_context = bdrv_get_aio_context(bs);
     aio_context_acquire(state->aio_context);
 
-    qmp_drive_backup(backup->device, backup->target,
-                     backup->has_format, backup->format,
-                     backup->sync,
-                     backup->has_mode, backup->mode,
-                     backup->has_speed, backup->speed,
-                     backup->has_bitmap, backup->bitmap,
-                     backup->has_on_source_error, backup->on_source_error,
-                     backup->has_on_target_error, backup->on_target_error,
-                     &local_err);
+    if (backup_txn->has_transactional_cancel &&
+        backup_txn->transactional_cancel) {
+        txn = common->block_job_txn;
+    }
+
+    do_drive_backup(backup->device, backup->target,
+                    backup->has_format, backup->format,
+                    backup->sync,
+                    backup->has_mode, backup->mode,
+                    backup->has_speed, backup->speed,
+                    backup->has_bitmap, backup->bitmap,
+                    backup->has_on_source_error, backup->on_source_error,
+                    backup->has_on_target_error, backup->on_target_error,
+                    txn, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         return;
@@ -1654,16 +1674,28 @@  typedef struct BlockdevBackupState {
     AioContext *aio_context;
 } BlockdevBackupState;
 
+static void do_blockdev_backup(const char *device, const char *target,
+                               enum MirrorSyncMode sync,
+                               bool has_speed, int64_t speed,
+                               bool has_on_source_error,
+                               BlockdevOnError on_source_error,
+                               bool has_on_target_error,
+                               BlockdevOnError on_target_error,
+                               BlockJobTxn *txn, Error **errp);
+
 static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
 {
     BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common);
+    BlockdevBackupTxn *backup_txn;
     BlockdevBackup *backup;
     BlockDriverState *bs, *target;
     BlockBackend *blk;
+    BlockJobTxn *txn = NULL;
     Error *local_err = NULL;
 
     assert(common->action->kind == TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP);
-    backup = common->action->blockdev_backup;
+    backup_txn = common->action->blockdev_backup;
+    backup = backup_txn->base;
 
     blk = blk_by_name(backup->device);
     if (!blk) {
@@ -1688,12 +1720,17 @@  static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
     }
     aio_context_acquire(state->aio_context);
 
-    qmp_blockdev_backup(backup->device, backup->target,
-                        backup->sync,
-                        backup->has_speed, backup->speed,
-                        backup->has_on_source_error, backup->on_source_error,
-                        backup->has_on_target_error, backup->on_target_error,
-                        &local_err);
+    if (backup_txn->has_transactional_cancel &&
+        backup_txn->transactional_cancel) {
+        txn = common->block_job_txn;
+    }
+
+    do_blockdev_backup(backup->device, backup->target,
+                       backup->sync,
+                       backup->has_speed, backup->speed,
+                       backup->has_on_source_error, backup->on_source_error,
+                       backup->has_on_target_error, backup->on_target_error,
+                       txn, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         return;
@@ -2573,15 +2610,17 @@  out:
     aio_context_release(aio_context);
 }
 
-void qmp_drive_backup(const char *device, const char *target,
-                      bool has_format, const char *format,
-                      enum MirrorSyncMode sync,
-                      bool has_mode, enum NewImageMode mode,
-                      bool has_speed, int64_t speed,
-                      bool has_bitmap, const char *bitmap,
-                      bool has_on_source_error, BlockdevOnError on_source_error,
-                      bool has_on_target_error, BlockdevOnError on_target_error,
-                      Error **errp)
+static void do_drive_backup(const char *device, const char *target,
+                            bool has_format, const char *format,
+                            enum MirrorSyncMode sync,
+                            bool has_mode, enum NewImageMode mode,
+                            bool has_speed, int64_t speed,
+                            bool has_bitmap, const char *bitmap,
+                            bool has_on_source_error,
+                            BlockdevOnError on_source_error,
+                            bool has_on_target_error,
+                            BlockdevOnError on_target_error,
+                            BlockJobTxn *txn, Error **errp)
 {
     BlockBackend *blk;
     BlockDriverState *bs;
@@ -2696,7 +2735,7 @@  void qmp_drive_backup(const char *device, const char *target,
 
     backup_start(bs, target_bs, speed, sync, bmap,
                  on_source_error, on_target_error,
-                 block_job_cb, bs, &local_err);
+                 block_job_cb, bs, txn, &local_err);
     if (local_err != NULL) {
         bdrv_unref(target_bs);
         error_propagate(errp, local_err);
@@ -2707,19 +2746,37 @@  out:
     aio_context_release(aio_context);
 }
 
+void qmp_drive_backup(const char *device, const char *target,
+                      bool has_format, const char *format,
+                      enum MirrorSyncMode sync,
+                      bool has_mode, enum NewImageMode mode,
+                      bool has_speed, int64_t speed,
+                      bool has_bitmap, const char *bitmap,
+                      bool has_on_source_error, BlockdevOnError on_source_error,
+                      bool has_on_target_error, BlockdevOnError on_target_error,
+                      Error **errp)
+{
+    return do_drive_backup(device, target, has_format, format, sync,
+                           has_mode, mode, has_speed, speed,
+                           has_bitmap, bitmap,
+                           has_on_source_error, on_source_error,
+                           has_on_target_error, on_target_error,
+                           NULL, errp);
+}
+
 BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp)
 {
     return bdrv_named_nodes_list(errp);
 }
 
-void qmp_blockdev_backup(const char *device, const char *target,
+void do_blockdev_backup(const char *device, const char *target,
                          enum MirrorSyncMode sync,
                          bool has_speed, int64_t speed,
                          bool has_on_source_error,
                          BlockdevOnError on_source_error,
                          bool has_on_target_error,
                          BlockdevOnError on_target_error,
-                         Error **errp)
+                         BlockJobTxn *txn, Error **errp)
 {
     BlockBackend *blk;
     BlockDriverState *bs;
@@ -2757,7 +2814,7 @@  void qmp_blockdev_backup(const char *device, const char *target,
     bdrv_ref(target_bs);
     bdrv_set_aio_context(target_bs, aio_context);
     backup_start(bs, target_bs, speed, sync, NULL, on_source_error,
-                 on_target_error, block_job_cb, bs, &local_err);
+                 on_target_error, block_job_cb, bs, txn, &local_err);
     if (local_err != NULL) {
         bdrv_unref(target_bs);
         error_propagate(errp, local_err);
@@ -2766,6 +2823,21 @@  out:
     aio_context_release(aio_context);
 }
 
+void qmp_blockdev_backup(const char *device, const char *target,
+                         enum MirrorSyncMode sync,
+                         bool has_speed, int64_t speed,
+                         bool has_on_source_error,
+                         BlockdevOnError on_source_error,
+                         bool has_on_target_error,
+                         BlockdevOnError on_target_error,
+                         Error **errp)
+{
+    do_blockdev_backup(device, target, sync, has_speed, speed,
+                       has_on_source_error, on_source_error,
+                       has_on_target_error, on_target_error,
+                       NULL, errp);
+}
+
 void qmp_drive_mirror(const char *device, const char *target,
                       bool has_format, const char *format,
                       bool has_node_name, const char *node_name,
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 281d790..6fd07a2 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -643,6 +643,7 @@  void mirror_start(BlockDriverState *bs, BlockDriverState *target,
  * @on_target_error: The action to take upon error writing to the target.
  * @cb: Completion function for the job.
  * @opaque: Opaque pointer value passed to @cb.
+ * @txn: Transaction that this job is part of (may be NULL).
  *
  * Start a backup operation on @bs.  Clusters in @bs are written to @target
  * until the job is cancelled or manually completed.
@@ -653,7 +654,7 @@  void backup_start(BlockDriverState *bs, BlockDriverState *target,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockCompletionFunc *cb, void *opaque,
-                  Error **errp);
+                  BlockJobTxn *txn, Error **errp);
 
 void blk_dev_change_media_cb(BlockBackend *blk, bool load);
 bool blk_dev_has_removable_media(BlockBackend *blk);
diff --git a/qapi-schema.json b/qapi-schema.json
index 583e036..6641be3 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1505,8 +1505,8 @@ 
 { 'union': 'TransactionAction',
   'data': {
        'blockdev-snapshot-sync': 'BlockdevSnapshot',
-       'drive-backup': 'DriveBackup',
-       'blockdev-backup': 'BlockdevBackup',
+       'drive-backup': 'DriveBackupTxn',
+       'blockdev-backup': 'BlockdevBackupTxn',
        'abort': 'Abort',
        'blockdev-snapshot-internal-sync': 'BlockdevSnapshotInternal',
        'block-dirty-bitmap-add': 'BlockDirtyBitmapAdd',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index bb2189e..24db5c2 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -735,7 +735,6 @@ 
 # @on-target-error: #optional the action to take on an error on the target,
 #                   default 'report' (no limitations, since this applies to
 #                   a different block device than @device).
-#
 # Note that @on-source-error and @on-target-error only affect background I/O.
 # If an error occurs during a guest write request, the device's rerror/werror
 # actions will be used.
@@ -750,6 +749,19 @@ 
             '*on-target-error': 'BlockdevOnError' } }
 
 ##
+# @DriveBackupTxn
+#
+# @transactional-cancel: #optional whether failure or cancellation of other
+#                        block jobs with @transactional-cancel true in the same
+#                        transaction causes the whole group to cancel. Default
+#                        is false.
+#
+# Since: 2.5
+##
+{ 'struct': 'DriveBackupTxn', 'base': 'DriveBackup',
+  'data': {'*transactional-cancel': 'bool' } }
+
+##
 # @BlockdevBackup
 #
 # @device: the name of the device which should be copied.
@@ -785,6 +797,19 @@ 
             '*on-target-error': 'BlockdevOnError' } }
 
 ##
+# @BlockdevBackupTxn
+#
+# @transactional-cancel: #optional whether failure or cancellation of other
+#                        block jobs with @transactional-cancel true in the same
+#                        transaction causes the whole group to cancel. Default
+#                        is false.
+#
+# Since: 2.5
+##
+{ 'struct': 'BlockdevBackupTxn', 'base': 'BlockdevBackup',
+  'data': {'*transactional-cancel': 'bool' } }
+
+##
 # @blockdev-snapshot-sync
 #
 # Generates a synchronous snapshot of a block device.