diff mbox series

[1/9] block: add persistent reservation in/out api

Message ID 20240508093629.441057-2-luchangqi.123@bytedance.com
State New
Headers show
Series Support persistent reservation operations | expand

Commit Message

Changqi Lu May 8, 2024, 9:36 a.m. UTC
Add persistent reservation in/out operations
at the block level. The following operations
are included:

- read_keys:        retrieves the list of registered keys.
- read_reservation: retrieves the current reservation status.
- register:         registers a new reservation key.
- reserve:          initiates a reservation for a specific key.
- release:          releases a reservation for a specific key.
- clear:            clears all existing reservations.
- preempt:          preempts a reservation held by another key.

Signed-off-by: Changqi Lu <luchangqi.123@bytedance.com>
Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
---
 block/block-backend.c             | 386 ++++++++++++++++++++++++++++++
 block/io.c                        | 161 +++++++++++++
 include/block/block-common.h      |   9 +
 include/block/block-io.h          |  19 ++
 include/block/block_int-common.h  |  31 +++
 include/sysemu/block-backend-io.h |  22 ++
 6 files changed, 628 insertions(+)

Comments

Stefan Hajnoczi May 9, 2024, 6:22 p.m. UTC | #1
On Wed, May 08, 2024 at 05:36:21PM +0800, Changqi Lu wrote:
> Add persistent reservation in/out operations
> at the block level. The following operations
> are included:
> 
> - read_keys:        retrieves the list of registered keys.
> - read_reservation: retrieves the current reservation status.
> - register:         registers a new reservation key.
> - reserve:          initiates a reservation for a specific key.
> - release:          releases a reservation for a specific key.
> - clear:            clears all existing reservations.
> - preempt:          preempts a reservation held by another key.
> 
> Signed-off-by: Changqi Lu <luchangqi.123@bytedance.com>
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
>  block/block-backend.c             | 386 ++++++++++++++++++++++++++++++
>  block/io.c                        | 161 +++++++++++++
>  include/block/block-common.h      |   9 +
>  include/block/block-io.h          |  19 ++
>  include/block/block_int-common.h  |  31 +++
>  include/sysemu/block-backend-io.h |  22 ++
>  6 files changed, 628 insertions(+)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index db6f9b92a3..ec67937d28 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1770,6 +1770,392 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
>      return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
>  }
>  
> +typedef struct BlkPrInCo {
> +    BlockBackend *blk;
> +    uint32_t *generation;
> +    uint32_t num_keys;
> +    BlockPrType *type;
> +    uint64_t *keys;
> +    int ret;
> +} BlkPrInCo;
> +
> +typedef struct BlkPrInCB {
> +    BlockAIOCB common;
> +    BlkPrInCo prco;
> +    bool has_returned;
> +} BlkPrInCB;
> +
> +static const AIOCBInfo blk_pr_in_aiocb_info = {
> +    .aiocb_size         = sizeof(BlkPrInCB),
> +};
> +
> +static void blk_pr_in_complete(BlkPrInCB *acb)
> +{
> +    if (acb->has_returned) {
> +        acb->common.cb(acb->common.opaque, acb->prco.ret);
> +        blk_dec_in_flight(acb->prco.blk);

Please add a comment identifying which blk_inc_in_flight() call this dec
is paired with. That makes it easier for people trying to understand
in-flight reference counting.

> +        qemu_aio_unref(acb);
> +    }
> +}
> +
> +static void blk_pr_in_complete_bh(void *opaque)
> +{
> +    BlkPrInCB *acb = opaque;
> +    assert(acb->has_returned);
> +    blk_pr_in_complete(acb);
> +}
> +
> +static BlockAIOCB *blk_aio_pr_in(BlockBackend *blk, uint32_t *generation,
> +                                 uint32_t num_keys, BlockPrType *type,
> +                                 uint64_t *keys, CoroutineEntry co_entry,
> +                                 BlockCompletionFunc *cb, void *opaque)
> +{
> +    BlkPrInCB *acb;
> +    Coroutine *co;
> +
> +    blk_inc_in_flight(blk);

Please add a comment identifying which blk_dec_in_flight() call this inc
is paired with.

> +    acb = blk_aio_get(&blk_pr_in_aiocb_info, blk, cb, opaque);
> +    acb->prco = (BlkPrInCo) {
> +        .blk        = blk,
> +        .generation = generation,
> +        .num_keys   = num_keys,
> +        .type       = type,
> +        .ret        = NOT_DONE,
> +        .keys       = keys,
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(co_entry, acb);
> +    aio_co_enter(qemu_get_current_aio_context(), co);
> +
> +    acb->has_returned = true;
> +    if (acb->prco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
> +                                         blk_pr_in_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
> +
> +static int coroutine_fn
> +blk_aio_pr_do_read_keys(BlockBackend *blk, uint32_t *generation,
> +                        uint32_t num_keys, uint64_t *keys)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_read_keys(blk_bs(blk), generation, num_keys, keys);
> +}
> +
> +static void coroutine_fn blk_aio_pr_read_keys_entry(void *opaque)
> +{
> +    BlkPrInCB *acb = opaque;
> +    BlkPrInCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_read_keys(prco->blk, prco->generation,
> +                                        prco->num_keys, prco->keys);
> +    blk_pr_in_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation,
> +                                 uint32_t num_keys, uint64_t *keys,
> +                                 BlockCompletionFunc *cb, void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_in(blk, generation, num_keys, NULL, keys,
> +                         blk_aio_pr_read_keys_entry, cb, opaque);
> +}
> +
> +
> +static int coroutine_fn
> +blk_aio_pr_do_read_reservation(BlockBackend *blk, uint32_t *generation,
> +                               uint64_t *key, BlockPrType *type)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_read_reservation(blk_bs(blk), generation, key, type);
> +}
> +
> +static void coroutine_fn blk_aio_pr_read_reservation_entry(void *opaque)
> +{
> +    BlkPrInCB *acb = opaque;
> +    BlkPrInCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_read_reservation(prco->blk, prco->generation,
> +                                               prco->keys, prco->type);
> +    blk_pr_in_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation,
> +                                        uint64_t *key, BlockPrType *type,
> +                                        BlockCompletionFunc *cb, void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_in(blk, generation, 0, type, key,
> +                         blk_aio_pr_read_reservation_entry, cb, opaque);
> +}
> +
> +
> +typedef struct BlkPrOutCo {
> +    BlockBackend *blk;
> +    uint64_t old_key;
> +    uint64_t new_key;
> +    BlockPrType type;
> +    bool ignore_key;
> +    bool abort;
> +    int ret;
> +} BlkPrOutCo;
> +
> +typedef struct BlkPrOutCB {
> +    BlockAIOCB common;
> +    BlkPrOutCo prco;
> +    bool has_returned;
> +} BlkPrOutCB;
> +
> +static const AIOCBInfo blk_pr_out_aiocb_info = {
> +    .aiocb_size         = sizeof(BlkPrOutCB),
> +};
> +
> +static void blk_pr_out_complete(BlkPrOutCB *acb)
> +{
> +    if (acb->has_returned) {
> +        acb->common.cb(acb->common.opaque, acb->prco.ret);
> +        blk_dec_in_flight(acb->prco.blk);

Same here.

> +        qemu_aio_unref(acb);
> +    }
> +}
> +
> +static void blk_pr_out_complete_bh(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    assert(acb->has_returned);
> +    blk_pr_out_complete(acb);
> +}
> +
> +static BlockAIOCB *blk_aio_pr_out(BlockBackend *blk, uint64_t old_key,
> +                                  uint64_t new_key, BlockPrType type,
> +                                  bool ignore_key, bool abort,
> +                                  CoroutineEntry co_entry,
> +                                  BlockCompletionFunc *cb, void *opaque)
> +{
> +    BlkPrOutCB *acb;
> +    Coroutine *co;
> +
> +    blk_inc_in_flight(blk);

Same here.

> +    acb = blk_aio_get(&blk_pr_out_aiocb_info, blk, cb, opaque);
> +    acb->prco = (BlkPrOutCo) {
> +        .blk        = blk,
> +        .old_key    = old_key,
> +        .new_key    = new_key,
> +        .type       = type,
> +        .ignore_key = ignore_key,
> +        .abort      = abort,
> +        .ret        = NOT_DONE,
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(co_entry, acb);
> +    aio_co_enter(qemu_get_current_aio_context(), co);
> +
> +    acb->has_returned = true;
> +    if (acb->prco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
> +                                         blk_pr_out_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
> +static int coroutine_fn
> +blk_aio_pr_do_register(BlockBackend *blk, uint64_t old_key,
> +                       uint64_t new_key, BlockPrType type,
> +                       bool ignore_key)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_register(blk_bs(blk), old_key, new_key, type, ignore_key);
> +}
> +
> +static void coroutine_fn blk_aio_pr_register_entry(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    BlkPrOutCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_register(prco->blk, prco->old_key, prco->new_key,
> +                                       prco->type, prco->ignore_key);
> +    blk_pr_out_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key,
> +                                uint64_t new_key, BlockPrType type,
> +                                bool ignore_key, BlockCompletionFunc *cb,
> +                                void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_out(blk, old_key, new_key, type, ignore_key, false,
> +                          blk_aio_pr_register_entry, cb, opaque);
> +}
> +
> +static int coroutine_fn
> +blk_aio_pr_do_reserve(BlockBackend *blk, uint64_t key, BlockPrType type)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_reserve(blk_bs(blk), key, type);
> +}
> +
> +static void coroutine_fn blk_aio_pr_reserve_entry(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    BlkPrOutCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_reserve(prco->blk, prco->old_key, prco->type);
> +    blk_pr_out_complete(acb);
> +}
> +
> +
> +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk,  uint64_t key,
> +                               BlockPrType type, BlockCompletionFunc *cb,
> +                               void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_out(blk, key, 0, type, false, false,
> +                          blk_aio_pr_reserve_entry, cb, opaque);
> +}
> +
> +static int coroutine_fn
> +blk_aio_pr_do_release(BlockBackend *blk, uint64_t key, BlockPrType type)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_release(blk_bs(blk), key, type);
> +}
> +
> +static void coroutine_fn blk_aio_pr_release_entry(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    BlkPrOutCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_release(prco->blk, prco->old_key, prco->type);
> +    blk_pr_out_complete(acb);
> +}
> +
> +
> +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key,
> +                               BlockPrType type, BlockCompletionFunc *cb,
> +                               void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_out(blk, key, 0, type, false, false,
> +                          blk_aio_pr_release_entry, cb, opaque);
> +}
> +
> +static int coroutine_fn
> +blk_aio_pr_do_clear(BlockBackend *blk, uint64_t key)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_clear(blk_bs(blk), key);
> +}
> +
> +static void coroutine_fn blk_aio_pr_clear_entry(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    BlkPrOutCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_clear(prco->blk, prco->old_key);
> +    blk_pr_out_complete(acb);
> +}
> +
> +
> +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key,
> +                             BlockCompletionFunc *cb, void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_out(blk, key, 0, 0, false, false,
> +                          blk_aio_pr_clear_entry, cb, opaque);
> +}
> +
> +static int coroutine_fn
> +blk_aio_pr_do_preempt(BlockBackend *blk, uint64_t old_key,
> +                      uint64_t new_key, BlockPrType type, bool abort)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_preempt(blk_bs(blk), old_key, new_key, type, abort);
> +}
> +
> +static void coroutine_fn blk_aio_pr_preempt_entry(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    BlkPrOutCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_preempt(prco->blk, prco->old_key,
> +                                      prco->new_key, prco->type,
> +                                      prco->abort);
> +    blk_pr_out_complete(acb);
> +}
> +
> +
> +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t old_key,
> +                               uint64_t new_key, BlockPrType type,
> +                               bool abort, BlockCompletionFunc *cb,
> +                               void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_out(blk, old_key, new_key, type, false, abort,
> +                          blk_aio_pr_preempt_entry, cb, opaque);
> +}
> +
>  /* To be called between exactly one pair of blk_inc/dec_in_flight() */
>  static int coroutine_fn
>  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes)
> diff --git a/block/io.c b/block/io.c
> index 7217cf811b..98a358c6b2 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3220,6 +3220,167 @@ out:
>      return co.ret;
>  }
>  
> +int coroutine_fn bdrv_co_pr_read_keys(BlockDriverState *bs,
> +                 uint32_t *generation, uint32_t num_keys,
> +                 uint64_t *keys)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_read_keys) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_read_keys(bs, generation, num_keys, keys);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_read_reservation(BlockDriverState *bs,
> +                 uint32_t *generation, uint64_t *key, BlockPrType *type)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_read_reservation) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_read_reservation(bs, generation, key, type);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key,
> +                 uint64_t new_key, BlockPrType type, bool ignore_key)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_register) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_register(bs, old_key, new_key, type, ignore_key);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key,
> +                                    BlockPrType type)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_reserve) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_reserve(bs, key, type);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_release(BlockDriverState *bs, uint64_t key,
> +                                    BlockPrType type)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_release) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_release(bs, key, type);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_clear) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_clear(bs, key);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t old_key,
> +                       uint64_t new_key, BlockPrType type, bool abort)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_preempt) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_preempt(bs, old_key, new_key, type, abort);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
>  int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>                          unsigned int *nr_zones,
>                          BlockZoneDescriptor *zones)
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index a846023a09..c078d69f79 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -524,6 +524,15 @@ typedef enum {
>      BDRV_FIX_ERRORS   = 2,
>  } BdrvCheckMode;
>  
> +typedef enum {
> +    BLK_PR_WRITE_EXCLUSIVE              = 0x1,
> +    BLK_PR_EXCLUSIVE_ACCESS             = 0x2,
> +    BLK_PR_WRITE_EXCLUSIVE_REGS_ONLY    = 0x3,
> +    BLK_PR_EXCLUSIVE_ACCESS_REGS_ONLY   = 0x4,
> +    BLK_PR_WRITE_EXCLUSIVE_ALL_REGS     = 0x5,
> +    BLK_PR_EXCLUSIVE_ACCESS_ALL_REGS    = 0x6,

Documentation needed. If you don't want to document the full meaning
here, please include a comment referring to a specific section in a
SCSI/NVMe standard that explains the semantics.

> +} BlockPrType;
> +
>  typedef struct BlockSizes {
>      uint32_t phys;
>      uint32_t log;
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index b49e0537dd..f2827cd74e 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -106,6 +106,25 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb);
>  int coroutine_fn GRAPH_RDLOCK
>  bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
>  
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_read_keys(BlockDriverState *bs, uint32_t *generation,
> +                     uint32_t num_keys, uint64_t *keys);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_read_reservation(BlockDriverState *bs, uint32_t *generation,
> +                            uint64_t *key, BlockPrType *type);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key,
> +                    uint64_t new_key, BlockPrType type, bool ignore_key);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, BlockPrType type);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, BlockPrType type);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t old_key, uint64_t new_key,
> +                   BlockPrType type, bool abort);
> +
>  /* Ensure contents are flushed to disk.  */
>  int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs);
>  
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 761276127e..e5526f1d04 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -766,6 +766,37 @@ struct BlockDriver {
>      int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_ioctl)(
>          BlockDriverState *bs, unsigned long int req, void *buf);
>  
> +    /**
> +     * bdrv_co_pr_read_keys is called to read the registered keys,
> +     * @num_keys is the maximum number of keys that can be transmitted.
> +     * On success, store generation in @generation and store keys @keys
> +     * and return the number of @keys.
> +     * On failure return -errno.
> +     */
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_keys)(
> +        BlockDriverState *bs, uint32_t *generation,
> +        uint32_t num_keys, uint64_t *keys);
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_reservation)(
> +        BlockDriverState *bs, uint32_t *generation,
> +        uint64_t *key, BlockPrType *type);

Plase add doc comments for each new function. It's okay to refer to
specific sections in the SCSI or NVMe standards if explaining the
concepts is too verbose. Someone implementing persistent reservations
support in another block driver must be able to figure out what the
exact model for persistent reservations is.

> +
> +    /**
> +     * Persist Through Power Loss(PTPL) is considered as required in QEMU
> +     * block layer, the block driver need always enable PTPL.
> +     */

What is the reasoning behind this? Will applications that rely on PTPL=0
work?

> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_register)(
> +        BlockDriverState *bs, uint64_t old_key,
> +        uint64_t new_key, BlockPrType type, bool ignore_key);
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_reserve)(
> +        BlockDriverState *bs, uint64_t key, BlockPrType type);
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_release)(
> +        BlockDriverState *bs, uint64_t key, BlockPrType type);
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_clear)(
> +        BlockDriverState *bs, uint64_t key);
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_preempt)(
> +        BlockDriverState *bs, uint64_t old_key,
> +        uint64_t new_key, BlockPrType type, bool abort);

Does a block driver need to implement all or none of these callbacks? I
guess they are not optional on a per-callback basis.

> +
>      /*
>       * Returns 0 for completed check, -errno for internal errors.
>       * The check results are stored in result.
> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> index d174275a5c..f87c61c06e 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -62,6 +62,28 @@ void blk_aio_cancel_async(BlockAIOCB *acb);
>  BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
>                            BlockCompletionFunc *cb, void *opaque);
>  
> +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation,
> +                                 uint32_t num_keys, uint64_t *keys,
> +                                 BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation,
> +                                        uint64_t *key, BlockPrType *type,
> +                                        BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key,
> +                                uint64_t new_key, BlockPrType type,
> +                                bool ignore_key, BlockCompletionFunc *cb,
> +                                void *opaque);
> +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk,  uint64_t key,
> +                               BlockPrType type, BlockCompletionFunc *cb,
> +                               void *opaque);
> +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key,
> +                               BlockPrType type, BlockCompletionFunc *cb,
> +                               void *opaque);
> +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key,
> +                             BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t old_key,
> +                               uint64_t new_key, BlockPrType type, bool abort,
> +                               BlockCompletionFunc *cb, void *opaque);
> +
>  void blk_inc_in_flight(BlockBackend *blk);
>  void blk_dec_in_flight(BlockBackend *blk);
>  
> -- 
> 2.20.1
>
zhenwei pi May 10, 2024, 2:37 a.m. UTC | #2
On 5/10/24 02:22, Stefan Hajnoczi wrote:
> On Wed, May 08, 2024 at 05:36:21PM +0800, Changqi Lu wrote:

[SNIP]

>> +
>> +    /**
>> +     * Persist Through Power Loss(PTPL) is considered as required in QEMU
>> +     * block layer, the block driver need always enable PTPL.
>> +     */
> 
> What is the reasoning behind this? Will applications that rely on PTPL=0
> work?
> 

Hi Stefan,

PTPL needs to be supported at QEMU block layer in theory, include 
reporting PTPL capability and PTPL flag in PR OUT command. However, in 
the real production environment, both SCSI driver and NVMe driver, even 
linux block layer always enable PTPL nnconditionally on a linux platform.

Ref the latest code:
1, SCSI:
https://github.com/torvalds/linux/blob/master/drivers/scsi/sd.c#L1978
static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 
new_key,
		u32 flags)
{
	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;
	return sd_pr_out_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00,
			old_key, new_key, 0,
			(1 << 0) /* APTPL */);
}

2, NVMe:
https://github.com/torvalds/linux/blob/master/drivers/nvme/host/pr.c#L127
static int nvme_pr_register(struct block_device *bdev, u64 old,
		u64 new, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = old ? 2 : 0;
	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
}

3, linux block layers also hides PTPL flag:
https://github.com/torvalds/linux/blob/master/block/ioctl.c#L283
static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode,
		struct pr_registration __user *arg)
{
	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
	struct pr_registration reg;

	if (!blkdev_pr_allowed(bdev, mode))
		return -EPERM;
	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;
	return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags);
}


So we(Changqi and me) wanted to keep PR a bit simple in QEMU block layer:
- consider PTPL is required in QEMU block, then we don't need an extra flag
- the block backend driver always request PR OUT with PTPL flag

Then: Will applications that rely on PTPL=0 work?
Yes, a guest PR out without PTPL will work, but the backend uses PTPL=1 
instead.

Will this request succeed?
If the backend driver' supports PTPL capability, it will succeed. 
Otherwise it will fail.
diff mbox series

Patch

diff --git a/block/block-backend.c b/block/block-backend.c
index db6f9b92a3..ec67937d28 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1770,6 +1770,392 @@  BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
     return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
 }
 
+typedef struct BlkPrInCo {
+    BlockBackend *blk;
+    uint32_t *generation;
+    uint32_t num_keys;
+    BlockPrType *type;
+    uint64_t *keys;
+    int ret;
+} BlkPrInCo;
+
+typedef struct BlkPrInCB {
+    BlockAIOCB common;
+    BlkPrInCo prco;
+    bool has_returned;
+} BlkPrInCB;
+
+static const AIOCBInfo blk_pr_in_aiocb_info = {
+    .aiocb_size         = sizeof(BlkPrInCB),
+};
+
+static void blk_pr_in_complete(BlkPrInCB *acb)
+{
+    if (acb->has_returned) {
+        acb->common.cb(acb->common.opaque, acb->prco.ret);
+        blk_dec_in_flight(acb->prco.blk);
+        qemu_aio_unref(acb);
+    }
+}
+
+static void blk_pr_in_complete_bh(void *opaque)
+{
+    BlkPrInCB *acb = opaque;
+    assert(acb->has_returned);
+    blk_pr_in_complete(acb);
+}
+
+static BlockAIOCB *blk_aio_pr_in(BlockBackend *blk, uint32_t *generation,
+                                 uint32_t num_keys, BlockPrType *type,
+                                 uint64_t *keys, CoroutineEntry co_entry,
+                                 BlockCompletionFunc *cb, void *opaque)
+{
+    BlkPrInCB *acb;
+    Coroutine *co;
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_pr_in_aiocb_info, blk, cb, opaque);
+    acb->prco = (BlkPrInCo) {
+        .blk        = blk,
+        .generation = generation,
+        .num_keys   = num_keys,
+        .type       = type,
+        .ret        = NOT_DONE,
+        .keys       = keys,
+    };
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(co_entry, acb);
+    aio_co_enter(qemu_get_current_aio_context(), co);
+
+    acb->has_returned = true;
+    if (acb->prco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
+                                         blk_pr_in_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+
+static int coroutine_fn
+blk_aio_pr_do_read_keys(BlockBackend *blk, uint32_t *generation,
+                        uint32_t num_keys, uint64_t *keys)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (!blk_co_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
+    return bdrv_co_pr_read_keys(blk_bs(blk), generation, num_keys, keys);
+}
+
+static void coroutine_fn blk_aio_pr_read_keys_entry(void *opaque)
+{
+    BlkPrInCB *acb = opaque;
+    BlkPrInCo *prco = &acb->prco;
+
+    prco->ret = blk_aio_pr_do_read_keys(prco->blk, prco->generation,
+                                        prco->num_keys, prco->keys);
+    blk_pr_in_complete(acb);
+}
+
+BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation,
+                                 uint32_t num_keys, uint64_t *keys,
+                                 BlockCompletionFunc *cb, void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_in(blk, generation, num_keys, NULL, keys,
+                         blk_aio_pr_read_keys_entry, cb, opaque);
+}
+
+
+static int coroutine_fn
+blk_aio_pr_do_read_reservation(BlockBackend *blk, uint32_t *generation,
+                               uint64_t *key, BlockPrType *type)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (!blk_co_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
+    return bdrv_co_pr_read_reservation(blk_bs(blk), generation, key, type);
+}
+
+static void coroutine_fn blk_aio_pr_read_reservation_entry(void *opaque)
+{
+    BlkPrInCB *acb = opaque;
+    BlkPrInCo *prco = &acb->prco;
+
+    prco->ret = blk_aio_pr_do_read_reservation(prco->blk, prco->generation,
+                                               prco->keys, prco->type);
+    blk_pr_in_complete(acb);
+}
+
+BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation,
+                                        uint64_t *key, BlockPrType *type,
+                                        BlockCompletionFunc *cb, void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_in(blk, generation, 0, type, key,
+                         blk_aio_pr_read_reservation_entry, cb, opaque);
+}
+
+
+typedef struct BlkPrOutCo {
+    BlockBackend *blk;
+    uint64_t old_key;
+    uint64_t new_key;
+    BlockPrType type;
+    bool ignore_key;
+    bool abort;
+    int ret;
+} BlkPrOutCo;
+
+typedef struct BlkPrOutCB {
+    BlockAIOCB common;
+    BlkPrOutCo prco;
+    bool has_returned;
+} BlkPrOutCB;
+
+static const AIOCBInfo blk_pr_out_aiocb_info = {
+    .aiocb_size         = sizeof(BlkPrOutCB),
+};
+
+static void blk_pr_out_complete(BlkPrOutCB *acb)
+{
+    if (acb->has_returned) {
+        acb->common.cb(acb->common.opaque, acb->prco.ret);
+        blk_dec_in_flight(acb->prco.blk);
+        qemu_aio_unref(acb);
+    }
+}
+
+static void blk_pr_out_complete_bh(void *opaque)
+{
+    BlkPrOutCB *acb = opaque;
+    assert(acb->has_returned);
+    blk_pr_out_complete(acb);
+}
+
+static BlockAIOCB *blk_aio_pr_out(BlockBackend *blk, uint64_t old_key,
+                                  uint64_t new_key, BlockPrType type,
+                                  bool ignore_key, bool abort,
+                                  CoroutineEntry co_entry,
+                                  BlockCompletionFunc *cb, void *opaque)
+{
+    BlkPrOutCB *acb;
+    Coroutine *co;
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_pr_out_aiocb_info, blk, cb, opaque);
+    acb->prco = (BlkPrOutCo) {
+        .blk        = blk,
+        .old_key    = old_key,
+        .new_key    = new_key,
+        .type       = type,
+        .ignore_key = ignore_key,
+        .abort      = abort,
+        .ret        = NOT_DONE,
+    };
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(co_entry, acb);
+    aio_co_enter(qemu_get_current_aio_context(), co);
+
+    acb->has_returned = true;
+    if (acb->prco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
+                                         blk_pr_out_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+static int coroutine_fn
+blk_aio_pr_do_register(BlockBackend *blk, uint64_t old_key,
+                       uint64_t new_key, BlockPrType type,
+                       bool ignore_key)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (!blk_co_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
+    return bdrv_co_pr_register(blk_bs(blk), old_key, new_key, type, ignore_key);
+}
+
+static void coroutine_fn blk_aio_pr_register_entry(void *opaque)
+{
+    BlkPrOutCB *acb = opaque;
+    BlkPrOutCo *prco = &acb->prco;
+
+    prco->ret = blk_aio_pr_do_register(prco->blk, prco->old_key, prco->new_key,
+                                       prco->type, prco->ignore_key);
+    blk_pr_out_complete(acb);
+}
+
+BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key,
+                                uint64_t new_key, BlockPrType type,
+                                bool ignore_key, BlockCompletionFunc *cb,
+                                void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_out(blk, old_key, new_key, type, ignore_key, false,
+                          blk_aio_pr_register_entry, cb, opaque);
+}
+
+static int coroutine_fn
+blk_aio_pr_do_reserve(BlockBackend *blk, uint64_t key, BlockPrType type)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (!blk_co_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
+    return bdrv_co_pr_reserve(blk_bs(blk), key, type);
+}
+
+static void coroutine_fn blk_aio_pr_reserve_entry(void *opaque)
+{
+    BlkPrOutCB *acb = opaque;
+    BlkPrOutCo *prco = &acb->prco;
+
+    prco->ret = blk_aio_pr_do_reserve(prco->blk, prco->old_key, prco->type);
+    blk_pr_out_complete(acb);
+}
+
+
+BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk,  uint64_t key,
+                               BlockPrType type, BlockCompletionFunc *cb,
+                               void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_out(blk, key, 0, type, false, false,
+                          blk_aio_pr_reserve_entry, cb, opaque);
+}
+
+static int coroutine_fn
+blk_aio_pr_do_release(BlockBackend *blk, uint64_t key, BlockPrType type)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (!blk_co_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
+    return bdrv_co_pr_release(blk_bs(blk), key, type);
+}
+
+static void coroutine_fn blk_aio_pr_release_entry(void *opaque)
+{
+    BlkPrOutCB *acb = opaque;
+    BlkPrOutCo *prco = &acb->prco;
+
+    prco->ret = blk_aio_pr_do_release(prco->blk, prco->old_key, prco->type);
+    blk_pr_out_complete(acb);
+}
+
+
+BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key,
+                               BlockPrType type, BlockCompletionFunc *cb,
+                               void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_out(blk, key, 0, type, false, false,
+                          blk_aio_pr_release_entry, cb, opaque);
+}
+
+static int coroutine_fn
+blk_aio_pr_do_clear(BlockBackend *blk, uint64_t key)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (!blk_co_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
+    return bdrv_co_pr_clear(blk_bs(blk), key);
+}
+
+static void coroutine_fn blk_aio_pr_clear_entry(void *opaque)
+{
+    BlkPrOutCB *acb = opaque;
+    BlkPrOutCo *prco = &acb->prco;
+
+    prco->ret = blk_aio_pr_do_clear(prco->blk, prco->old_key);
+    blk_pr_out_complete(acb);
+}
+
+
+BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key,
+                             BlockCompletionFunc *cb, void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_out(blk, key, 0, 0, false, false,
+                          blk_aio_pr_clear_entry, cb, opaque);
+}
+
+static int coroutine_fn
+blk_aio_pr_do_preempt(BlockBackend *blk, uint64_t old_key,
+                      uint64_t new_key, BlockPrType type, bool abort)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (!blk_co_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
+    return bdrv_co_pr_preempt(blk_bs(blk), old_key, new_key, type, abort);
+}
+
+static void coroutine_fn blk_aio_pr_preempt_entry(void *opaque)
+{
+    BlkPrOutCB *acb = opaque;
+    BlkPrOutCo *prco = &acb->prco;
+
+    prco->ret = blk_aio_pr_do_preempt(prco->blk, prco->old_key,
+                                      prco->new_key, prco->type,
+                                      prco->abort);
+    blk_pr_out_complete(acb);
+}
+
+
+BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t old_key,
+                               uint64_t new_key, BlockPrType type,
+                               bool abort, BlockCompletionFunc *cb,
+                               void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_out(blk, old_key, new_key, type, false, abort,
+                          blk_aio_pr_preempt_entry, cb, opaque);
+}
+
 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
 static int coroutine_fn
 blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes)
diff --git a/block/io.c b/block/io.c
index 7217cf811b..98a358c6b2 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3220,6 +3220,167 @@  out:
     return co.ret;
 }
 
+int coroutine_fn bdrv_co_pr_read_keys(BlockDriverState *bs,
+                 uint32_t *generation, uint32_t num_keys,
+                 uint64_t *keys)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_pr_read_keys) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    co.ret = drv->bdrv_co_pr_read_keys(bs, generation, num_keys, keys);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int coroutine_fn bdrv_co_pr_read_reservation(BlockDriverState *bs,
+                 uint32_t *generation, uint64_t *key, BlockPrType *type)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_pr_read_reservation) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    co.ret = drv->bdrv_co_pr_read_reservation(bs, generation, key, type);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int coroutine_fn bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key,
+                 uint64_t new_key, BlockPrType type, bool ignore_key)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_pr_register) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    co.ret = drv->bdrv_co_pr_register(bs, old_key, new_key, type, ignore_key);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int coroutine_fn bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key,
+                                    BlockPrType type)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_pr_reserve) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    co.ret = drv->bdrv_co_pr_reserve(bs, key, type);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int coroutine_fn bdrv_co_pr_release(BlockDriverState *bs, uint64_t key,
+                                    BlockPrType type)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_pr_release) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    co.ret = drv->bdrv_co_pr_release(bs, key, type);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int coroutine_fn bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_pr_clear) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    co.ret = drv->bdrv_co_pr_clear(bs, key);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int coroutine_fn bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t old_key,
+                       uint64_t new_key, BlockPrType type, bool abort)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_pr_preempt) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    co.ret = drv->bdrv_co_pr_preempt(bs, old_key, new_key, type, abort);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
 int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
                         unsigned int *nr_zones,
                         BlockZoneDescriptor *zones)
diff --git a/include/block/block-common.h b/include/block/block-common.h
index a846023a09..c078d69f79 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -524,6 +524,15 @@  typedef enum {
     BDRV_FIX_ERRORS   = 2,
 } BdrvCheckMode;
 
+typedef enum {
+    BLK_PR_WRITE_EXCLUSIVE              = 0x1,
+    BLK_PR_EXCLUSIVE_ACCESS             = 0x2,
+    BLK_PR_WRITE_EXCLUSIVE_REGS_ONLY    = 0x3,
+    BLK_PR_EXCLUSIVE_ACCESS_REGS_ONLY   = 0x4,
+    BLK_PR_WRITE_EXCLUSIVE_ALL_REGS     = 0x5,
+    BLK_PR_EXCLUSIVE_ACCESS_ALL_REGS    = 0x6,
+} BlockPrType;
+
 typedef struct BlockSizes {
     uint32_t phys;
     uint32_t log;
diff --git a/include/block/block-io.h b/include/block/block-io.h
index b49e0537dd..f2827cd74e 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -106,6 +106,25 @@  void bdrv_aio_cancel_async(BlockAIOCB *acb);
 int coroutine_fn GRAPH_RDLOCK
 bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
 
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_read_keys(BlockDriverState *bs, uint32_t *generation,
+                     uint32_t num_keys, uint64_t *keys);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_read_reservation(BlockDriverState *bs, uint32_t *generation,
+                            uint64_t *key, BlockPrType *type);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key,
+                    uint64_t new_key, BlockPrType type, bool ignore_key);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, BlockPrType type);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, BlockPrType type);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t old_key, uint64_t new_key,
+                   BlockPrType type, bool abort);
+
 /* Ensure contents are flushed to disk.  */
 int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs);
 
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 761276127e..e5526f1d04 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -766,6 +766,37 @@  struct BlockDriver {
     int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_ioctl)(
         BlockDriverState *bs, unsigned long int req, void *buf);
 
+    /**
+     * bdrv_co_pr_read_keys is called to read the registered keys,
+     * @num_keys is the maximum number of keys that can be transmitted.
+     * On success, store generation in @generation and store keys @keys
+     * and return the number of @keys.
+     * On failure return -errno.
+     */
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_keys)(
+        BlockDriverState *bs, uint32_t *generation,
+        uint32_t num_keys, uint64_t *keys);
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_reservation)(
+        BlockDriverState *bs, uint32_t *generation,
+        uint64_t *key, BlockPrType *type);
+
+    /**
+     * Persist Through Power Loss(PTPL) is considered as required in QEMU
+     * block layer, the block driver need always enable PTPL.
+     */
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_register)(
+        BlockDriverState *bs, uint64_t old_key,
+        uint64_t new_key, BlockPrType type, bool ignore_key);
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_reserve)(
+        BlockDriverState *bs, uint64_t key, BlockPrType type);
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_release)(
+        BlockDriverState *bs, uint64_t key, BlockPrType type);
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_clear)(
+        BlockDriverState *bs, uint64_t key);
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_preempt)(
+        BlockDriverState *bs, uint64_t old_key,
+        uint64_t new_key, BlockPrType type, bool abort);
+
     /*
      * Returns 0 for completed check, -errno for internal errors.
      * The check results are stored in result.
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index d174275a5c..f87c61c06e 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -62,6 +62,28 @@  void blk_aio_cancel_async(BlockAIOCB *acb);
 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
                           BlockCompletionFunc *cb, void *opaque);
 
+BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation,
+                                 uint32_t num_keys, uint64_t *keys,
+                                 BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation,
+                                        uint64_t *key, BlockPrType *type,
+                                        BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key,
+                                uint64_t new_key, BlockPrType type,
+                                bool ignore_key, BlockCompletionFunc *cb,
+                                void *opaque);
+BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk,  uint64_t key,
+                               BlockPrType type, BlockCompletionFunc *cb,
+                               void *opaque);
+BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key,
+                               BlockPrType type, BlockCompletionFunc *cb,
+                               void *opaque);
+BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key,
+                             BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t old_key,
+                               uint64_t new_key, BlockPrType type, bool abort,
+                               BlockCompletionFunc *cb, void *opaque);
+
 void blk_inc_in_flight(BlockBackend *blk);
 void blk_dec_in_flight(BlockBackend *blk);