diff mbox series

[v6,1/4] file-posix: add tracking of the zone write pointers

Message ID 20230310103106.62124-2-faithilikerun@gmail.com
State New
Headers show
Series Add zone append write for zoned device | expand

Commit Message

Sam Li March 10, 2023, 10:31 a.m. UTC
Since Linux doesn't have a user API to issue zone append operations to
zoned devices from user space, the file-posix driver is modified to add
zone append emulation using regular writes. To do this, the file-posix
driver tracks the write pointer (wp) location of every zone of the device
in an array of uint64_t. The most significant bit of each wp entry
indicates whether the zone is a conventional zone.

A zone's wp can change as a result of the following operations:
- zone reset: sets the wp to the start offset of that zone
- zone finish: sets the wp to the end location of that zone
- write to a zone
- zone append

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/file-posix.c               | 159 ++++++++++++++++++++++++++++++-
 include/block/block-common.h     |  14 +++
 include/block/block_int-common.h |   3 +
 3 files changed, 172 insertions(+), 4 deletions(-)

Comments

Dmitry Fomichev March 14, 2023, 2:23 a.m. UTC | #1
On Fri, 2023-03-10 at 18:31 +0800, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is conventional zones.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/file-posix.c               | 159 ++++++++++++++++++++++++++++++-
>  include/block/block-common.h     |  14 +++
>  include/block/block_int-common.h |   3 +
>  3 files changed, 172 insertions(+), 4 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 563acc76ae..61ed769ac8 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1324,6 +1324,77 @@ static int hdev_get_max_segments(int fd, struct stat
> *st)
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +                        unsigned int nrz) {
> +    struct blk_zone *blkz;
> +    size_t rep_size;
> +    uint64_t sector = offset >> BDRV_SECTOR_BITS;
> +    int ret, n = 0, i = 0;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +
> +    rep = g_malloc(rep_size);
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = sector;
> +        rep->nr_zones = nrz - n;
> +
> +        do {
> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> +        } while (ret != 0 && errno == EINTR);
> +        if (ret != 0) {
> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +                    fd, offset, errno);
> +            return -errno;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            /*
> +             * The wp tracking cares only about sequential writes required and
> +             * sequential write preferred zones so that the wp can advance to
> +             * the right location.
> +             * Use the most significant bit of the wp location to indicate the
> +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> +             */
> +            if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> +                wps->wp[i] = 1ULL << 63;
> +            } else {
> +                switch(blkz[i].cond) {
> +                case BLK_ZONE_COND_FULL:
> +                case BLK_ZONE_COND_READONLY:
> +                    /* Zone not writable */
> +                    wps->wp[i] = (blkz[i].start + blkz[i].len) <<
> BDRV_SECTOR_BITS;
> +                    break;
> +                case BLK_ZONE_COND_OFFLINE:
> +                    /* Zone not writable nor readable */
> +                    wps->wp[i] = (blkz[i].start) << BDRV_SECTOR_BITS;
> +                    break;
> +                default:
> +                    wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> +                    break;
> +                }
> +            }
> +        }
> +        sector = blkz[i - 1].start + blkz[i - 1].len;
> +    }
> +
> +    return 0;
> +}
> +
> +static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +                            unsigned int nrz) {
> +    if (get_zones_wp(fd, wps, offset, nrz) < 0) {
> +        error_report("update zone wp failed");
> +    }
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -1413,6 +1484,21 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>          if (ret >= 0) {
>              bs->bl.max_active_zones = ret;
>          }
> +
> +        ret = get_sysfs_long_val(&st, "physical_block_size");
> +        if (ret >= 0) {
> +            bs->bl.write_granularity = ret;
> +        }
> +
> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) +
> +                sizeof(int64_t) * bs->bl.nr_zones);
> +        ret = get_zones_wp(s->fd, bs->bl.wps, 0, bs->bl.nr_zones);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "report wps failed");
> +            g_free(bs->bl.wps);
> +            return;
> +        }
> +        qemu_co_mutex_init(&bs->bl.wps->colock);
>          return;
>      }
>  out:
> @@ -2338,9 +2424,15 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs,
> uint64_t offset,
>  {
>      BDRVRawState *s = bs->opaque;
>      RawPosixAIOData acb;
> +    int ret;
>  
>      if (fd_open(bs) < 0)
>          return -EIO;
> +#if defined(CONFIG_BLKZONED)
> +    if (bs->bl.wps) {
> +        qemu_co_mutex_lock(&bs->bl.wps->colock);
> +    }
> +#endif
>  
>      /*
>       * When using O_DIRECT, the request must be aligned to be able to use
> @@ -2354,14 +2446,16 @@ static int coroutine_fn raw_co_prw(BlockDriverState
> *bs, uint64_t offset,
>      } else if (s->use_linux_io_uring) {
>          LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
>          assert(qiov->size == bytes);
> -        return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
> +        ret = luring_co_submit(bs, aio, s->fd, offset, qiov, type);
> +        goto out;
>  #endif
>  #ifdef CONFIG_LINUX_AIO
>      } else if (s->use_linux_aio) {
>          LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
>          assert(qiov->size == bytes);
> -        return laio_co_submit(bs, aio, s->fd, offset, qiov, type,
> +        ret = laio_co_submit(bs, aio, s->fd, offset, qiov, type,
>                                s->aio_max_batch);
> +        goto out;
>  #endif
>      }
>  
> @@ -2378,7 +2472,32 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs,
> uint64_t offset,
>      };
>  
>      assert(qiov->size == bytes);
> -    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> +    ret = raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> +
> +out:
> +#if defined(CONFIG_BLKZONED)
> +    BlockZoneWps *wps = bs->bl.wps;
> +    if (ret == 0) {
> +        if (type & QEMU_AIO_WRITE && wps && bs->bl.zone_size) {
> +            int index = offset / bs->bl.zone_size;

It might be cleaner to define
uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
here instead of the index and use *wp in the subsequent code.

> +            if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +                /* Advance the wp if needed */
> +                if (offset + bytes > wps->wp[index]) {
> +                    wps->wp[index] = offset + bytes;
> +                }
> +            }
> +        }
> +    } else {
> +        if (type & QEMU_AIO_WRITE) {
> +            update_zones_wp(s->fd, bs->bl.wps, 0, 1);
> +        }
> +    }
> +
> +    if (wps) {
> +        qemu_co_mutex_unlock(&wps->colock);
> +    }
> +#endif
> +    return ret;
>  }
>  
>  static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
> @@ -2486,6 +2605,11 @@ static void raw_close(BlockDriverState *bs)
>      BDRVRawState *s = bs->opaque;
>  
>      if (s->fd >= 0) {
> +#if defined(CONFIG_BLKZONED)
> +        if (bs->bl.wps) {
> +            g_free(bs->bl.wps);
> +        }
> +#endif
>          qemu_close(s->fd);
>          s->fd = -1;
>      }
> @@ -3285,6 +3409,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState
> *bs, BlockZoneOp op,
>      const char *op_name;
>      unsigned long zo;
>      int ret;
> +    BlockZoneWps *wps = bs->bl.wps;
>      int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
>  
>      zone_size = bs->bl.zone_size;
> @@ -3302,6 +3427,14 @@ static int coroutine_fn
> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>          return -EINVAL;
>      }
>  
> +    qemu_co_mutex_lock(&wps->colock);
> +    uint32_t index = offset / bs->bl.zone_size;
> +    if (BDRV_ZT_IS_CONV(wps->wp[index]) && len != capacity) {

The wps->wp[index] expression is used a lot in this function. Consider defining
uint64_t *wp = &wps->wp[index]; to simplify the code.

> +        error_report("zone mgmt operations are not allowed for conventional
> zones");
> +        ret = -EIO;
> +        goto out;
> +    }
> +
>      switch (op) {
>      case BLK_ZO_OPEN:
>          op_name = "BLKOPENZONE";
> @@ -3321,7 +3454,8 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState
> *bs, BlockZoneOp op,
>          break;
>      default:
>          error_report("Unsupported zone op: 0x%x", op);
> -        return -ENOTSUP;
> +        ret = -ENOTSUP;
> +        goto out;
>      }
>  
>      acb = (RawPosixAIOData) {
> @@ -3339,10 +3473,27 @@ static int coroutine_fn
> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>                          len >> BDRV_SECTOR_BITS);
>      ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
>      if (ret != 0) {
> +        update_zones_wp(s->fd, wps, offset, index);
>          ret = -errno;
>          error_report("ioctl %s failed %d", op_name, ret);
> +        goto out;
>      }
>  
> +    if (zo == BLKRESETZONE && len == capacity) {
> +        for (int i = 0; i < bs->bl.nr_zones; ++i) {
> +            if (!BDRV_ZT_IS_CONV(wps->wp[i])) {
> +                wps->wp[i] = i * bs->bl.zone_size;

This will reset write pointers of all read-only zones that may exist on the
device and make the data stored in those zones unreadable. R/O zones need to be
skipped in this loop.

> +            }
> +        }
> +    } else if (zo == BLKRESETZONE) {
> +        wps->wp[index] = offset;
> +    } else if (zo == BLKFINISHZONE) {
> +        /* The zoned device allows the last zone smaller that the zone size.
> */
> +        wps->wp[index] = offset + len;
> +    }
> +
> +out:
> +    qemu_co_mutex_unlock(&wps->colock);
>      return ret;
>  }
>  #endif
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index 1576fcf2ed..93196229ac 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -118,6 +118,14 @@ typedef struct BlockZoneDescriptor {
>      BlockZoneState state;
>  } BlockZoneDescriptor;
>  
> +/*
> + * Track write pointers of a zone in bytes.
> + */
> +typedef struct BlockZoneWps {
> +    CoMutex colock;
> +    uint64_t wp[];
> +} BlockZoneWps;
> +
>  typedef struct BlockDriverInfo {
>      /* in bytes, 0 if irrelevant */
>      int cluster_size;
> @@ -240,6 +248,12 @@ typedef enum {
>  #define BDRV_SECTOR_BITS   9
>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>  
> +/*
> + * Get the first most significant bit of wp. If it is zero, then
> + * the zone type is SWR.
> + */
> +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
> +
>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>                                             INT_MAX >> BDRV_SECTOR_BITS)
>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> diff --git a/include/block/block_int-common.h b/include/block/block_int-
> common.h
> index 1bd2aef4d5..19915b34af 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -884,6 +884,9 @@ typedef struct BlockLimits {
>  
>      /* maximum number of active zones */
>      int64_t max_active_zones;
> +
> +    /* array of write pointers' location of each zone in the zoned device. */
> +    BlockZoneWps *wps;
>  } BlockLimits;
>  
>  typedef struct BdrvOpBlocker BdrvOpBlocker;
Damien Le Moal March 14, 2023, 3:49 a.m. UTC | #2
On 3/14/23 11:23, Dmitry Fomichev wrote:
>> @@ -3339,10 +3473,27 @@ static int coroutine_fn
>> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>                          len >> BDRV_SECTOR_BITS);
>>      ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
>>      if (ret != 0) {
>> +        update_zones_wp(s->fd, wps, offset, index);
>>          ret = -errno;
>>          error_report("ioctl %s failed %d", op_name, ret);
>> +        goto out;
>>      }
>>  
>> +    if (zo == BLKRESETZONE && len == capacity) {
>> +        for (int i = 0; i < bs->bl.nr_zones; ++i) {
>> +            if (!BDRV_ZT_IS_CONV(wps->wp[i])) {
>> +                wps->wp[i] = i * bs->bl.zone_size;
> 
> This will reset write pointers of all read-only zones that may exist on the
> device and make the data stored in those zones unreadable. R/O zones need to be
> skipped in this loop.

And offline zones need to be skipped as well.
Sam Li March 15, 2023, 12:59 p.m. UTC | #3
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2023年3月14日周二 11:49写道:
>
> On 3/14/23 11:23, Dmitry Fomichev wrote:
> >> @@ -3339,10 +3473,27 @@ static int coroutine_fn
> >> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>                          len >> BDRV_SECTOR_BITS);
> >>      ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> >>      if (ret != 0) {
> >> +        update_zones_wp(s->fd, wps, offset, index);
> >>          ret = -errno;
> >>          error_report("ioctl %s failed %d", op_name, ret);
> >> +        goto out;
> >>      }
> >>
> >> +    if (zo == BLKRESETZONE && len == capacity) {
> >> +        for (int i = 0; i < bs->bl.nr_zones; ++i) {
> >> +            if (!BDRV_ZT_IS_CONV(wps->wp[i])) {
> >> +                wps->wp[i] = i * bs->bl.zone_size;
> >
> > This will reset write pointers of all read-only zones that may exist on the
> > device and make the data stored in those zones unreadable. R/O zones need to be
> > skipped in this loop.
>
> And offline zones need to be skipped as well.

I see. That can be done thanks to get_zones_wp(), which can show the
state of the zone at a specific position.

Sam
Damien Le Moal March 15, 2023, 9:23 p.m. UTC | #4
On 3/15/23 21:59, Sam Li wrote:
> Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2023年3月14日周二 11:49写道:
>>
>> On 3/14/23 11:23, Dmitry Fomichev wrote:
>>>> @@ -3339,10 +3473,27 @@ static int coroutine_fn
>>>> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>>                          len >> BDRV_SECTOR_BITS);
>>>>      ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
>>>>      if (ret != 0) {
>>>> +        update_zones_wp(s->fd, wps, offset, index);
>>>>          ret = -errno;
>>>>          error_report("ioctl %s failed %d", op_name, ret);
>>>> +        goto out;
>>>>      }
>>>>
>>>> +    if (zo == BLKRESETZONE && len == capacity) {
>>>> +        for (int i = 0; i < bs->bl.nr_zones; ++i) {
>>>> +            if (!BDRV_ZT_IS_CONV(wps->wp[i])) {
>>>> +                wps->wp[i] = i * bs->bl.zone_size;
>>>
>>> This will reset write pointers of all read-only zones that may exist on the
>>> device and make the data stored in those zones unreadable. R/O zones need to be
>>> skipped in this loop.
>>
>> And offline zones need to be skipped as well.
> 
> I see. That can be done thanks to get_zones_wp() which can show the
> state of the zone at specific position.

I do not think so: a zone's wp is invalid for read-only and offline zones, so you
cannot rely on the wp value to detect these states. Even a valid wp value would
not tell you whether the zone is read-only or offline anyway. You need to track
these states with flags that are set when doing the first zone report on startup
and when doing a zone report after an I/O error.

> 
> Sam
Stefan Hajnoczi March 16, 2023, 6:51 p.m. UTC | #5
On Fri, Mar 10, 2023 at 06:31:03PM +0800, Sam Li wrote:
> @@ -2338,9 +2424,15 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
>  {
>      BDRVRawState *s = bs->opaque;
>      RawPosixAIOData acb;
> +    int ret;
>  
>      if (fd_open(bs) < 0)
>          return -EIO;
> +#if defined(CONFIG_BLKZONED)
> +    if (bs->bl.wps) {
> +        qemu_co_mutex_lock(&bs->bl.wps->colock);
> +    }
> +#endif

Is the lock only needed by QEMU_AIO_WRITE requests? If yes, can we skip
it for other request types to avoid serializing those requests?
diff mbox series

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index 563acc76ae..61ed769ac8 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1324,6 +1324,77 @@  static int hdev_get_max_segments(int fd, struct stat *st)
 #endif
 }
 
+#if defined(CONFIG_BLKZONED)
+static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
+                        unsigned int nrz) {
+    struct blk_zone *blkz;
+    size_t rep_size;
+    uint64_t sector = offset >> BDRV_SECTOR_BITS;
+    int ret, n = 0, i = 0;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+
+    rep = g_malloc(rep_size);
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                    fd, offset, errno);
+            return -errno;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            /*
+             * The wp tracking cares only about sequential writes required and
+             * sequential write preferred zones so that the wp can advance to
+             * the right location.
+             * Use the most significant bit of the wp location to indicate the
+             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
+             */
+            if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+                wps->wp[i] = 1ULL << 63;
+            } else {
+                switch(blkz[i].cond) {
+                case BLK_ZONE_COND_FULL:
+                case BLK_ZONE_COND_READONLY:
+                    /* Zone not writable */
+                    wps->wp[i] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
+                    break;
+                case BLK_ZONE_COND_OFFLINE:
+                    /* Zone not writable nor readable */
+                    wps->wp[i] = (blkz[i].start) << BDRV_SECTOR_BITS;
+                    break;
+                default:
+                    wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
+                    break;
+                }
+            }
+        }
+        sector = blkz[i - 1].start + blkz[i - 1].len;
+    }
+
+    return 0;
+}
+
+static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
+                            unsigned int nrz) {
+    if (get_zones_wp(fd, wps, offset, nrz) < 0) {
+        error_report("update zone wp failed");
+    }
+}
+#endif
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
@@ -1413,6 +1484,21 @@  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
         if (ret >= 0) {
             bs->bl.max_active_zones = ret;
         }
+
+        ret = get_sysfs_long_val(&st, "physical_block_size");
+        if (ret >= 0) {
+            bs->bl.write_granularity = ret;
+        }
+
+        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) +
+                sizeof(int64_t) * bs->bl.nr_zones);
+        ret = get_zones_wp(s->fd, bs->bl.wps, 0, bs->bl.nr_zones);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "report wps failed");
+            g_free(bs->bl.wps);
+            return;
+        }
+        qemu_co_mutex_init(&bs->bl.wps->colock);
         return;
     }
 out:
@@ -2338,9 +2424,15 @@  static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
 {
     BDRVRawState *s = bs->opaque;
     RawPosixAIOData acb;
+    int ret;
 
     if (fd_open(bs) < 0)
         return -EIO;
+#if defined(CONFIG_BLKZONED)
+    if (bs->bl.wps) {
+        qemu_co_mutex_lock(&bs->bl.wps->colock);
+    }
+#endif
 
     /*
      * When using O_DIRECT, the request must be aligned to be able to use
@@ -2354,14 +2446,16 @@  static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
     } else if (s->use_linux_io_uring) {
         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
         assert(qiov->size == bytes);
-        return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
+        ret = luring_co_submit(bs, aio, s->fd, offset, qiov, type);
+        goto out;
 #endif
 #ifdef CONFIG_LINUX_AIO
     } else if (s->use_linux_aio) {
         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
         assert(qiov->size == bytes);
-        return laio_co_submit(bs, aio, s->fd, offset, qiov, type,
+        ret = laio_co_submit(bs, aio, s->fd, offset, qiov, type,
                               s->aio_max_batch);
+        goto out;
 #endif
     }
 
@@ -2378,7 +2472,32 @@  static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
     };
 
     assert(qiov->size == bytes);
-    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
+    ret = raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
+
+out:
+#if defined(CONFIG_BLKZONED)
+    BlockZoneWps *wps = bs->bl.wps;
+    if (ret == 0) {
+        if (type & QEMU_AIO_WRITE && wps && bs->bl.zone_size) {
+            int index = offset / bs->bl.zone_size;
+            if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+                /* Advance the wp if needed */
+                if (offset + bytes > wps->wp[index]) {
+                    wps->wp[index] = offset + bytes;
+                }
+            }
+        }
+    } else {
+        if (type & QEMU_AIO_WRITE) {
+            update_zones_wp(s->fd, bs->bl.wps, 0, 1);
+        }
+    }
+
+    if (wps) {
+        qemu_co_mutex_unlock(&wps->colock);
+    }
+#endif
+    return ret;
 }
 
 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
@@ -2486,6 +2605,11 @@  static void raw_close(BlockDriverState *bs)
     BDRVRawState *s = bs->opaque;
 
     if (s->fd >= 0) {
+#if defined(CONFIG_BLKZONED)
+        if (bs->bl.wps) {
+            g_free(bs->bl.wps);
+        }
+#endif
         qemu_close(s->fd);
         s->fd = -1;
     }
@@ -3285,6 +3409,7 @@  static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
     const char *op_name;
     unsigned long zo;
     int ret;
+    BlockZoneWps *wps = bs->bl.wps;
     int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
 
     zone_size = bs->bl.zone_size;
@@ -3302,6 +3427,14 @@  static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
         return -EINVAL;
     }
 
+    qemu_co_mutex_lock(&wps->colock);
+    uint32_t index = offset / bs->bl.zone_size;
+    if (BDRV_ZT_IS_CONV(wps->wp[index]) && len != capacity) {
+        error_report("zone mgmt operations are not allowed for conventional zones");
+        ret = -EIO;
+        goto out;
+    }
+
     switch (op) {
     case BLK_ZO_OPEN:
         op_name = "BLKOPENZONE";
@@ -3321,7 +3454,8 @@  static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
         break;
     default:
         error_report("Unsupported zone op: 0x%x", op);
-        return -ENOTSUP;
+        ret = -ENOTSUP;
+        goto out;
     }
 
     acb = (RawPosixAIOData) {
@@ -3339,10 +3473,27 @@  static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
                         len >> BDRV_SECTOR_BITS);
     ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
     if (ret != 0) {
+        update_zones_wp(s->fd, wps, offset, index);
         ret = -errno;
         error_report("ioctl %s failed %d", op_name, ret);
+        goto out;
     }
 
+    if (zo == BLKRESETZONE && len == capacity) {
+        for (int i = 0; i < bs->bl.nr_zones; ++i) {
+            if (!BDRV_ZT_IS_CONV(wps->wp[i])) {
+                wps->wp[i] = i * bs->bl.zone_size;
+            }
+        }
+    } else if (zo == BLKRESETZONE) {
+        wps->wp[index] = offset;
+    } else if (zo == BLKFINISHZONE) {
+        /* The zoned device allows the last zone smaller than the zone size. */
+        wps->wp[index] = offset + len;
+    }
+
+out:
+    qemu_co_mutex_unlock(&wps->colock);
     return ret;
 }
 #endif
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 1576fcf2ed..93196229ac 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -118,6 +118,14 @@  typedef struct BlockZoneDescriptor {
     BlockZoneState state;
 } BlockZoneDescriptor;
 
+/*
+ * Track the write pointer of each zone, in bytes.
+ */
+typedef struct BlockZoneWps {
+    CoMutex colock;
+    uint64_t wp[];
+} BlockZoneWps;
+
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
@@ -240,6 +248,12 @@  typedef enum {
 #define BDRV_SECTOR_BITS   9
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
 
+/*
+ * Test the most significant bit of wp: it is set for conventional
+ * zones and clear for sequential (SWR/SWP) zones.
+ */
+#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
+
 #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                            INT_MAX >> BDRV_SECTOR_BITS)
 #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 1bd2aef4d5..19915b34af 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -884,6 +884,9 @@  typedef struct BlockLimits {
 
     /* maximum number of active zones */
     int64_t max_active_zones;
+
+    /* array of write pointers' location of each zone in the zoned device. */
+    BlockZoneWps *wps;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;