diff mbox

sheepdog: serialize requests to overwrapping area

Message ID 1437151464-5458-1-git-send-email-mitake.hitoshi@lab.ntt.co.jp
State New
Headers show

Commit Message

Hitoshi Mitake July 17, 2015, 4:44 p.m. UTC
Current sheepdog driver only serializes create requests in oid
units. This mechanism isn't enough for handling requests to an
overwrapping area spanning multiple oids, so it can result in bugs like
the one below:
https://bugs.launchpad.net/sheepdog-project/+bug/1456421

This patch adds a new serialization mechanism for the problem. The
difference from the old one is:
1. serialize entire aiocb if their targeting areas overwrap
2. serialize all requests (read, write, and discard), not only creates

This patch also removes the old mechanism because the new one can be
an alternative.

Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Cc: Vasiliy Tolstov <v.tolstov@selfip.ru>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
---
 block/sheepdog.c | 152 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 71 insertions(+), 81 deletions(-)

Comments

Vasiliy Tolstov July 17, 2015, 4:49 p.m. UTC | #1
2015-07-17 19:44 GMT+03:00 Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>:
> Current sheepdog driver only serializes create requests in oid
> unit. This mechanism isn't enough for handling requests to
> overwrapping area spanning multiple oids, so it can result bugs like
> below:
> https://bugs.launchpad.net/sheepdog-project/+bug/1456421
>
> This patch adds a new serialization mechanism for the problem. The
> difference from the old one is:
> 1. serialize entire aiocb if their targetting areas overwrap
> 2. serialize all requests (read, write, and discard), not only creates
>
> This patch also removes the old mechanism because the new one can be
> an alternative.
>
> Cc: Kevin Wolf <kwolf@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> Cc: Vasiliy Tolstov <v.tolstov@selfip.ru>
> Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>


Tested-by: Vasiliy Tolstov <v.tolstov@selfip.ru>
Stefan Hajnoczi July 20, 2015, 3:46 p.m. UTC | #2
On Sat, Jul 18, 2015 at 01:44:24AM +0900, Hitoshi Mitake wrote:
> Current sheepdog driver only serializes create requests in oid
> unit. This mechanism isn't enough for handling requests to
> overwrapping area spanning multiple oids, so it can result bugs like
> below:
> https://bugs.launchpad.net/sheepdog-project/+bug/1456421
> 
> This patch adds a new serialization mechanism for the problem. The
> difference from the old one is:
> 1. serialize entire aiocb if their targetting areas overwrap
> 2. serialize all requests (read, write, and discard), not only creates
> 
> This patch also removes the old mechanism because the new one can be
> an alternative.
> 
> Cc: Kevin Wolf <kwolf@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> Cc: Vasiliy Tolstov <v.tolstov@selfip.ru>
> Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
> ---
>  block/sheepdog.c | 152 ++++++++++++++++++++++++++-----------------------------
>  1 file changed, 71 insertions(+), 81 deletions(-)

CCing Jeff Cody who now handles pull requests for network protocols.
Jeff Cody July 27, 2015, 3:23 p.m. UTC | #3
On Sat, Jul 18, 2015 at 01:44:24AM +0900, Hitoshi Mitake wrote:
> Current sheepdog driver only serializes create requests in oid
> unit. This mechanism isn't enough for handling requests to
> overwrapping area spanning multiple oids, so it can result bugs like
> below:
> https://bugs.launchpad.net/sheepdog-project/+bug/1456421
> 
> This patch adds a new serialization mechanism for the problem. The
> difference from the old one is:
> 1. serialize entire aiocb if their targetting areas overwrap
> 2. serialize all requests (read, write, and discard), not only creates
> 
> This patch also removes the old mechanism because the new one can be
> an alternative.
> 
> Cc: Kevin Wolf <kwolf@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> Cc: Vasiliy Tolstov <v.tolstov@selfip.ru>
> Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
> ---
>  block/sheepdog.c | 152 ++++++++++++++++++++++++++-----------------------------
>  1 file changed, 71 insertions(+), 81 deletions(-)
> 
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index bd7cbed..9585beb 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -318,6 +318,10 @@ enum AIOCBState {
>      AIOCB_DISCARD_OBJ,
>  };
>  
> +#define AIOCBOverwrapping(x, y)                                 \
> +    (!(x->max_affect_data_idx < y->min_affect_data_idx          \
> +       || y->max_affect_data_idx < x->min_affect_data_idx))
> +
>  struct SheepdogAIOCB {
>      BlockAIOCB common;
>  
> @@ -334,6 +338,11 @@ struct SheepdogAIOCB {
>  
>      bool cancelable;
>      int nr_pending;
> +
> +    uint32_t min_affect_data_idx;
> +    uint32_t max_affect_data_idx;
> +
> +    QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
>  };
>  
>  typedef struct BDRVSheepdogState {
> @@ -362,8 +371,10 @@ typedef struct BDRVSheepdogState {
>  
>      /* Every aio request must be linked to either of these queues. */
>      QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
> -    QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
>      QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
> +
> +    CoQueue overwrapping_queue;
> +    QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
>  } BDRVSheepdogState;
>  
>  static const char * sd_strerror(int err)
> @@ -498,13 +509,7 @@ static void sd_aio_cancel(BlockAIOCB *blockacb)
>      AIOReq *aioreq, *next;
>  
>      if (sd_acb_cancelable(acb)) {
> -        /* Remove outstanding requests from pending and failed queues.  */
> -        QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings,
> -                           next) {
> -            if (aioreq->aiocb == acb) {
> -                free_aio_req(s, aioreq);
> -            }
> -        }
> +        /* Remove outstanding requests from failed queue.  */
>          QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
>                             next) {
>              if (aioreq->aiocb == acb) {
> @@ -529,6 +534,10 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
>                                     int64_t sector_num, int nb_sectors)
>  {
>      SheepdogAIOCB *acb;
> +    uint32_t object_size;
> +    BDRVSheepdogState *s = bs->opaque;
> +
> +    object_size = (UINT32_C(1) << s->inode.block_size_shift);
>  
>      acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
>  
> @@ -542,6 +551,11 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
>      acb->coroutine = qemu_coroutine_self();
>      acb->ret = 0;
>      acb->nr_pending = 0;
> +
> +    acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
> +    acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
> +                              acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;
> +
>      return acb;
>  }
>  
> @@ -703,38 +717,6 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
>  static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
>  static void co_write_request(void *opaque);
>  
> -static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
> -{
> -    AIOReq *aio_req;
> -
> -    QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) {
> -        if (aio_req->oid == oid) {
> -            return aio_req;
> -        }
> -    }
> -
> -    return NULL;
> -}
> -
> -/*
> - * This function searchs pending requests to the object `oid', and
> - * sends them.
> - */
> -static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
> -{
> -    AIOReq *aio_req;
> -    SheepdogAIOCB *acb;
> -
> -    while ((aio_req = find_pending_req(s, oid)) != NULL) {
> -        acb = aio_req->aiocb;
> -        /* move aio_req from pending list to inflight one */
> -        QLIST_REMOVE(aio_req, aio_siblings);
> -        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
> -        add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
> -                        acb->aiocb_type);
> -    }
> -}
> -
>  static coroutine_fn void reconnect_to_sdog(void *opaque)
>  {
>      BDRVSheepdogState *s = opaque;
> @@ -840,12 +822,6 @@ static void coroutine_fn aio_read_response(void *opaque)
>                  s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
>                  s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
>              }
> -            /*
> -             * Some requests may be blocked because simultaneous
> -             * create requests are not allowed, so we search the
> -             * pending requests here.
> -             */
> -            send_pending_req(s, aio_req->oid);
>          }
>          break;
>      case AIOCB_READ_UDATA:
> @@ -1341,30 +1317,6 @@ out:
>      return ret;
>  }
>  
> -/* Return true if the specified request is linked to the pending list. */
> -static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req)
> -{
> -    AIOReq *areq;
> -    QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
> -        if (areq != aio_req && areq->oid == aio_req->oid) {
> -            /*
> -             * Sheepdog cannot handle simultaneous create requests to the same
> -             * object, so we cannot send the request until the previous request
> -             * finishes.
> -             */
> -            DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid);
> -            aio_req->flags = 0;
> -            aio_req->base_oid = 0;
> -            aio_req->create = false;
> -            QLIST_REMOVE(aio_req, aio_siblings);
> -            QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings);
> -            return true;
> -        }
> -    }
> -
> -    return false;
> -}
> -
>  static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
>  {
>      SheepdogAIOCB *acb = aio_req->aiocb;
> @@ -1379,10 +1331,6 @@ static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
>              goto out;
>          }
>  
> -        if (check_simultaneous_create(s, aio_req)) {
> -            return;
> -        }
> -
>          if (s->inode.data_vdi_id[idx]) {
>              aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
>              aio_req->flags |= SD_FLAG_CMD_COW;
> @@ -1458,8 +1406,8 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
>      filename = qemu_opt_get(opts, "filename");
>  
>      QLIST_INIT(&s->inflight_aio_head);
> -    QLIST_INIT(&s->pending_aio_head);
>      QLIST_INIT(&s->failed_aio_head);
> +    QLIST_INIT(&s->inflight_aiocb_head);
>      s->fd = -1;
>  
>      memset(vdi, 0, sizeof(vdi));
> @@ -1524,6 +1472,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
>      bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
>      pstrcpy(s->name, sizeof(s->name), vdi);
>      qemu_co_mutex_init(&s->lock);
> +    qemu_co_queue_init(&s->overwrapping_queue);
>      qemu_opts_del(opts);
>      g_free(buf);
>      return 0;
> @@ -2195,12 +2144,6 @@ static int coroutine_fn sd_co_rw_vector(void *p)
>                                  old_oid, done);
>          QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
>  
> -        if (create) {
> -            if (check_simultaneous_create(s, aio_req)) {
> -                goto done;
> -            }
> -        }
> -
>          add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
>                          acb->aiocb_type);
>      done:
> @@ -2215,6 +2158,20 @@ out:
>      return 1;
>  }
>  
> +static bool check_overwrapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
> +{
> +    SheepdogAIOCB *cb;
> +
> +    QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
> +        if (AIOCBOverwrapping(aiocb, cb)) {
> +            return true;
> +        }
> +    }
> +
> +    QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings);
> +    return false;
> +}
> +
>  static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
>                          int nb_sectors, QEMUIOVector *qiov)
>  {
> @@ -2234,14 +2191,25 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
>      acb->aio_done_func = sd_write_done;
>      acb->aiocb_type = AIOCB_WRITE_UDATA;
>  
> +retry:
> +    if (check_overwrapping_aiocb(s, acb)) {
> +        qemu_co_queue_wait(&s->overwrapping_queue);
> +        goto retry;
> +    }
> +
>      ret = sd_co_rw_vector(acb);
>      if (ret <= 0) {
> +        QLIST_REMOVE(acb, aiocb_siblings);
> +        qemu_co_queue_restart_all(&s->overwrapping_queue);
>          qemu_aio_unref(acb);
>          return ret;
>      }
>  
>      qemu_coroutine_yield();
>  
> +    QLIST_REMOVE(acb, aiocb_siblings);
> +    qemu_co_queue_restart_all(&s->overwrapping_queue);
> +
>      return acb->ret;
>  }
>  
> @@ -2250,19 +2218,30 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
>  {
>      SheepdogAIOCB *acb;
>      int ret;
> +    BDRVSheepdogState *s = bs->opaque;
>  
>      acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
>      acb->aiocb_type = AIOCB_READ_UDATA;
>      acb->aio_done_func = sd_finish_aiocb;
>  
> +retry:
> +    if (check_overwrapping_aiocb(s, acb)) {
> +        qemu_co_queue_wait(&s->overwrapping_queue);
> +        goto retry;
> +    }
> +
>      ret = sd_co_rw_vector(acb);
>      if (ret <= 0) {
> +        QLIST_REMOVE(acb, aiocb_siblings);
> +        qemu_co_queue_restart_all(&s->overwrapping_queue);
>          qemu_aio_unref(acb);
>          return ret;
>      }
>  
>      qemu_coroutine_yield();
>  
> +    QLIST_REMOVE(acb, aiocb_siblings);
> +    qemu_co_queue_restart_all(&s->overwrapping_queue);
>      return acb->ret;
>  }
>  
> @@ -2610,14 +2589,25 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
>      acb->aiocb_type = AIOCB_DISCARD_OBJ;
>      acb->aio_done_func = sd_finish_aiocb;
>  
> +retry:
> +    if (check_overwrapping_aiocb(s, acb)) {
> +        qemu_co_queue_wait(&s->overwrapping_queue);
> +        goto retry;
> +    }
> +
>      ret = sd_co_rw_vector(acb);
>      if (ret <= 0) {
> +        QLIST_REMOVE(acb, aiocb_siblings);
> +        qemu_co_queue_restart_all(&s->overwrapping_queue);
>          qemu_aio_unref(acb);
>          return ret;
>      }
>  
>      qemu_coroutine_yield();
>  
> +    QLIST_REMOVE(acb, aiocb_siblings);
> +    qemu_co_queue_restart_all(&s->overwrapping_queue);
> +
>      return acb->ret;
>  }
>  
> -- 
> 1.9.1
> 
>

Thanks, applied to my block branch:
https://github.com/codyprime/qemu-kvm-jtc/tree/block


Jeff
Vasiliy Tolstov July 27, 2015, 3:36 p.m. UTC | #4
2015-07-27 18:23 GMT+03:00 Jeff Cody <jcody@redhat.com>:
> Thanks, applied to my block branch:
> https://github.com/codyprime/qemu-kvm-jtc/tree/block


Thanks! Waiting for adding to qemu rc =)
Liu Yuan July 28, 2015, 8:50 a.m. UTC | #5
On Sat, Jul 18, 2015 at 01:44:24AM +0900, Hitoshi Mitake wrote:
> Current sheepdog driver only serializes create requests in oid
> unit. This mechanism isn't enough for handling requests to
> overwrapping area spanning multiple oids, so it can result bugs like
> below:
> https://bugs.launchpad.net/sheepdog-project/+bug/1456421

I'm a bit late to review the patch since I'm not on the cc list, but I'd like to
get the idea how the mentioned bug relates to the serialization of requests?

The mentioned bug looks to me more a bug of sheepdog because the create and
write request will only unref a single oid. The bug report is unclear about
why the object idx in inode becomes zero, at least not pointing that it relates
to QEMU.

But this patch assume QEMU send the requests the wrong way and just vaguely
says it is just wrong without reason.

What are overwrapping requests? As far as I understand, a request that strides
over two objects will be split into two, to make sure all the requests fit the
sheepdog object size. Allowing requests to run concurrently on different SD objects
is the way to achieve high performance. This patch mutes this feature, to me,
without a decent reason.

Probably I miss something hidden, but I'd like someone enlighten me about it
because this patch might slow down QEMU VM over sheepdog.

Thanks,
Yuan
Liu Yuan July 28, 2015, 9:35 a.m. UTC | #6
On Tue, Jul 28, 2015 at 04:50:08PM +0800, Liu Yuan wrote:
> On Sat, Jul 18, 2015 at 01:44:24AM +0900, Hitoshi Mitake wrote:
> > Current sheepdog driver only serializes create requests in oid
> > unit. This mechanism isn't enough for handling requests to
> > overwrapping area spanning multiple oids, so it can result bugs like
> > below:
> > https://bugs.launchpad.net/sheepdog-project/+bug/1456421
> 
> I'm a bit late to review the patch since I'm not on the cc list, but I'd like to
> get the idea how the mentioned bug relates to the serialization of requests?
> 
> The mentioned bug looks to me more a bug of sheepdog because the create and
> write request will only unref a single oid. The bug report is unclear about
> why the object idx in inode becomes zero, at least not pointing that it relates
> to QEMU.
> 
> But this patch assume QEMU send the requests the wrong way and just vaguely
> says it is just wrong without reason.
> 
> What is overrapping requests? As far as I understand, the request that stride
> over two objects will be split into two, to make sure all the requests fit the
> sheepdog object size. Allow requests run concurrently on different SD objects is
> way achieving high performance. This patch mutes this feature, to me, without a
> decent reason. 
> 
> Probably I miss something hidden, but I'd like someone enlighten me about it
> because this patch might slow down QEMU VM over sheepdog.
> 
> Thanks,
> Yuan

Cc Jeff
Liu Yuan July 28, 2015, 2:31 p.m. UTC | #7
On Mon, Jul 27, 2015 at 11:23:02AM -0400, Jeff Cody wrote:
> On Sat, Jul 18, 2015 at 01:44:24AM +0900, Hitoshi Mitake wrote:
> > Current sheepdog driver only serializes create requests in oid
> > unit. This mechanism isn't enough for handling requests to
> > overwrapping area spanning multiple oids, so it can result bugs like
> > below:
> > https://bugs.launchpad.net/sheepdog-project/+bug/1456421
> > 
> > This patch adds a new serialization mechanism for the problem. The
> > difference from the old one is:
> > 1. serialize entire aiocb if their targetting areas overwrap
> > 2. serialize all requests (read, write, and discard), not only creates
> > 
> > This patch also removes the old mechanism because the new one can be
> > an alternative.
> > 

Okay, I figured out what the problem is myself and allow me to try to make it
clear to non-sheepdog devs:

sheepdog volume is thin-provision, so for the first write, we create the object
internally, meaning that we need to handle write in two case:

1. write to non-allocated object, create it then update inode, so in this case
   two requests will be generated: create(oid), update_inode(oid_to_inode_idx)
2. write the allocated object, just write(oid).

Current sheepdog driver use a range update_inode(min_idx, max_idx) for batching
the updates. But there is subtle problem by determining min_idx and max_idx:

for a single create request, min_idx == max_idx, so actually we just update
one bit as expected.

Suppose we have 2 create requests, create(10) and create(20); then min == 10,
max == 20 even though we just need to update index 10 and index 20. update_inode(10,20)
will actually update the range from 10 to 20. This would work if the update_inode()
requests didn't overlap. But unfortunately, this is not true in some corner cases.
So the problem arise as following:

req 1: update_inode(10,20)
req 2: update_inode(15,22)

req 1 and req 2 might have different value between [15,20] and cause problems.

Based on above analysis, I think the real fix is to fix update_inode(), not
serialize all the requests in overkill way. The fix would be easy, considering
most update_inode() update only 1 index, we could just make update_inode a
single bit updater, not a range one, in which way we don't affect performance
as the above patch did. (I actually suspect that the above patch might not solve
the problem because update_inode() can overlap even with the patch).

If everyone agrees with my analysis, I'll post the fix.

Thanks,
Yuan
diff mbox

Patch

diff --git a/block/sheepdog.c b/block/sheepdog.c
index bd7cbed..9585beb 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -318,6 +318,10 @@  enum AIOCBState {
     AIOCB_DISCARD_OBJ,
 };
 
+#define AIOCBOverwrapping(x, y)                                 \
+    (!(x->max_affect_data_idx < y->min_affect_data_idx          \
+       || y->max_affect_data_idx < x->min_affect_data_idx))
+
 struct SheepdogAIOCB {
     BlockAIOCB common;
 
@@ -334,6 +338,11 @@  struct SheepdogAIOCB {
 
     bool cancelable;
     int nr_pending;
+
+    uint32_t min_affect_data_idx;
+    uint32_t max_affect_data_idx;
+
+    QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
 };
 
 typedef struct BDRVSheepdogState {
@@ -362,8 +371,10 @@  typedef struct BDRVSheepdogState {
 
     /* Every aio request must be linked to either of these queues. */
     QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
-    QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
     QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
+
+    CoQueue overwrapping_queue;
+    QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
 } BDRVSheepdogState;
 
 static const char * sd_strerror(int err)
@@ -498,13 +509,7 @@  static void sd_aio_cancel(BlockAIOCB *blockacb)
     AIOReq *aioreq, *next;
 
     if (sd_acb_cancelable(acb)) {
-        /* Remove outstanding requests from pending and failed queues.  */
-        QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings,
-                           next) {
-            if (aioreq->aiocb == acb) {
-                free_aio_req(s, aioreq);
-            }
-        }
+        /* Remove outstanding requests from failed queue.  */
         QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
                            next) {
             if (aioreq->aiocb == acb) {
@@ -529,6 +534,10 @@  static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
                                    int64_t sector_num, int nb_sectors)
 {
     SheepdogAIOCB *acb;
+    uint32_t object_size;
+    BDRVSheepdogState *s = bs->opaque;
+
+    object_size = (UINT32_C(1) << s->inode.block_size_shift);
 
     acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
 
@@ -542,6 +551,11 @@  static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
     acb->coroutine = qemu_coroutine_self();
     acb->ret = 0;
     acb->nr_pending = 0;
+
+    acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
+    acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
+                              acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;
+
     return acb;
 }
 
@@ -703,38 +717,6 @@  static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
 static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
 static void co_write_request(void *opaque);
 
-static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
-{
-    AIOReq *aio_req;
-
-    QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) {
-        if (aio_req->oid == oid) {
-            return aio_req;
-        }
-    }
-
-    return NULL;
-}
-
-/*
- * This function searchs pending requests to the object `oid', and
- * sends them.
- */
-static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
-{
-    AIOReq *aio_req;
-    SheepdogAIOCB *acb;
-
-    while ((aio_req = find_pending_req(s, oid)) != NULL) {
-        acb = aio_req->aiocb;
-        /* move aio_req from pending list to inflight one */
-        QLIST_REMOVE(aio_req, aio_siblings);
-        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
-        add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
-                        acb->aiocb_type);
-    }
-}
-
 static coroutine_fn void reconnect_to_sdog(void *opaque)
 {
     BDRVSheepdogState *s = opaque;
@@ -840,12 +822,6 @@  static void coroutine_fn aio_read_response(void *opaque)
                 s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
                 s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
             }
-            /*
-             * Some requests may be blocked because simultaneous
-             * create requests are not allowed, so we search the
-             * pending requests here.
-             */
-            send_pending_req(s, aio_req->oid);
         }
         break;
     case AIOCB_READ_UDATA:
@@ -1341,30 +1317,6 @@  out:
     return ret;
 }
 
-/* Return true if the specified request is linked to the pending list. */
-static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req)
-{
-    AIOReq *areq;
-    QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
-        if (areq != aio_req && areq->oid == aio_req->oid) {
-            /*
-             * Sheepdog cannot handle simultaneous create requests to the same
-             * object, so we cannot send the request until the previous request
-             * finishes.
-             */
-            DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid);
-            aio_req->flags = 0;
-            aio_req->base_oid = 0;
-            aio_req->create = false;
-            QLIST_REMOVE(aio_req, aio_siblings);
-            QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings);
-            return true;
-        }
-    }
-
-    return false;
-}
-
 static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
 {
     SheepdogAIOCB *acb = aio_req->aiocb;
@@ -1379,10 +1331,6 @@  static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
             goto out;
         }
 
-        if (check_simultaneous_create(s, aio_req)) {
-            return;
-        }
-
         if (s->inode.data_vdi_id[idx]) {
             aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
             aio_req->flags |= SD_FLAG_CMD_COW;
@@ -1458,8 +1406,8 @@  static int sd_open(BlockDriverState *bs, QDict *options, int flags,
     filename = qemu_opt_get(opts, "filename");
 
     QLIST_INIT(&s->inflight_aio_head);
-    QLIST_INIT(&s->pending_aio_head);
     QLIST_INIT(&s->failed_aio_head);
+    QLIST_INIT(&s->inflight_aiocb_head);
     s->fd = -1;
 
     memset(vdi, 0, sizeof(vdi));
@@ -1524,6 +1472,7 @@  static int sd_open(BlockDriverState *bs, QDict *options, int flags,
     bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
     pstrcpy(s->name, sizeof(s->name), vdi);
     qemu_co_mutex_init(&s->lock);
+    qemu_co_queue_init(&s->overwrapping_queue);
     qemu_opts_del(opts);
     g_free(buf);
     return 0;
@@ -2195,12 +2144,6 @@  static int coroutine_fn sd_co_rw_vector(void *p)
                                 old_oid, done);
         QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 
-        if (create) {
-            if (check_simultaneous_create(s, aio_req)) {
-                goto done;
-            }
-        }
-
         add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
                         acb->aiocb_type);
     done:
@@ -2215,6 +2158,20 @@  out:
     return 1;
 }
 
+static bool check_overwrapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
+{
+    SheepdogAIOCB *cb;
+
+    QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
+        if (AIOCBOverwrapping(aiocb, cb)) {
+            return true;
+        }
+    }
+
+    QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings);
+    return false;
+}
+
 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
 {
@@ -2234,14 +2191,25 @@  static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
     acb->aio_done_func = sd_write_done;
     acb->aiocb_type = AIOCB_WRITE_UDATA;
 
+retry:
+    if (check_overwrapping_aiocb(s, acb)) {
+        qemu_co_queue_wait(&s->overwrapping_queue);
+        goto retry;
+    }
+
     ret = sd_co_rw_vector(acb);
     if (ret <= 0) {
+        QLIST_REMOVE(acb, aiocb_siblings);
+        qemu_co_queue_restart_all(&s->overwrapping_queue);
         qemu_aio_unref(acb);
         return ret;
     }
 
     qemu_coroutine_yield();
 
+    QLIST_REMOVE(acb, aiocb_siblings);
+    qemu_co_queue_restart_all(&s->overwrapping_queue);
+
     return acb->ret;
 }
 
@@ -2250,19 +2218,30 @@  static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
 {
     SheepdogAIOCB *acb;
     int ret;
+    BDRVSheepdogState *s = bs->opaque;
 
     acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
     acb->aiocb_type = AIOCB_READ_UDATA;
     acb->aio_done_func = sd_finish_aiocb;
 
+retry:
+    if (check_overwrapping_aiocb(s, acb)) {
+        qemu_co_queue_wait(&s->overwrapping_queue);
+        goto retry;
+    }
+
     ret = sd_co_rw_vector(acb);
     if (ret <= 0) {
+        QLIST_REMOVE(acb, aiocb_siblings);
+        qemu_co_queue_restart_all(&s->overwrapping_queue);
         qemu_aio_unref(acb);
         return ret;
     }
 
     qemu_coroutine_yield();
 
+    QLIST_REMOVE(acb, aiocb_siblings);
+    qemu_co_queue_restart_all(&s->overwrapping_queue);
     return acb->ret;
 }
 
@@ -2610,14 +2589,25 @@  static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
     acb->aiocb_type = AIOCB_DISCARD_OBJ;
     acb->aio_done_func = sd_finish_aiocb;
 
+retry:
+    if (check_overwrapping_aiocb(s, acb)) {
+        qemu_co_queue_wait(&s->overwrapping_queue);
+        goto retry;
+    }
+
     ret = sd_co_rw_vector(acb);
     if (ret <= 0) {
+        QLIST_REMOVE(acb, aiocb_siblings);
+        qemu_co_queue_restart_all(&s->overwrapping_queue);
         qemu_aio_unref(acb);
         return ret;
     }
 
     qemu_coroutine_yield();
 
+    QLIST_REMOVE(acb, aiocb_siblings);
+    qemu_co_queue_restart_all(&s->overwrapping_queue);
+
     return acb->ret;
 }