diff mbox

[RFC,V8,03/13] quorum: Add quorum_aio_writev and its dependencies.

Message ID 1359392845-15905-4-git-send-email-benoit@irqsave.net
State New
Headers show

Commit Message

Benoît Canet Jan. 28, 2013, 5:07 p.m. UTC
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
 block/quorum.c |  111 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

Comments

Kevin Wolf Feb. 8, 2013, 10:38 a.m. UTC | #1
Am 28.01.2013 18:07, schrieb Benoît Canet:
> Signed-off-by: Benoit Canet <benoit@irqsave.net>
> ---
>  block/quorum.c |  111 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 111 insertions(+)
> 
> diff --git a/block/quorum.c b/block/quorum.c
> index d8fffbe..5d8470b 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -52,11 +52,122 @@ struct QuorumAIOCB {
>      int vote_ret;
>  };
>  
> +static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
> +{
> +    QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
> +    bool finished = false;
> +
> +    /* Wait for the request to finish */
> +    acb->finished = &finished;
> +    while (!finished) {
> +        qemu_aio_wait();
> +    }
> +}
> +
> +static AIOCBInfo quorum_aiocb_info = {
> +    .aiocb_size         = sizeof(QuorumAIOCB),
> +    .cancel             = quorum_aio_cancel,
> +};
> +
> +static void quorum_aio_bh(void *opaque)
> +{
> +    QuorumAIOCB *acb = opaque;
> +    BDRVQuorumState *s = acb->bqs;
> +    int ret;
> +
> +    ret = s->threshold <= acb->success_count ? 0 : -EIO;

It would be very much preferable if you stored the actual error code
instead of turning everything into -EIO.

> +
> +    qemu_bh_delete(acb->bh);
> +    acb->common.cb(acb->common.opaque, ret);
> +    if (acb->finished) {
> +        *acb->finished = true;
> +    }
> +    g_free(acb->aios);
> +    qemu_aio_release(acb);
> +}

Move this down so that it's next to the function using the bottom half.

> +
> +static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> +                                   BlockDriverState *bs,
> +                                   QEMUIOVector *qiov,
> +                                   uint64_t sector_num,
> +                                   int nb_sectors,
> +                                   BlockDriverCompletionFunc *cb,
> +                                   void *opaque)
> +{
> +    QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque);
> +    int i;
> +
> +    acb->aios = g_new0(QuorumSingleAIOCB, s->total);
> +
> +    acb->bqs = s;
> +    acb->qiov = qiov;
> +    acb->bh = NULL;
> +    acb->count = 0;
> +    acb->success_count = 0;
> +    acb->sector_num = sector_num;
> +    acb->nb_sectors = nb_sectors;
> +    acb->vote = NULL;
> +    acb->vote_ret = 0;
> +    acb->finished = NULL;
> +
> +    for (i = 0; i < s->total; i++) {
> +        acb->aios[i].buf = NULL;
> +        acb->aios[i].ret = 0;
> +        acb->aios[i].parent = acb;
> +    }

Would you mind reordering the initialisation of the fields so that it
follows the order used in the struct definition?

> +
> +    return acb;
> +}
> +
> +static void quorum_aio_cb(void *opaque, int ret)
> +{
> +    QuorumSingleAIOCB *sacb = opaque;
> +    QuorumAIOCB *acb = sacb->parent;
> +    BDRVQuorumState *s = acb->bqs;
> +
> +    sacb->ret = ret;
> +    acb->count++;
> +    if (ret == 0) {
> +        acb->success_count++;
> +    }
> +    assert(acb->count <= s->total);
> +    assert(acb->success_count <= s->total);
> +    if (acb->count < s->total) {
> +        return;
> +    }
> +
> +    acb->bh = qemu_bh_new(quorum_aio_bh, acb);
> +    qemu_bh_schedule(acb->bh);

What's the reason for using a bottom half here? Worth a comment?

multiwrite_cb() in block.c doesn't use one to achieve something similar.
If a bottom half really is needed here, does that mean multiwrite_cb() is
buggy?

Kevin
Benoît Canet Sept. 26, 2013, 3:25 p.m. UTC | #2
Le Friday 08 Feb 2013 à 11:38:38 (+0100), Kevin Wolf a écrit :
> Am 28.01.2013 18:07, schrieb Benoît Canet:
> > Signed-off-by: Benoit Canet <benoit@irqsave.net>
> > ---
> >  block/quorum.c |  111 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 111 insertions(+)
> > 
> > diff --git a/block/quorum.c b/block/quorum.c
> > index d8fffbe..5d8470b 100644
> > --- a/block/quorum.c
> > +++ b/block/quorum.c
> > @@ -52,11 +52,122 @@ struct QuorumAIOCB {
> >      int vote_ret;
> >  };
> >  
> > +static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
> > +{
> > +    QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
> > +    bool finished = false;
> > +
> > +    /* Wait for the request to finish */
> > +    acb->finished = &finished;
> > +    while (!finished) {
> > +        qemu_aio_wait();
> > +    }
> > +}
> > +
> > +static AIOCBInfo quorum_aiocb_info = {
> > +    .aiocb_size         = sizeof(QuorumAIOCB),
> > +    .cancel             = quorum_aio_cancel,
> > +};
> > +
> > +static void quorum_aio_bh(void *opaque)
> > +{
> > +    QuorumAIOCB *acb = opaque;
> > +    BDRVQuorumState *s = acb->bqs;
> > +    int ret;
> > +
> > +    ret = s->threshold <= acb->success_count ? 0 : -EIO;
> 
> It would be very much preferable if you stored the actual error code
> instead of turning everything into -EIO.
> 
> > +
> > +    qemu_bh_delete(acb->bh);
> > +    acb->common.cb(acb->common.opaque, ret);
> > +    if (acb->finished) {
> > +        *acb->finished = true;
> > +    }
> > +    g_free(acb->aios);
> > +    qemu_aio_release(acb);
> > +}
> 
> Move this down so that it's next to the function using the bottom half.
> 
> > +
> > +static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> > +                                   BlockDriverState *bs,
> > +                                   QEMUIOVector *qiov,
> > +                                   uint64_t sector_num,
> > +                                   int nb_sectors,
> > +                                   BlockDriverCompletionFunc *cb,
> > +                                   void *opaque)
> > +{
> > +    QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque);
> > +    int i;
> > +
> > +    acb->aios = g_new0(QuorumSingleAIOCB, s->total);
> > +
> > +    acb->bqs = s;
> > +    acb->qiov = qiov;
> > +    acb->bh = NULL;
> > +    acb->count = 0;
> > +    acb->success_count = 0;
> > +    acb->sector_num = sector_num;
> > +    acb->nb_sectors = nb_sectors;
> > +    acb->vote = NULL;
> > +    acb->vote_ret = 0;
> > +    acb->finished = NULL;
> > +
> > +    for (i = 0; i < s->total; i++) {
> > +        acb->aios[i].buf = NULL;
> > +        acb->aios[i].ret = 0;
> > +        acb->aios[i].parent = acb;
> > +    }
> 
> Would you mind to reorder the initialisation of the fields according to
> the order that is used in the struct definition?
> 
> > +
> > +    return acb;
> > +}
> > +
> > +static void quorum_aio_cb(void *opaque, int ret)
> > +{
> > +    QuorumSingleAIOCB *sacb = opaque;
> > +    QuorumAIOCB *acb = sacb->parent;
> > +    BDRVQuorumState *s = acb->bqs;
> > +
> > +    sacb->ret = ret;
> > +    acb->count++;
> > +    if (ret == 0) {
> > +        acb->success_count++;
> > +    }
> > +    assert(acb->count <= s->total);
> > +    assert(acb->success_count <= s->total);
> > +    if (acb->count < s->total) {
> > +        return;
> > +    }
> > +
> > +    acb->bh = qemu_bh_new(quorum_aio_bh, acb);
> > +    qemu_bh_schedule(acb->bh);
> 
> What's the reason for using a bottom half here? Worth a comment?
> 
> multiwrite_cb() in block.c doesn't use one to achieve something similar.
> Is it buggy when you need one here?
I think I got the bottom half largely by taking inspiration from reading
Marcello's blkmirror code.

Best regards

Benoît


> 
> Kevin
>
Benoît Canet Sept. 26, 2013, 4:16 p.m. UTC | #3
> > +static void quorum_aio_bh(void *opaque)
> > +{
> > +    QuorumAIOCB *acb = opaque;
> > +    BDRVQuorumState *s = acb->bqs;
> > +    int ret;
> > +
> > +    ret = s->threshold <= acb->success_count ? 0 : -EIO;
> 
> It would be very much preferable if you stored the actual error code
> instead of turning everything into -EIO.

I am turning everything into -EIO because multiple errors can happen at the same
time.

Best regards

Benoît
Benoît Canet Sept. 26, 2013, 4:29 p.m. UTC | #4
Le Friday 08 Feb 2013 à 11:38:38 (+0100), Kevin Wolf a écrit :
> Am 28.01.2013 18:07, schrieb Benoît Canet:
> > Signed-off-by: Benoit Canet <benoit@irqsave.net>
> > ---
> >  block/quorum.c |  111 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 111 insertions(+)
> > 
> > diff --git a/block/quorum.c b/block/quorum.c
> > index d8fffbe..5d8470b 100644
> > --- a/block/quorum.c
> > +++ b/block/quorum.c
> > @@ -52,11 +52,122 @@ struct QuorumAIOCB {
> >      int vote_ret;
> >  };
> >  
> > +static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
> > +{
> > +    QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
> > +    bool finished = false;
> > +
> > +    /* Wait for the request to finish */
> > +    acb->finished = &finished;
> > +    while (!finished) {
> > +        qemu_aio_wait();
> > +    }
> > +}
> > +
> > +static AIOCBInfo quorum_aiocb_info = {
> > +    .aiocb_size         = sizeof(QuorumAIOCB),
> > +    .cancel             = quorum_aio_cancel,
> > +};
> > +
> > +static void quorum_aio_bh(void *opaque)
> > +{
> > +    QuorumAIOCB *acb = opaque;
> > +    BDRVQuorumState *s = acb->bqs;
> > +    int ret;
> > +
> > +    ret = s->threshold <= acb->success_count ? 0 : -EIO;
> 
> It would be very much preferable if you stored the actual error code
> instead of turning everything into -EIO.
> 
> > +
> > +    qemu_bh_delete(acb->bh);
> > +    acb->common.cb(acb->common.opaque, ret);
> > +    if (acb->finished) {
> > +        *acb->finished = true;
> > +    }
> > +    g_free(acb->aios);
> > +    qemu_aio_release(acb);
> > +}
> 
> Move this down so that it's next to the function using the bottom half.
> 
> > +
> > +static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> > +                                   BlockDriverState *bs,
> > +                                   QEMUIOVector *qiov,
> > +                                   uint64_t sector_num,
> > +                                   int nb_sectors,
> > +                                   BlockDriverCompletionFunc *cb,
> > +                                   void *opaque)
> > +{
> > +    QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque);
> > +    int i;
> > +
> > +    acb->aios = g_new0(QuorumSingleAIOCB, s->total);
> > +
> > +    acb->bqs = s;
> > +    acb->qiov = qiov;
> > +    acb->bh = NULL;
> > +    acb->count = 0;
> > +    acb->success_count = 0;
> > +    acb->sector_num = sector_num;
> > +    acb->nb_sectors = nb_sectors;
> > +    acb->vote = NULL;
> > +    acb->vote_ret = 0;
> > +    acb->finished = NULL;
> > +
> > +    for (i = 0; i < s->total; i++) {
> > +        acb->aios[i].buf = NULL;
> > +        acb->aios[i].ret = 0;
> > +        acb->aios[i].parent = acb;
> > +    }
> 
> Would you mind to reorder the initialisation of the fields according to
> the order that is used in the struct definition?
> 
> > +
> > +    return acb;
> > +}
> > +
> > +static void quorum_aio_cb(void *opaque, int ret)
> > +{
> > +    QuorumSingleAIOCB *sacb = opaque;
> > +    QuorumAIOCB *acb = sacb->parent;
> > +    BDRVQuorumState *s = acb->bqs;
> > +
> > +    sacb->ret = ret;
> > +    acb->count++;
> > +    if (ret == 0) {
> > +        acb->success_count++;
> > +    }
> > +    assert(acb->count <= s->total);
> > +    assert(acb->success_count <= s->total);
> > +    if (acb->count < s->total) {
> > +        return;
> > +    }
> > +
> > +    acb->bh = qemu_bh_new(quorum_aio_bh, acb);
> > +    qemu_bh_schedule(acb->bh);
> 
> What's the reason for using a bottom half here? Worth a comment?
> 
> multiwrite_cb() in block.c doesn't use one to achieve something similar.
> Is it buggy when you need one here?
> 

I tried the code without bh and it doesn't work.

> Kevin
>
Kevin Wolf Sept. 27, 2013, 9:59 a.m. UTC | #5
Am 26.09.2013 um 18:16 hat Benoît Canet geschrieben:
> > > +static void quorum_aio_bh(void *opaque)
> > > +{
> > > +    QuorumAIOCB *acb = opaque;
> > > +    BDRVQuorumState *s = acb->bqs;
> > > +    int ret;
> > > +
> > > +    ret = s->threshold <= acb->success_count ? 0 : -EIO;
> > 
> > It would be very much preferable if you stored the actual error code
> > instead of turning everything into -EIO.
> 
> I am turning everything into -EIO because multiple errors can happen at the same
> time.

Picking simply the first error code seems better than throwing all
information away. In the common case, I guess, you only have one error
at a time anyway. And if you do have multiple errors, you can still fix
one after another.

Kevin
Kevin Wolf Sept. 27, 2013, 10:03 a.m. UTC | #6
Am 26.09.2013 um 18:29 hat Benoît Canet geschrieben:
> Le Friday 08 Feb 2013 à 11:38:38 (+0100), Kevin Wolf a écrit :
> > Am 28.01.2013 18:07, schrieb Benoît Canet:
> > > Signed-off-by: Benoit Canet <benoit@irqsave.net>
> > > ---
> > >  block/quorum.c |  111 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 111 insertions(+)
> > > 
> > > diff --git a/block/quorum.c b/block/quorum.c
> > > index d8fffbe..5d8470b 100644
> > > --- a/block/quorum.c
> > > +++ b/block/quorum.c
> > > @@ -52,11 +52,122 @@ struct QuorumAIOCB {
> > >      int vote_ret;
> > >  };
> > >  
> > > +static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
> > > +{
> > > +    QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
> > > +    bool finished = false;
> > > +
> > > +    /* Wait for the request to finish */
> > > +    acb->finished = &finished;
> > > +    while (!finished) {
> > > +        qemu_aio_wait();
> > > +    }
> > > +}
> > > +
> > > +static AIOCBInfo quorum_aiocb_info = {
> > > +    .aiocb_size         = sizeof(QuorumAIOCB),
> > > +    .cancel             = quorum_aio_cancel,
> > > +};
> > > +
> > > +static void quorum_aio_bh(void *opaque)
> > > +{
> > > +    QuorumAIOCB *acb = opaque;
> > > +    BDRVQuorumState *s = acb->bqs;
> > > +    int ret;
> > > +
> > > +    ret = s->threshold <= acb->success_count ? 0 : -EIO;
> > 
> > It would be very much preferable if you stored the actual error code
> > instead of turning everything into -EIO.
> > 
> > > +
> > > +    qemu_bh_delete(acb->bh);
> > > +    acb->common.cb(acb->common.opaque, ret);
> > > +    if (acb->finished) {
> > > +        *acb->finished = true;
> > > +    }
> > > +    g_free(acb->aios);
> > > +    qemu_aio_release(acb);
> > > +}
> > 
> > Move this down so that it's next to the function using the bottom half.
> > 
> > > +
> > > +static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> > > +                                   BlockDriverState *bs,
> > > +                                   QEMUIOVector *qiov,
> > > +                                   uint64_t sector_num,
> > > +                                   int nb_sectors,
> > > +                                   BlockDriverCompletionFunc *cb,
> > > +                                   void *opaque)
> > > +{
> > > +    QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque);
> > > +    int i;
> > > +
> > > +    acb->aios = g_new0(QuorumSingleAIOCB, s->total);
> > > +
> > > +    acb->bqs = s;
> > > +    acb->qiov = qiov;
> > > +    acb->bh = NULL;
> > > +    acb->count = 0;
> > > +    acb->success_count = 0;
> > > +    acb->sector_num = sector_num;
> > > +    acb->nb_sectors = nb_sectors;
> > > +    acb->vote = NULL;
> > > +    acb->vote_ret = 0;
> > > +    acb->finished = NULL;
> > > +
> > > +    for (i = 0; i < s->total; i++) {
> > > +        acb->aios[i].buf = NULL;
> > > +        acb->aios[i].ret = 0;
> > > +        acb->aios[i].parent = acb;
> > > +    }
> > 
> > Would you mind to reorder the initialisation of the fields according to
> > the order that is used in the struct definition?
> > 
> > > +
> > > +    return acb;
> > > +}
> > > +
> > > +static void quorum_aio_cb(void *opaque, int ret)
> > > +{
> > > +    QuorumSingleAIOCB *sacb = opaque;
> > > +    QuorumAIOCB *acb = sacb->parent;
> > > +    BDRVQuorumState *s = acb->bqs;
> > > +
> > > +    sacb->ret = ret;
> > > +    acb->count++;
> > > +    if (ret == 0) {
> > > +        acb->success_count++;
> > > +    }
> > > +    assert(acb->count <= s->total);
> > > +    assert(acb->success_count <= s->total);
> > > +    if (acb->count < s->total) {
> > > +        return;
> > > +    }
> > > +
> > > +    acb->bh = qemu_bh_new(quorum_aio_bh, acb);
> > > +    qemu_bh_schedule(acb->bh);
> > 
> > What's the reason for using a bottom half here? Worth a comment?
> > 
> > multiwrite_cb() in block.c doesn't use one to achieve something similar.
> > Is it buggy when you need one here?
> > 
> 
> I tried the code without bh and it doesn't work.

It's been a long time since I wrote that comment, but the remark about
multiwrite_cb() concerns me. Do you know _why_ it doesn't work without
the BH, and whether the same problem affects multiwrite_cb()? I'd prefer
that we understand what we're doing rather than just basing the code on
experiments.

Kevin
Benoît Canet Sept. 30, 2013, 12:54 p.m. UTC | #7
Le Friday 27 Sep 2013 à 12:03:07 (+0200), Kevin Wolf a écrit :
> Am 26.09.2013 um 18:29 hat Benoît Canet geschrieben:
> > Le Friday 08 Feb 2013 à 11:38:38 (+0100), Kevin Wolf a écrit :
> > > Am 28.01.2013 18:07, schrieb Benoît Canet:
> > > > Signed-off-by: Benoit Canet <benoit@irqsave.net>
> > > > ---
> > > >  block/quorum.c |  111 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 111 insertions(+)
> > > > 
> > > > diff --git a/block/quorum.c b/block/quorum.c
> > > > index d8fffbe..5d8470b 100644
> > > > --- a/block/quorum.c
> > > > +++ b/block/quorum.c
> > > > @@ -52,11 +52,122 @@ struct QuorumAIOCB {
> > > >      int vote_ret;
> > > >  };
> > > >  
> > > > +static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
> > > > +{
> > > > +    QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
> > > > +    bool finished = false;
> > > > +
> > > > +    /* Wait for the request to finish */
> > > > +    acb->finished = &finished;
> > > > +    while (!finished) {
> > > > +        qemu_aio_wait();
> > > > +    }
> > > > +}
> > > > +
> > > > +static AIOCBInfo quorum_aiocb_info = {
> > > > +    .aiocb_size         = sizeof(QuorumAIOCB),
> > > > +    .cancel             = quorum_aio_cancel,
> > > > +};
> > > > +
> > > > +static void quorum_aio_bh(void *opaque)
> > > > +{
> > > > +    QuorumAIOCB *acb = opaque;
> > > > +    BDRVQuorumState *s = acb->bqs;
> > > > +    int ret;
> > > > +
> > > > +    ret = s->threshold <= acb->success_count ? 0 : -EIO;
> > > 
> > > It would be very much preferable if you stored the actual error code
> > > instead of turning everything into -EIO.
> > > 
> > > > +
> > > > +    qemu_bh_delete(acb->bh);
> > > > +    acb->common.cb(acb->common.opaque, ret);
> > > > +    if (acb->finished) {
> > > > +        *acb->finished = true;
> > > > +    }
> > > > +    g_free(acb->aios);
> > > > +    qemu_aio_release(acb);
> > > > +}
> > > 
> > > Move this down so that it's next to the function using the bottom half.
> > > 
> > > > +
> > > > +static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> > > > +                                   BlockDriverState *bs,
> > > > +                                   QEMUIOVector *qiov,
> > > > +                                   uint64_t sector_num,
> > > > +                                   int nb_sectors,
> > > > +                                   BlockDriverCompletionFunc *cb,
> > > > +                                   void *opaque)
> > > > +{
> > > > +    QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque);
> > > > +    int i;
> > > > +
> > > > +    acb->aios = g_new0(QuorumSingleAIOCB, s->total);
> > > > +
> > > > +    acb->bqs = s;
> > > > +    acb->qiov = qiov;
> > > > +    acb->bh = NULL;
> > > > +    acb->count = 0;
> > > > +    acb->success_count = 0;
> > > > +    acb->sector_num = sector_num;
> > > > +    acb->nb_sectors = nb_sectors;
> > > > +    acb->vote = NULL;
> > > > +    acb->vote_ret = 0;
> > > > +    acb->finished = NULL;
> > > > +
> > > > +    for (i = 0; i < s->total; i++) {
> > > > +        acb->aios[i].buf = NULL;
> > > > +        acb->aios[i].ret = 0;
> > > > +        acb->aios[i].parent = acb;
> > > > +    }
> > > 
> > > Would you mind to reorder the initialisation of the fields according to
> > > the order that is used in the struct definition?
> > > 
> > > > +
> > > > +    return acb;
> > > > +}
> > > > +
> > > > +static void quorum_aio_cb(void *opaque, int ret)
> > > > +{
> > > > +    QuorumSingleAIOCB *sacb = opaque;
> > > > +    QuorumAIOCB *acb = sacb->parent;
> > > > +    BDRVQuorumState *s = acb->bqs;
> > > > +
> > > > +    sacb->ret = ret;
> > > > +    acb->count++;
> > > > +    if (ret == 0) {
> > > > +        acb->success_count++;
> > > > +    }
> > > > +    assert(acb->count <= s->total);
> > > > +    assert(acb->success_count <= s->total);
> > > > +    if (acb->count < s->total) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    acb->bh = qemu_bh_new(quorum_aio_bh, acb);
> > > > +    qemu_bh_schedule(acb->bh);
> > > 
> > > What's the reason for using a bottom half here? Worth a comment?
> > > 
> > > multiwrite_cb() in block.c doesn't use one to achieve something similar.
> > > Is it buggy when you need one here?
> > > 
> > 
> > I tried the code without bh and it doesn't work.
> 
> > It's long ago that I wrote that comment, but the remark about
> multiwrite_cb() concerns me. Do you know _why_ it doesn't work without
> the BH, and whether the same problem affects multiwrite_cb()? I'd prefer
> if we understood what we're doing over just basing the code on
> experiments.

I tried to do the conversion again. It seems to work fine.

Best regards

Benoît
> 
> Kevin
diff mbox

Patch

diff --git a/block/quorum.c b/block/quorum.c
index d8fffbe..5d8470b 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -52,11 +52,122 @@  struct QuorumAIOCB {
     int vote_ret;
 };
 
+/*
+ * .cancel callback for quorum requests.
+ *
+ * Nothing is aborted here: a completion flag living on this stack frame is
+ * published through acb->finished, and qemu_aio_wait() is called repeatedly
+ * until quorum_aio_bh() sets that flag to true.
+ */
+static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    bool done = false;
+    QuorumAIOCB *quorum_acb = container_of(blockacb, QuorumAIOCB, common);
+
+    /* Hand the flag to the completion path, then wait until it is raised */
+    quorum_acb->finished = &done;
+    do {
+        qemu_aio_wait();
+    } while (!done);
+}
+
+/* AIOCB description handed to qemu_aio_get() when creating a QuorumAIOCB */
+static AIOCBInfo quorum_aiocb_info = {
+    .aiocb_size = sizeof(QuorumAIOCB),
+    .cancel = quorum_aio_cancel,
+};
+
+/*
+ * Bottom half scheduled by quorum_aio_cb() once acb->count has reached
+ * s->total.
+ *
+ * The caller's completion callback receives 0 when at least s->threshold
+ * requests succeeded.  Otherwise it receives the first negative return code
+ * recorded in acb->aios[], so that the real cause of the failure is not
+ * thrown away; -EIO is only the fallback when no negative code was recorded.
+ *
+ * After the callback has run, a waiter in quorum_aio_cancel() is notified
+ * through acb->finished and the request's resources are released.
+ */
+static void quorum_aio_bh(void *opaque)
+{
+    QuorumAIOCB *acb = opaque;
+    BDRVQuorumState *s = acb->bqs;
+    int ret = 0;
+    int i;
+
+    if (acb->success_count < s->threshold) {
+        /* Quorum not reached: report the first error that was seen */
+        ret = -EIO;
+        for (i = 0; i < s->total; i++) {
+            if (acb->aios[i].ret < 0) {
+                ret = acb->aios[i].ret;
+                break;
+            }
+        }
+    }
+
+    qemu_bh_delete(acb->bh);
+    acb->common.cb(acb->common.opaque, ret);
+    if (acb->finished) {
+        *acb->finished = true;
+    }
+    g_free(acb->aios);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Obtain a QuorumAIOCB from qemu_aio_get() and initialise it.
+ *
+ * The request parameters (@s, @qiov, @sector_num, @nb_sectors) are recorded,
+ * all counters and result fields are reset, and an array of s->total
+ * QuorumSingleAIOCB slots is allocated, each pointing back at the new
+ * request.  @bs, @cb and @opaque are passed on to qemu_aio_get().
+ *
+ * Returns the initialised request.
+ */
+static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
+                                   BlockDriverState *bs,
+                                   QEMUIOVector *qiov,
+                                   uint64_t sector_num,
+                                   int nb_sectors,
+                                   BlockDriverCompletionFunc *cb,
+                                   void *opaque)
+{
+    QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque);
+    QuorumSingleAIOCB *slot;
+    int idx;
+
+    /* What is being requested */
+    acb->bqs = s;
+    acb->qiov = qiov;
+    acb->sector_num = sector_num;
+    acb->nb_sectors = nb_sectors;
+
+    /* Nothing has completed yet */
+    acb->count = 0;
+    acb->success_count = 0;
+    acb->vote = NULL;
+    acb->vote_ret = 0;
+    acb->bh = NULL;
+    acb->finished = NULL;
+
+    /* s->total slots, each linked back to this request */
+    acb->aios = g_new0(QuorumSingleAIOCB, s->total);
+    for (idx = 0; idx < s->total; idx++) {
+        slot = &acb->aios[idx];
+        slot->parent = acb;
+        slot->ret = 0;
+        slot->buf = NULL;
+    }
+
+    return acb;
+}
+
+/*
+ * Completion callback attached to each request issued by
+ * quorum_aio_writev().
+ *
+ * @opaque is the QuorumSingleAIOCB slot of the request that completed and
+ * @ret its result.  The result is stored in the slot and the parent's
+ * counters are updated; a @ret of 0 counts as a success.  Once the number of
+ * completions is no longer below s->total, quorum_aio_bh() is scheduled as a
+ * bottom half.
+ */
+static void quorum_aio_cb(void *opaque, int ret)
+{
+    QuorumSingleAIOCB *slot = opaque;
+    QuorumAIOCB *parent = slot->parent;
+    BDRVQuorumState *state = parent->bqs;
+
+    slot->ret = ret;
+
+    parent->count++;
+    if (!ret) {
+        parent->success_count++;
+    }
+    assert(parent->count <= state->total);
+    assert(parent->success_count <= state->total);
+
+    if (parent->count >= state->total) {
+        parent->bh = qemu_bh_new(quorum_aio_bh, parent);
+        qemu_bh_schedule(parent->bh);
+    }
+}
+
+/*
+ * .bdrv_aio_writev implementation.
+ *
+ * Builds a QuorumAIOCB for the request and submits the same write
+ * (@sector_num, @qiov, @nb_sectors) through bdrv_aio_writev() to each of the
+ * s->total entries of s->bs[].  Every submission uses quorum_aio_cb() as its
+ * completion callback with the matching slot of acb->aios[] as opaque value.
+ *
+ * Returns the common part of the newly created request.
+ */
+static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
+                                          int64_t sector_num,
+                                          QEMUIOVector *qiov,
+                                          int nb_sectors,
+                                          BlockDriverCompletionFunc *cb,
+                                          void *opaque)
+{
+    BDRVQuorumState *s = bs->opaque;
+    QuorumAIOCB *acb;
+    int child;
+
+    acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, cb, opaque);
+
+    for (child = 0; child < s->total; child++) {
+        QuorumSingleAIOCB *slot = &acb->aios[child];
+
+        slot->aiocb = bdrv_aio_writev(s->bs[child], sector_num, qiov,
+                                      nb_sectors, quorum_aio_cb, slot);
+    }
+
+    return &acb->common;
+}
+
 static BlockDriver bdrv_quorum = {
     .format_name        = "quorum",
     .protocol_name      = "quorum",
 
     .instance_size      = sizeof(BDRVQuorumState),
+
+    /* Asynchronous vectored writes are handled by quorum_aio_writev() */
+    .bdrv_aio_writev    = quorum_aio_writev,
 };
 
 static void bdrv_quorum_init(void)