diff mbox

[8/8] quorum: add basic device recovery logic

Message ID 1409557394-11853-9-git-send-email-namei.unix@gmail.com
State New
Headers show

Commit Message

Liu Yuan Sept. 1, 2014, 7:43 a.m. UTC
For some configurations, quorum allows VMs to continue running while some child
devices are broken. When the child devices are repaired and come back, we need
to sync the bits dirtied during the downtime to keep the data consistent.

The recovery logic is based on the driver state bitmap and, in this primitive
implementation, syncs the dirty bits in a coroutine using a timeslice window.

Simple diagram for 2 children with threshold=1 and read-pattern=fifo:

+ denotes a device sync iteration
- denotes IO on a single device
= denotes IO on two devices

                                      sync complete, release dirty bitmap
                                         ^
                                         |
  ====-----------------++++----++++----++==========
     |                 |
     |                 v
     |               device repaired and begin to sync
     v
   device broken, create a dirty bitmap

  This sync logic can handle the nested-failure case, in which a device breaks
  again while it is being synced. We simply start a new sync process after the
  device is repaired again, and switch the device from broken to sound only
  when the sync completes.

The read-pattern=quorum mode benefits from this recovery logic without any problem.

Cc: Eric Blake <eblake@redhat.com>
Cc: Benoit Canet <benoit@irqsave.net>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Liu Yuan <namei.unix@gmail.com>
---
 block/quorum.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 trace-events   |   5 ++
 2 files changed, 191 insertions(+), 3 deletions(-)

Comments

Benoît Canet Sept. 1, 2014, 9:37 a.m. UTC | #1
The Monday 01 Sep 2014 à 15:43:14 (+0800), Liu Yuan wrote :
> For some configuration, quorum allow VMs to continue while some child devices
> are broken and when the child devices are repaired and return back, we need to
> sync dirty bits during downtime to keep data consistency.
> 
> The recovery logic is based on the driver state bitmap and will sync the dirty
> bits with a timeslice window in a coroutine in this primitive implementation.
> 
> Simple graph about 2 children with threshold=1 and read-pattern=fifo:
> 
> + denote device sync iteration
> - IO on a single device
> = IO on two devices
> 
>                                       sync complete, release dirty bitmap
>                                          ^
>                                          |
>   ====-----------------++++----++++----++==========
>      |                 |
>      |                 v
>      |               device repaired and begin to sync
>      v
>    device broken, create a dirty bitmap
> 
>   This sync logic can take care of nested broken problem, that devices are
>   broken while in sync. We just start a sync process after the devices are
>   repaired again and switch the devices from broken to sound only when the sync
>   completes.
> 
> For read-pattern=quorum mode, it enjoys the recovery logic without any problem.
> 
> Cc: Eric Blake <eblake@redhat.com>
> Cc: Benoit Canet <benoit@irqsave.net>
> Cc: Kevin Wolf <kwolf@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Liu Yuan <namei.unix@gmail.com>
> ---
>  block/quorum.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  trace-events   |   5 ++
>  2 files changed, 191 insertions(+), 3 deletions(-)
> 
> diff --git a/block/quorum.c b/block/quorum.c
> index 7b07e35..ffd7c2d 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -23,6 +23,7 @@
>  #include "qapi/qmp/qlist.h"
>  #include "qapi/qmp/qstring.h"
>  #include "qapi-event.h"
> +#include "trace.h"
>  
>  #define HASH_LENGTH 32
>  
> @@ -31,6 +32,10 @@
>  #define QUORUM_OPT_REWRITE        "rewrite-corrupted"
>  #define QUORUM_OPT_READ_PATTERN   "read-pattern"
>  
> +#define SLICE_TIME          100000000ULL /* 100 ms */
> +#define CHUNK_SIZE          (1 << 20) /* 1M */
> +#define SECTORS_PER_CHUNK   (CHUNK_SIZE >> BDRV_SECTOR_BITS)
> +
>  /* This union holds a vote hash value */
>  typedef union QuorumVoteValue {
>      char h[HASH_LENGTH];       /* SHA-256 hash */
> @@ -64,6 +69,7 @@ typedef struct QuorumVotes {
>  
>  /* the following structure holds the state of one quorum instance */
>  typedef struct BDRVQuorumState {
> +    BlockDriverState *mybs;/* Quorum block driver base state */
>      BlockDriverState **bs; /* children BlockDriverStates */
>      int num_children;      /* children count */
>      int threshold;         /* if less than threshold children reads gave the
> @@ -82,6 +88,10 @@ typedef struct BDRVQuorumState {
>                              */
>  
>      QuorumReadPattern read_pattern;
> +    BdrvDirtyBitmap *dirty_bitmap;
> +    uint8_t *sync_buf;
> +    HBitmapIter hbi;
> +    int64_t sector_num;
>  } BDRVQuorumState;
>  
>  typedef struct QuorumAIOCB QuorumAIOCB;
> @@ -290,12 +300,11 @@ static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
>      }
>  }
>  
> -static int next_fifo_child(QuorumAIOCB *acb)
> +static int get_good_child(BDRVQuorumState *s, int iter)
>  {
> -    BDRVQuorumState *s = acb->common.bs->opaque;
>      int i;
>  
> -    for (i = acb->child_iter; i < s->num_children; i++) {
> +    for (i = iter; i < s->num_children; i++) {
>          if (!s->bs[i]->broken) {
>              break;
>          }
> @@ -306,6 +315,13 @@ static int next_fifo_child(QuorumAIOCB *acb)
>      return i;
>  }
>  
> +static int next_fifo_child(QuorumAIOCB *acb)
> +{
> +    BDRVQuorumState *s = acb->common.bs->opaque;
> +
> +    return get_good_child(s, acb->child_iter);
> +}
> +
>  static void quorum_aio_cb(void *opaque, int ret)
>  {
>      QuorumChildRequest *sacb = opaque;
> @@ -951,6 +967,171 @@ static int parse_read_pattern(const char *opt)
>      return -EINVAL;
>  }
>  
> +static void sync_prepare(BDRVQuorumState *qs, int64_t *num)
> +{
> +    int64_t nb, total = bdrv_nb_sectors(qs->mybs);
> +
> +    qs->sector_num = hbitmap_iter_next(&qs->hbi);
> +    /* Wrap around if previous bits get dirty while syncing */
> +    if (qs->sector_num < 0) {
> +        bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
> +        qs->sector_num = hbitmap_iter_next(&qs->hbi);
> +        assert(qs->sector_num >= 0);
> +    }
> +
> +    for (nb = 1; nb < SECTORS_PER_CHUNK && qs->sector_num + nb < total;
> +         nb++) {
> +        if (!bdrv_get_dirty(qs->mybs, qs->dirty_bitmap, qs->sector_num + nb)) {
> +            break;
> +        }
> +    }
> +    *num = nb;
> +}
> +
> +static void sync_finish(BDRVQuorumState *qs, int64_t num)
> +{
> +    int64_t i;
> +
> +    for (i = 0; i < num; i++) {
> +        /* We need to advance the iterator manually */
> +        hbitmap_iter_next(&qs->hbi);
> +    }
> +    bdrv_reset_dirty(qs->mybs, qs->sector_num, num);
> +}
> +
> +static int quorum_sync_iteration(BDRVQuorumState *qs, BlockDriverState *target)
> +{
> +    BlockDriverState *source;
> +    QEMUIOVector qiov;
> +    int ret, good;
> +    int64_t nb_sectors;
> +    struct iovec iov;
> +    const char *sname, *tname = bdrv_get_filename(target);
> +
> +    good = get_good_child(qs, 0);
> +    if (good < 0) {
> +        error_report("No good device available.");
> +        return -1;
> +    }
> +    source = qs->bs[good];
> +    sname = bdrv_get_filename(source);
> +    sync_prepare(qs, &nb_sectors);
> +    iov.iov_base = qs->sync_buf;
> +    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
> +    qemu_iovec_init_external(&qiov, &iov, 1);
> +
> +    trace_quorum_sync_iteration(sname, tname, qs->sector_num, nb_sectors);
> +    ret = bdrv_co_readv(source, qs->sector_num, nb_sectors, &qiov);
> +    if (ret < 0) {
> +        error_report("Read source %s failed.", sname);

I didn't read this patch thoroughly, but in quorum, if you need to name a child
BDS, you must use bs->node_name.

bs->node_name was introduced so that quorum could be merged and so that a given
node of the BDS graph can be uniquely identified.

Best regards

Benoît

> +        return ret;
> +    }
> +    ret = bdrv_co_writev(target, qs->sector_num, nb_sectors, &qiov);
> +    if (ret < 0) {
> +        error_report("Write target %s failed.", tname);
> +        return ret;
> +    }
> +    sync_finish(qs, nb_sectors);
> +
> +    return 0;
> +}
> +
> +static int quorum_sync_device(BDRVQuorumState *qs, BlockDriverState *target)
> +{
> +    uint64_t last_pause_ns;
> +
> +    bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
> +    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +    for (;;) {
> +        int64_t cnt;
> +
> +        cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap);
> +        if (cnt == 0) {
> +            break;
> +        }
> +        error_report("count %ld", cnt);
> +        if (quorum_sync_iteration(qs, target) < 0) {
> +            return -1;
> +        }
> +        cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap);
> +        if (cnt == 0) {
> +            break;
> +        }
> +
> +        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns >=
> +            SLICE_TIME) {
> +            co_aio_sleep_ns(bdrv_get_aio_context(target), QEMU_CLOCK_REALTIME,
> +                            SLICE_TIME);
> +            last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static BlockDriverState *file_to_bs(BDRVQuorumState *qs, BlockDriverState *file)
> +{
> +    int i;
> +
> +    for (i = 0; i < qs->num_children; i++) {
> +        BlockDriverState *f = bdrv_get_file(qs->bs[i]);
> +
> +        if (f == file) {
> +            return qs->bs[i];
> +        }
> +    }
> +
> +    error_report("Can't find driver state for %s", bdrv_get_filename(file));
> +    abort();
> +}
> +
> +static void quorum_driver_reconnect(BlockDriverState *file)
> +{
> +    BDRVQuorumState *qs = file->drv_opaque;
> +    BlockDriverState *bs = file_to_bs(qs, file);
> +    const char *name = bdrv_get_filename(bs);
> +
> +    trace_quorum_driver_reconnect(name);
> +    assert(bs->broken == true);
> +    if (quorum_sync_device(qs, bs) < 0) {
> +        error_report("Failed to sync device %s", name);
> +        return;
> +    }
> +
> +    bdrv_release_dirty_bitmap(qs->mybs, qs->dirty_bitmap);
> +    qemu_vfree(qs->sync_buf);
> +    bs->broken = false;
> +}
> +
> +static void quorum_driver_disconnect(BlockDriverState *file)
> +{
> +    BDRVQuorumState *qs = file->drv_opaque;
> +    BlockDriverState *bs = file_to_bs(qs, file);
> +    const char *name = bdrv_get_filename(bs);
> +
> +    trace_quorum_driver_disconnect(name);
> +    /*
> +     * If we are disconnected while being syncing, we expect to reconnect to the
> +     * target again and resume the data sync from the last synced point.
> +     */
> +    if (bs->broken) {
> +        return;
> +    }
> +
> +    bs->broken = true;
> +    qs->dirty_bitmap = bdrv_create_dirty_bitmap(qs->mybs, BDRV_SECTOR_SIZE,
> +                                                NULL);
> +    if (!qs->dirty_bitmap) {
> +        abort();
> +    }
> +    qs->sync_buf = qemu_blockalign(bs, CHUNK_SIZE);
> +}
> +
> +static const BlockDrvOps quorum_block_drv_ops = {
> +    .driver_reconnect = quorum_driver_reconnect,
> +    .driver_disconnect = quorum_driver_disconnect,
> +};
> +
>  static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
>                         Error **errp)
>  {
> @@ -975,6 +1156,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
>          goto exit;
>      }
>  
> +    s->mybs = bs;
>      /* count how many different children are present */
>      s->num_children = qlist_size(list);
>      if (s->num_children < 2) {
> @@ -1061,6 +1243,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
>              goto close_exit;
>          }
>          opened[i] = true;
> +        bdrv_set_drv_ops(bdrv_get_file(s->bs[i]), &quorum_block_drv_ops, s);
>      }
>  
>      g_free(opened);
> diff --git a/trace-events b/trace-events
> index 81bc915..8da0a13 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -572,6 +572,11 @@ qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t o
>  qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>  qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
>  
> +# block/quorum.c
> +quorum_sync_iteration(const char *source, const char *target, int64_t sector, int num) "%s -> %s, sector %"PRId64" nb_sectors %d"
> +quorum_driver_reconnect(const char *target) "%s"
> +quorum_driver_disconnect(const char *target) "%s"
> +
>  # hw/display/g364fb.c
>  g364fb_read(uint64_t addr, uint32_t val) "read addr=0x%"PRIx64": 0x%x"
>  g364fb_write(uint64_t addr, uint32_t new) "write addr=0x%"PRIx64": 0x%x"
> -- 
> 1.9.1
>
Liu Yuan Sept. 1, 2014, 9:45 a.m. UTC | #2
On Mon, Sep 01, 2014 at 11:37:20AM +0200, Benoît Canet wrote:
> The Monday 01 Sep 2014 à 15:43:14 (+0800), Liu Yuan wrote :
> > For some configuration, quorum allow VMs to continue while some child devices
> > are broken and when the child devices are repaired and return back, we need to
> > sync dirty bits during downtime to keep data consistency.
> > 
> > The recovery logic is based on the driver state bitmap and will sync the dirty
> > bits with a timeslice window in a coroutine in this primitive implementation.
> > 
> > Simple graph about 2 children with threshold=1 and read-pattern=fifo:
> > 
> > + denote device sync iteration
> > - IO on a single device
> > = IO on two devices
> > 
> >                                       sync complete, release dirty bitmap
> >                                          ^
> >                                          |
> >   ====-----------------++++----++++----++==========
> >      |                 |
> >      |                 v
> >      |               device repaired and begin to sync
> >      v
> >    device broken, create a dirty bitmap
> > 
> >   This sync logic can take care of nested broken problem, that devices are
> >   broken while in sync. We just start a sync process after the devices are
> >   repaired again and switch the devices from broken to sound only when the sync
> >   completes.
> > 
> > For read-pattern=quorum mode, it enjoys the recovery logic without any problem.
> > 
> > Cc: Eric Blake <eblake@redhat.com>
> > Cc: Benoit Canet <benoit@irqsave.net>
> > Cc: Kevin Wolf <kwolf@redhat.com>
> > Cc: Stefan Hajnoczi <stefanha@redhat.com>
> > Signed-off-by: Liu Yuan <namei.unix@gmail.com>
> > ---
> >  block/quorum.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  trace-events   |   5 ++
> >  2 files changed, 191 insertions(+), 3 deletions(-)
> > 
> > diff --git a/block/quorum.c b/block/quorum.c
> > index 7b07e35..ffd7c2d 100644
> > --- a/block/quorum.c
> > +++ b/block/quorum.c
> > @@ -23,6 +23,7 @@
> >  #include "qapi/qmp/qlist.h"
> >  #include "qapi/qmp/qstring.h"
> >  #include "qapi-event.h"
> > +#include "trace.h"
> >  
> >  #define HASH_LENGTH 32
> >  
> > @@ -31,6 +32,10 @@
> >  #define QUORUM_OPT_REWRITE        "rewrite-corrupted"
> >  #define QUORUM_OPT_READ_PATTERN   "read-pattern"
> >  
> > +#define SLICE_TIME          100000000ULL /* 100 ms */
> > +#define CHUNK_SIZE          (1 << 20) /* 1M */
> > +#define SECTORS_PER_CHUNK   (CHUNK_SIZE >> BDRV_SECTOR_BITS)
> > +
> >  /* This union holds a vote hash value */
> >  typedef union QuorumVoteValue {
> >      char h[HASH_LENGTH];       /* SHA-256 hash */
> > @@ -64,6 +69,7 @@ typedef struct QuorumVotes {
> >  
> >  /* the following structure holds the state of one quorum instance */
> >  typedef struct BDRVQuorumState {
> > +    BlockDriverState *mybs;/* Quorum block driver base state */
> >      BlockDriverState **bs; /* children BlockDriverStates */
> >      int num_children;      /* children count */
> >      int threshold;         /* if less than threshold children reads gave the
> > @@ -82,6 +88,10 @@ typedef struct BDRVQuorumState {
> >                              */
> >  
> >      QuorumReadPattern read_pattern;
> > +    BdrvDirtyBitmap *dirty_bitmap;
> > +    uint8_t *sync_buf;
> > +    HBitmapIter hbi;
> > +    int64_t sector_num;
> >  } BDRVQuorumState;
> >  
> >  typedef struct QuorumAIOCB QuorumAIOCB;
> > @@ -290,12 +300,11 @@ static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
> >      }
> >  }
> >  
> > -static int next_fifo_child(QuorumAIOCB *acb)
> > +static int get_good_child(BDRVQuorumState *s, int iter)
> >  {
> > -    BDRVQuorumState *s = acb->common.bs->opaque;
> >      int i;
> >  
> > -    for (i = acb->child_iter; i < s->num_children; i++) {
> > +    for (i = iter; i < s->num_children; i++) {
> >          if (!s->bs[i]->broken) {
> >              break;
> >          }
> > @@ -306,6 +315,13 @@ static int next_fifo_child(QuorumAIOCB *acb)
> >      return i;
> >  }
> >  
> > +static int next_fifo_child(QuorumAIOCB *acb)
> > +{
> > +    BDRVQuorumState *s = acb->common.bs->opaque;
> > +
> > +    return get_good_child(s, acb->child_iter);
> > +}
> > +
> >  static void quorum_aio_cb(void *opaque, int ret)
> >  {
> >      QuorumChildRequest *sacb = opaque;
> > @@ -951,6 +967,171 @@ static int parse_read_pattern(const char *opt)
> >      return -EINVAL;
> >  }
> >  
> > +static void sync_prepare(BDRVQuorumState *qs, int64_t *num)
> > +{
> > +    int64_t nb, total = bdrv_nb_sectors(qs->mybs);
> > +
> > +    qs->sector_num = hbitmap_iter_next(&qs->hbi);
> > +    /* Wrap around if previous bits get dirty while syncing */
> > +    if (qs->sector_num < 0) {
> > +        bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
> > +        qs->sector_num = hbitmap_iter_next(&qs->hbi);
> > +        assert(qs->sector_num >= 0);
> > +    }
> > +
> > +    for (nb = 1; nb < SECTORS_PER_CHUNK && qs->sector_num + nb < total;
> > +         nb++) {
> > +        if (!bdrv_get_dirty(qs->mybs, qs->dirty_bitmap, qs->sector_num + nb)) {
> > +            break;
> > +        }
> > +    }
> > +    *num = nb;
> > +}
> > +
> > +static void sync_finish(BDRVQuorumState *qs, int64_t num)
> > +{
> > +    int64_t i;
> > +
> > +    for (i = 0; i < num; i++) {
> > +        /* We need to advance the iterator manually */
> > +        hbitmap_iter_next(&qs->hbi);
> > +    }
> > +    bdrv_reset_dirty(qs->mybs, qs->sector_num, num);
> > +}
> > +
> > +static int quorum_sync_iteration(BDRVQuorumState *qs, BlockDriverState *target)
> > +{
> > +    BlockDriverState *source;
> > +    QEMUIOVector qiov;
> > +    int ret, good;
> > +    int64_t nb_sectors;
> > +    struct iovec iov;
> > +    const char *sname, *tname = bdrv_get_filename(target);
> > +
> > +    good = get_good_child(qs, 0);
> > +    if (good < 0) {
> > +        error_report("No good device available.");
> > +        return -1;
> > +    }
> > +    source = qs->bs[good];
> > +    sname = bdrv_get_filename(source);
> > +    sync_prepare(qs, &nb_sectors);
> > +    iov.iov_base = qs->sync_buf;
> > +    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
> > +    qemu_iovec_init_external(&qiov, &iov, 1);
> > +
> > +    trace_quorum_sync_iteration(sname, tname, qs->sector_num, nb_sectors);
> > +    ret = bdrv_co_readv(source, qs->sector_num, nb_sectors, &qiov);
> > +    if (ret < 0) {
> > +        error_report("Read source %s failed.", sname);
> 
> I didn't read this patch thoroughly but in quorum if you need to name a child BDS
> you must use bs->node_name.
> 
> bs->node_name was introduced to be able to merge quorum and uniquely identify a given
> node of the BDS graph.

Ah, I see, thanks for the reminder. Will do in the next version.

Stefan and Kevin, a minor question: is it conventional to add a helper in order
to access a member of bs outside block.c? In this case, would I need to add
bdrv_get_node_name(struct *bs)?

Thanks
Yuan
diff mbox

Patch

diff --git a/block/quorum.c b/block/quorum.c
index 7b07e35..ffd7c2d 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -23,6 +23,7 @@ 
 #include "qapi/qmp/qlist.h"
 #include "qapi/qmp/qstring.h"
 #include "qapi-event.h"
+#include "trace.h"
 
 #define HASH_LENGTH 32
 
@@ -31,6 +32,10 @@ 
 #define QUORUM_OPT_REWRITE        "rewrite-corrupted"
 #define QUORUM_OPT_READ_PATTERN   "read-pattern"
 
+#define SLICE_TIME          100000000ULL /* 100 ms */
+#define CHUNK_SIZE          (1 << 20) /* 1M */
+#define SECTORS_PER_CHUNK   (CHUNK_SIZE >> BDRV_SECTOR_BITS)
+
 /* This union holds a vote hash value */
 typedef union QuorumVoteValue {
     char h[HASH_LENGTH];       /* SHA-256 hash */
@@ -64,6 +69,7 @@  typedef struct QuorumVotes {
 
 /* the following structure holds the state of one quorum instance */
 typedef struct BDRVQuorumState {
+    BlockDriverState *mybs;/* Quorum block driver base state */
     BlockDriverState **bs; /* children BlockDriverStates */
     int num_children;      /* children count */
     int threshold;         /* if less than threshold children reads gave the
@@ -82,6 +88,10 @@  typedef struct BDRVQuorumState {
                             */
 
     QuorumReadPattern read_pattern;
+    BdrvDirtyBitmap *dirty_bitmap;
+    uint8_t *sync_buf;
+    HBitmapIter hbi;
+    int64_t sector_num;
 } BDRVQuorumState;
 
 typedef struct QuorumAIOCB QuorumAIOCB;
@@ -290,12 +300,11 @@  static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
     }
 }
 
-static int next_fifo_child(QuorumAIOCB *acb)
+static int get_good_child(BDRVQuorumState *s, int iter)
 {
-    BDRVQuorumState *s = acb->common.bs->opaque;
     int i;
 
-    for (i = acb->child_iter; i < s->num_children; i++) {
+    for (i = iter; i < s->num_children; i++) {
         if (!s->bs[i]->broken) {
             break;
         }
@@ -306,6 +315,13 @@  static int next_fifo_child(QuorumAIOCB *acb)
     return i;
 }
 
+static int next_fifo_child(QuorumAIOCB *acb)
+{
+    BDRVQuorumState *s = acb->common.bs->opaque;
+
+    return get_good_child(s, acb->child_iter);
+}
+
 static void quorum_aio_cb(void *opaque, int ret)
 {
     QuorumChildRequest *sacb = opaque;
@@ -951,6 +967,171 @@  static int parse_read_pattern(const char *opt)
     return -EINVAL;
 }
 
+static void sync_prepare(BDRVQuorumState *qs, int64_t *num)
+{
+    int64_t nb, total = bdrv_nb_sectors(qs->mybs);
+
+    qs->sector_num = hbitmap_iter_next(&qs->hbi);
+    /* Wrap around if previous bits get dirty while syncing */
+    if (qs->sector_num < 0) {
+        bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
+        qs->sector_num = hbitmap_iter_next(&qs->hbi);
+        assert(qs->sector_num >= 0);
+    }
+
+    for (nb = 1; nb < SECTORS_PER_CHUNK && qs->sector_num + nb < total;
+         nb++) {
+        if (!bdrv_get_dirty(qs->mybs, qs->dirty_bitmap, qs->sector_num + nb)) {
+            break;
+        }
+    }
+    *num = nb;
+}
+
+static void sync_finish(BDRVQuorumState *qs, int64_t num)
+{
+    int64_t i;
+
+    for (i = 0; i < num; i++) {
+        /* We need to advance the iterator manually */
+        hbitmap_iter_next(&qs->hbi);
+    }
+    bdrv_reset_dirty(qs->mybs, qs->sector_num, num);
+}
+
+static int quorum_sync_iteration(BDRVQuorumState *qs, BlockDriverState *target)
+{
+    BlockDriverState *source;
+    QEMUIOVector qiov;
+    int ret, good;
+    int64_t nb_sectors;
+    struct iovec iov;
+    const char *sname, *tname = bdrv_get_filename(target);
+
+    good = get_good_child(qs, 0);
+    if (good < 0) {
+        error_report("No good device available.");
+        return -1;
+    }
+    source = qs->bs[good];
+    sname = bdrv_get_filename(source);
+    sync_prepare(qs, &nb_sectors);
+    iov.iov_base = qs->sync_buf;
+    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    trace_quorum_sync_iteration(sname, tname, qs->sector_num, nb_sectors);
+    ret = bdrv_co_readv(source, qs->sector_num, nb_sectors, &qiov);
+    if (ret < 0) {
+        error_report("Read source %s failed.", sname);
+        return ret;
+    }
+    ret = bdrv_co_writev(target, qs->sector_num, nb_sectors, &qiov);
+    if (ret < 0) {
+        error_report("Write target %s failed.", tname);
+        return ret;
+    }
+    sync_finish(qs, nb_sectors);
+
+    return 0;
+}
+
+static int quorum_sync_device(BDRVQuorumState *qs, BlockDriverState *target)
+{
+    uint64_t last_pause_ns;
+
+    bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
+    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    for (;;) {
+        int64_t cnt;
+
+        cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap);
+        if (cnt == 0) {
+            break;
+        }
+        error_report("count %ld", cnt);
+        if (quorum_sync_iteration(qs, target) < 0) {
+            return -1;
+        }
+        cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap);
+        if (cnt == 0) {
+            break;
+        }
+
+        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns >=
+            SLICE_TIME) {
+            co_aio_sleep_ns(bdrv_get_aio_context(target), QEMU_CLOCK_REALTIME,
+                            SLICE_TIME);
+            last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+        }
+    }
+
+    return 0;
+}
+
+static BlockDriverState *file_to_bs(BDRVQuorumState *qs, BlockDriverState *file)
+{
+    int i;
+
+    for (i = 0; i < qs->num_children; i++) {
+        BlockDriverState *f = bdrv_get_file(qs->bs[i]);
+
+        if (f == file) {
+            return qs->bs[i];
+        }
+    }
+
+    error_report("Can't find driver state for %s", bdrv_get_filename(file));
+    abort();
+}
+
+static void quorum_driver_reconnect(BlockDriverState *file)
+{
+    BDRVQuorumState *qs = file->drv_opaque;
+    BlockDriverState *bs = file_to_bs(qs, file);
+    const char *name = bdrv_get_filename(bs);
+
+    trace_quorum_driver_reconnect(name);
+    assert(bs->broken == true);
+    if (quorum_sync_device(qs, bs) < 0) {
+        error_report("Failed to sync device %s", name);
+        return;
+    }
+
+    bdrv_release_dirty_bitmap(qs->mybs, qs->dirty_bitmap);
+    qemu_vfree(qs->sync_buf);
+    bs->broken = false;
+}
+
+static void quorum_driver_disconnect(BlockDriverState *file)
+{
+    BDRVQuorumState *qs = file->drv_opaque;
+    BlockDriverState *bs = file_to_bs(qs, file);
+    const char *name = bdrv_get_filename(bs);
+
+    trace_quorum_driver_disconnect(name);
+    /*
+     * If we are disconnected while being syncing, we expect to reconnect to the
+     * target again and resume the data sync from the last synced point.
+     */
+    if (bs->broken) {
+        return;
+    }
+
+    bs->broken = true;
+    qs->dirty_bitmap = bdrv_create_dirty_bitmap(qs->mybs, BDRV_SECTOR_SIZE,
+                                                NULL);
+    if (!qs->dirty_bitmap) {
+        abort();
+    }
+    qs->sync_buf = qemu_blockalign(bs, CHUNK_SIZE);
+}
+
+static const BlockDrvOps quorum_block_drv_ops = {
+    .driver_reconnect = quorum_driver_reconnect,
+    .driver_disconnect = quorum_driver_disconnect,
+};
+
 static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
                        Error **errp)
 {
@@ -975,6 +1156,7 @@  static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
         goto exit;
     }
 
+    s->mybs = bs;
     /* count how many different children are present */
     s->num_children = qlist_size(list);
     if (s->num_children < 2) {
@@ -1061,6 +1243,7 @@  static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
             goto close_exit;
         }
         opened[i] = true;
+        bdrv_set_drv_ops(bdrv_get_file(s->bs[i]), &quorum_block_drv_ops, s);
     }
 
     g_free(opened);
diff --git a/trace-events b/trace-events
index 81bc915..8da0a13 100644
--- a/trace-events
+++ b/trace-events
@@ -572,6 +572,11 @@  qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t o
 qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
 qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
 
+# block/quorum.c
+quorum_sync_iteration(const char *source, const char *target, int64_t sector, int num) "%s -> %s, sector %"PRId64" nb_sectors %d"
+quorum_driver_reconnect(const char *target) "%s"
+quorum_driver_disconnect(const char *target) "%s"
+
 # hw/display/g364fb.c
 g364fb_read(uint64_t addr, uint32_t val) "read addr=0x%"PRIx64": 0x%x"
 g364fb_write(uint64_t addr, uint32_t new) "write addr=0x%"PRIx64": 0x%x"