Patchwork [3/3] block migration: do not submit multiple AIOs for same sector

login
register
mail settings
Submitter Marcelo Tosatti
Date Nov. 8, 2010, 7:02 p.m.
Message ID <20101108190918.201486474@redhat.com>
Download mbox | patch
Permalink /patch/70446/
State New
Headers show

Comments

Marcelo Tosatti - Nov. 8, 2010, 7:02 p.m.
Block migration can submit multiple AIO reads for the same sector/chunk, but
completion of such reads can happen out of order:

migration               guest
- get_dirty(N)
- aio_read(N) 
- clear_dirty(N)
                        write(N)
                        set_dirty(N)
- get_dirty(N)
- aio_read(N)

If the first aio_read completes after the second, stale data will be 
migrated to the destination.

Fix by not allowing multiple AIOs inflight for the same sector.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Kevin Wolf - Nov. 9, 2010, 12:42 p.m.
Am 08.11.2010 20:02, schrieb Marcelo Tosatti:
> Block migration can submit multiple AIO reads for the same sector/chunk, but
> completion of such reads can happen out of order:
> 
> migration               guest
> - get_dirty(N)
> - aio_read(N) 
> - clear_dirty(N)
>                         write(N)
>                         set_dirty(N)
> - get_dirty(N)
> - aio_read(N)
> 
> If the first aio_read completes after the second, stale data will be 
> migrated to the destination.
> 
> Fix by not allowing multiple AIOs inflight for the same sector.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> 
> Index: qemu-kvm/block-migration.c
> ===================================================================
> --- qemu-kvm.orig/block-migration.c
> +++ qemu-kvm/block-migration.c
> @@ -49,12 +49,14 @@ typedef struct BlkMigDevState {
>      int64_t total_sectors;
>      int64_t dirty;
>      QSIMPLEQ_ENTRY(BlkMigDevState) entry;
> +    unsigned long *aio_bitmap;
>  } BlkMigDevState;
>  
>  typedef struct BlkMigBlock {
>      uint8_t *buf;
>      BlkMigDevState *bmds;
>      int64_t sector;
> +    int nr_sectors;
>      struct iovec iov;
>      QEMUIOVector qiov;
>      BlockDriverAIOCB *aiocb;
> @@ -140,6 +142,57 @@ static inline long double compute_read_b
>      return  (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time;
>  }
>  
> +static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
> +{
> +    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    if (bmds->aio_bitmap &&

Is bmds->aio_bitmap ever supposed to be NULL?

> +        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {

Hm, the problem probably already exists without this patch, but I was
wondering what happens if the result of bdrv_getlength() changes.

An image that is presented to the guest can't grow at the moment, even
though I think there were some thoughts about live resizing. However, do
we make sure that you can't close the image during block migration, by
using eject, change or hot-unplugging a disk?

> +        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
> +            (1UL << (chunk % (sizeof(unsigned long) * 8))));
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
> +                             int nb_sectors, int set)
> +{
> +    int64_t start, end;
> +    unsigned long val, idx, bit;
> +
> +    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    for (; start <= end; start++) {
> +        idx = start / (sizeof(unsigned long) * 8);
> +        bit = start % (sizeof(unsigned long) * 8);
> +        val = bmds->aio_bitmap[idx];
> +        if (set) {
> +            if (!(val & (1UL << bit))) {
> +                val |= 1UL << bit;
> +            }

Why the if? Just doing the |= unconditionally would be enough.

> +        } else {
> +            if (val & (1UL << bit)) {
> +                val &= ~(1UL << bit);
> +            }

Same here.

> +        }
> +        bmds->aio_bitmap[idx] = val;
> +    }
> +}
> +
> +static void alloc_aio_bitmap(BlkMigDevState *bmds)
> +{
> +    BlockDriverState *bs = bmds->bs;
> +    int64_t bitmap_size;
> +
> +    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
> +            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
> +    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
> +
> +    bmds->aio_bitmap = qemu_mallocz(bitmap_size);
> +}
> +
>  static void blk_mig_read_cb(void *opaque, int ret)
>  {
>      BlkMigBlock *blk = opaque;
> @@ -151,6 +204,7 @@ static void blk_mig_read_cb(void *opaque
>      add_avg_read_time(blk->time);
>  
>      QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
> +    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);
>  
>      block_mig_state.submitted--;
>      block_mig_state.read_done++;
> @@ -194,6 +248,7 @@ static int mig_save_device_bulk(Monitor 
>      blk->buf = qemu_malloc(BLOCK_SIZE);
>      blk->bmds = bmds;
>      blk->sector = cur_sector;
> +    blk->nr_sectors = nr_sectors;
>  
>      blk->iov.iov_base = blk->buf;
>      blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> @@ -248,6 +303,7 @@ static void init_blk_migration_it(void *
>          bmds->total_sectors = sectors;
>          bmds->completed_sectors = 0;
>          bmds->shared_base = block_mig_state.shared_base;
> +        alloc_aio_bitmap(bmds);
>  
>          block_mig_state.total_sector_sum += sectors;
>  
> @@ -329,6 +385,8 @@ static int mig_save_device_dirty(Monitor
>      int nr_sectors;
>  
>      for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
> +        if (bmds_aio_inflight(bmds, sector))
> +            qemu_aio_flush();

Coding style requires braces.

Also, qemu_aio_flush() isn't a very nice solution. It works and fixes a
bug, but it blocks everything until all requests (and the requests
created by their callbacks) are completed. Can't we just skip this dirty
sector and retry next time?

>          if (bdrv_get_dirty(bmds->bs, sector)) {
>  
>              if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
> @@ -340,6 +398,7 @@ static int mig_save_device_dirty(Monitor
>              blk->buf = qemu_malloc(BLOCK_SIZE);
>              blk->bmds = bmds;
>              blk->sector = sector;
> +            blk->nr_sectors = nr_sectors;
>  
>              if (is_async) {
>                  blk->iov.iov_base = blk->buf;
> @@ -354,6 +413,7 @@ static int mig_save_device_dirty(Monitor
>                      goto error;
>                  }
>                  block_mig_state.submitted++;
> +                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);

Just to double-check if I understand right: mig_save_device_bulk doesn't
set the bit, but that's okay because it would never try to submit
overlapping requests anyway. Also copying the dirty regions only starts
when no more bulk requests are in flight.

Correct?

>              } else {
>                  if (bdrv_read(bmds->bs, sector, blk->buf,
>                                nr_sectors) < 0) {
> @@ -474,6 +534,7 @@ static void blk_mig_cleanup(Monitor *mon
>  
>      while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
>          QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
> +        qemu_free(bmds->aio_bitmap);
>          qemu_free(bmds);
>      }
>  
> 
> 

Kevin
Marcelo Tosatti - Nov. 9, 2010, 1:49 p.m.
On Tue, Nov 09, 2010 at 01:42:10PM +0100, Kevin Wolf wrote:
> > +static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
> > +{
> > +    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
> > +
> > +    if (bmds->aio_bitmap &&
> 
> Is bmds->aio_bitmap ever supposed to be NULL?

Nope.

> > +        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
> 
> Hm, the problem probably already exists without this patch, but I was
> wondering what happens if the result of bdrv_getlength() changes.
> 
> An image that is presented to the guest can't grow at the moment, even
> though I think there were some thoughts about live resizing. However, do
> we make sure that you can't close the image during block migration, by
> using eject, change or hot-unplugging a disk?

Don't see how that is the case (yes, it looks broken).

> > +        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
> > +            (1UL << (chunk % (sizeof(unsigned long) * 8))));
> > +    } else {
> > +        return 0;
> > +    }
> > +}
> > +
> > +static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
> > +                             int nb_sectors, int set)
> > +{
> > +    int64_t start, end;
> > +    unsigned long val, idx, bit;
> > +
> > +    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
> > +    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
> > +
> > +    for (; start <= end; start++) {
> > +        idx = start / (sizeof(unsigned long) * 8);
> > +        bit = start % (sizeof(unsigned long) * 8);
> > +        val = bmds->aio_bitmap[idx];
> > +        if (set) {
> > +            if (!(val & (1UL << bit))) {
> > +                val |= 1UL << bit;
> > +            }
> 
> Why the if? Just doing the |= unconditionally would be enough.
> 
> > +        } else {
> > +            if (val & (1UL << bit)) {
> > +                val &= ~(1UL << bit);
> > +            }
> 
> Same here.
> 
> > +        }
> > +        bmds->aio_bitmap[idx] = val;
> > +    }
> > +}
> > +
> > +static void alloc_aio_bitmap(BlkMigDevState *bmds)
> > +{
> > +    BlockDriverState *bs = bmds->bs;
> > +    int64_t bitmap_size;
> > +
> > +    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
> > +            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
> > +    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
> > +
> > +    bmds->aio_bitmap = qemu_mallocz(bitmap_size);
> > +}
> > +
> >  static void blk_mig_read_cb(void *opaque, int ret)
> >  {
> >      BlkMigBlock *blk = opaque;
> > @@ -151,6 +204,7 @@ static void blk_mig_read_cb(void *opaque
> >      add_avg_read_time(blk->time);
> >  
> >      QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
> > +    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);
> >  
> >      block_mig_state.submitted--;
> >      block_mig_state.read_done++;
> > @@ -194,6 +248,7 @@ static int mig_save_device_bulk(Monitor 
> >      blk->buf = qemu_malloc(BLOCK_SIZE);
> >      blk->bmds = bmds;
> >      blk->sector = cur_sector;
> > +    blk->nr_sectors = nr_sectors;
> >  
> >      blk->iov.iov_base = blk->buf;
> >      blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> > @@ -248,6 +303,7 @@ static void init_blk_migration_it(void *
> >          bmds->total_sectors = sectors;
> >          bmds->completed_sectors = 0;
> >          bmds->shared_base = block_mig_state.shared_base;
> > +        alloc_aio_bitmap(bmds);
> >  
> >          block_mig_state.total_sector_sum += sectors;
> >  
> > @@ -329,6 +385,8 @@ static int mig_save_device_dirty(Monitor
> >      int nr_sectors;
> >  
> >      for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
> > +        if (bmds_aio_inflight(bmds, sector))
> > +            qemu_aio_flush();
> 
> Coding style requires braces.
> 
> Also, qemu_aio_flush() isn't a very nice solution. It works and fixes a
> bug, but it blocks everything until all requests (and the requests
> created by their callbacks) are completed. Can't we just skip this dirty
> sector and retry next time?

At some point we have to wait on the AIOs that are submitted (or remove
them from the list, which is much trickier). Note that currently it
does skip the request (and submit reads for all other unsubmitted dirty
blocks), before waiting.

So i don't see the need to optimize, yet.

> > +            blk->nr_sectors = nr_sectors;
> >  
> >              if (is_async) {
> >                  blk->iov.iov_base = blk->buf;
> > @@ -354,6 +413,7 @@ static int mig_save_device_dirty(Monitor
> >                      goto error;
> >                  }
> >                  block_mig_state.submitted++;
> > +                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
> 
> Just to double-check if I understand right: mig_save_device_bulk doesn't
> set the bit, but that's okay because it would never try to submit
> overlapping requests anyway. Also copying the dirty regions only starts
> when no more bulk requests are in flight.
> 
> Correct?

Yes, correct.

Thanks for your comments, i'll update the patch.

Patch

Index: qemu-kvm/block-migration.c
===================================================================
--- qemu-kvm.orig/block-migration.c
+++ qemu-kvm/block-migration.c
@@ -49,12 +49,14 @@  typedef struct BlkMigDevState {
     int64_t total_sectors;
     int64_t dirty;
     QSIMPLEQ_ENTRY(BlkMigDevState) entry;
+    unsigned long *aio_bitmap;
 } BlkMigDevState;
 
 typedef struct BlkMigBlock {
     uint8_t *buf;
     BlkMigDevState *bmds;
     int64_t sector;
+    int nr_sectors;
     struct iovec iov;
     QEMUIOVector qiov;
     BlockDriverAIOCB *aiocb;
@@ -140,6 +142,57 @@  static inline long double compute_read_b
     return  (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time;
 }
 
+static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
+{
+    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    if (bmds->aio_bitmap &&
+        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
+        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
+            (1UL << (chunk % (sizeof(unsigned long) * 8))));
+    } else {
+        return 0;
+    }
+}
+
+static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
+                             int nb_sectors, int set)
+{
+    int64_t start, end;
+    unsigned long val, idx, bit;
+
+    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
+    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    for (; start <= end; start++) {
+        idx = start / (sizeof(unsigned long) * 8);
+        bit = start % (sizeof(unsigned long) * 8);
+        val = bmds->aio_bitmap[idx];
+        if (set) {
+            if (!(val & (1UL << bit))) {
+                val |= 1UL << bit;
+            }
+        } else {
+            if (val & (1UL << bit)) {
+                val &= ~(1UL << bit);
+            }
+        }
+        bmds->aio_bitmap[idx] = val;
+    }
+}
+
+static void alloc_aio_bitmap(BlkMigDevState *bmds)
+{
+    BlockDriverState *bs = bmds->bs;
+    int64_t bitmap_size;
+
+    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
+            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
+    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
+
+    bmds->aio_bitmap = qemu_mallocz(bitmap_size);
+}
+
 static void blk_mig_read_cb(void *opaque, int ret)
 {
     BlkMigBlock *blk = opaque;
@@ -151,6 +204,7 @@  static void blk_mig_read_cb(void *opaque
     add_avg_read_time(blk->time);
 
     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
+    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);
 
     block_mig_state.submitted--;
     block_mig_state.read_done++;
@@ -194,6 +248,7 @@  static int mig_save_device_bulk(Monitor 
     blk->buf = qemu_malloc(BLOCK_SIZE);
     blk->bmds = bmds;
     blk->sector = cur_sector;
+    blk->nr_sectors = nr_sectors;
 
     blk->iov.iov_base = blk->buf;
     blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
@@ -248,6 +303,7 @@  static void init_blk_migration_it(void *
         bmds->total_sectors = sectors;
         bmds->completed_sectors = 0;
         bmds->shared_base = block_mig_state.shared_base;
+        alloc_aio_bitmap(bmds);
 
         block_mig_state.total_sector_sum += sectors;
 
@@ -329,6 +385,8 @@  static int mig_save_device_dirty(Monitor
     int nr_sectors;
 
     for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
+        if (bmds_aio_inflight(bmds, sector))
+            qemu_aio_flush();
         if (bdrv_get_dirty(bmds->bs, sector)) {
 
             if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
@@ -340,6 +398,7 @@  static int mig_save_device_dirty(Monitor
             blk->buf = qemu_malloc(BLOCK_SIZE);
             blk->bmds = bmds;
             blk->sector = sector;
+            blk->nr_sectors = nr_sectors;
 
             if (is_async) {
                 blk->iov.iov_base = blk->buf;
@@ -354,6 +413,7 @@  static int mig_save_device_dirty(Monitor
                     goto error;
                 }
                 block_mig_state.submitted++;
+                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
             } else {
                 if (bdrv_read(bmds->bs, sector, blk->buf,
                               nr_sectors) < 0) {
@@ -474,6 +534,7 @@  static void blk_mig_cleanup(Monitor *mon
 
     while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
+        qemu_free(bmds->aio_bitmap);
         qemu_free(bmds);
     }