diff mbox

[v3] vmdk: Optimize cluster allocation

Message ID 1399528635-30159-1-git-send-email-famz@redhat.com
State New
Headers show

Commit Message

Fam Zheng May 8, 2014, 5:57 a.m. UTC
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.

Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.

This is not efficient, so it's now rewritten as:

  - Save the extent file length when opening.

  - When allocating cluster, use the saved length as cluster offset.

  - Don't truncate image, because we'll anyway write data there: just
    write any data at the EOF position, in descending priority:

    * New user data (cluster allocation happens in a write request).

    * Filling data in the beginning and/or ending of the new cluster, if
      not covered by user data: either backing file content (COW), or
      zero for standalone images.

One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:

    $ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk

    Before:
        real    0m21.796s
        user    0m0.130s
        sys     0m0.483s

    After:
        real    0m2.017s
        user    0m0.047s
        sys     0m0.190s

We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.

Tested that this passes qemu-iotests for all VMDK subformats.

Signed-off-by: Fam Zheng <famz@redhat.com>

---
V3: A new implementation following Kevin's suggestion.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 block/vmdk.c | 184 +++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 121 insertions(+), 63 deletions(-)

Comments

Kevin Wolf May 14, 2014, 2:23 p.m. UTC | #1
Am 08.05.2014 um 07:57 hat Fam Zheng geschrieben:
> This drops the unnecessary bdrv_truncate() from, and also improves,
> cluster allocation code path.
> [...]
> 
> Tested that this passes qemu-iotests for all VMDK subformats.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>

Unfortunately, this is seriously broken and only compatible with itself,
but appears not to work any more with real VMDKs.

> ---
> V3: A new implementation following Kevin's suggestion.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>  block/vmdk.c | 184 +++++++++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 121 insertions(+), 63 deletions(-)
> 
> diff --git a/block/vmdk.c b/block/vmdk.c
> index 06a1f9f..8c34d5e 100644
> --- a/block/vmdk.c
> +++ b/block/vmdk.c
> @@ -106,6 +106,7 @@ typedef struct VmdkExtent {
>      uint32_t l2_cache_counts[L2_CACHE_SIZE];
>  
>      int64_t cluster_sectors;
> +    int64_t next_cluster_offset;
>      char *type;
>  } VmdkExtent;
>  
> @@ -397,6 +398,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
>  {
>      VmdkExtent *extent;
>      BDRVVmdkState *s = bs->opaque;
> +    int64_t ret;
>  
>      if (cluster_sectors > 0x200000) {
>          /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
> @@ -428,6 +430,13 @@ static int vmdk_add_extent(BlockDriverState *bs,
>      extent->l2_size = l2_size;
>      extent->cluster_sectors = flat ? sectors : cluster_sectors;
>  
> +    ret = bdrv_getlength(extent->file);
> +

Why this empty line?

> +    if (ret < 0) {
> +        return ret;
> +    }
> +    extent->next_cluster_offset = ROUND_UP(ret, BDRV_SECTOR_SIZE);
> +
>      if (s->num_extents > 1) {
>          extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
>      } else {
> @@ -954,42 +963,77 @@ static int vmdk_refresh_limits(BlockDriverState *bs)
>      return 0;
>  }
>  
> +/**
> + * get_whole_cluster
> + *
> + * Copy backing file's cluster that covers @sector_num, otherwise write zero,
> + * to the cluster at @cluster_sector_num.
> + *
> + * If @skip_start < @skip_end, the relative range [@skip_start, @skip_end) is
> + * not copied or written, and leave it for call to write user data in the
> + * request.
> + */
>  static int get_whole_cluster(BlockDriverState *bs,
> -                VmdkExtent *extent,
> -                uint64_t cluster_offset,
> -                uint64_t offset,
> -                bool allocate)
> +                             VmdkExtent *extent,
> +                             uint64_t cluster_sector_num,
> +                             uint64_t sector_num,
> +                             uint64_t skip_start, uint64_t skip_end)
>  {
>      int ret = VMDK_OK;
> -    uint8_t *whole_grain = NULL;
> +    int64_t cluster_bytes;
> +    uint8_t *whole_grain;
> +
> +    /* For COW, align request sector_num to cluster start */
> +    sector_num -= sector_num % extent->cluster_sectors;

QEMU_ALIGN_DOWN?

> +    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
> +    whole_grain = qemu_blockalign(bs, cluster_bytes);
> +    memset(whole_grain, 0, cluster_bytes);

This is completely unnecessary for cases with backing files, and
unnecessary for the skipped part in any case.

> +    assert(skip_end <= sector_num + extent->cluster_sectors);
>      /* we will be here if it's first write on non-exist grain(cluster).
>       * try to read from parent image, if exist */
> -    if (bs->backing_hd) {
> -        whole_grain =
> -            qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
> -        if (!vmdk_is_cid_valid(bs)) {
> -            ret = VMDK_ERROR;
> -            goto exit;
> -        }
> +    if (bs->backing_hd && !vmdk_is_cid_valid(bs)) {
> +        ret = VMDK_ERROR;
> +        goto exit;
> +    }
>  
> -        /* floor offset to cluster */
> -        offset -= offset % (extent->cluster_sectors * 512);
> -        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
> -                extent->cluster_sectors);
> +    /* Read backing data before skip range */
> +    if (skip_start > 0) {
> +        if (bs->backing_hd) {
> +            ret = bdrv_read(bs->backing_hd, sector_num,
> +                            whole_grain, skip_start);
> +            if (ret < 0) {
> +                ret = VMDK_ERROR;
> +                goto exit;
> +            }
> +        }
> +        ret = bdrv_write(extent->file, cluster_sector_num, whole_grain,
> +                         skip_start);
>          if (ret < 0) {
>              ret = VMDK_ERROR;
>              goto exit;
>          }
> -
> -        /* Write grain only into the active image */
> -        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
> -                extent->cluster_sectors);
> +    }
> +    /* Read backing data after skip range */
> +    if (skip_end < extent->cluster_sectors) {
> +        if (bs->backing_hd) {
> +            ret = bdrv_read(bs->backing_hd, sector_num + skip_end,
> +                            whole_grain + (skip_end << BDRV_SECTOR_BITS),
> +                            extent->cluster_sectors - skip_end);
> +            if (ret < 0) {
> +                ret = VMDK_ERROR;
> +                goto exit;
> +            }
> +        }
> +        ret = bdrv_write(extent->file, cluster_sector_num + skip_end,
> +                         whole_grain + (skip_end << BDRV_SECTOR_BITS),
> +                         extent->cluster_sectors - skip_end);
>          if (ret < 0) {
>              ret = VMDK_ERROR;
>              goto exit;
>          }
>      }
> +
>  exit:
>      qemu_vfree(whole_grain);
>      return ret;
> @@ -1026,17 +1070,40 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      return VMDK_OK;
>  }
>  
> +/**
> + * get_cluster_offset
> + *
> + * Look up cluster offset in extent file by sector number, and stor in

store

> + * @cluster_offset.
> + *
> + * For flat extent, the start offset as parsed from the description file is
> + * returned.
> + *
> + * For sparse extent, look up in L1, L2 table. If allocate is true, return an
> + * offset for a new cluster and update L2 cache. If there is a backing file,
> + * COW is done before returning; otherwise, zeroes are written to the allocated
> + * cluster. Both COW and zero writting skips the sector range
> + * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
> + * has new data to write there.
> + *
> + * Returns: VMDK_OK if cluster exists and mapped in the image.
> + *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
> + *          VMDK_ERROR if failed.
> + */
>  static int get_cluster_offset(BlockDriverState *bs,
> -                                    VmdkExtent *extent,
> -                                    VmdkMetaData *m_data,
> -                                    uint64_t offset,
> -                                    int allocate,
> -                                    uint64_t *cluster_offset)
> +                              VmdkExtent *extent,
> +                              VmdkMetaData *m_data,
> +                              uint64_t offset,
> +                              bool allocate,
> +                              uint64_t *cluster_offset,
> +                              uint64_t skip_start_sector,
> +                              uint64_t skip_end_sector)
>  {
>      unsigned int l1_index, l2_offset, l2_index;
>      int min_index, i, j;
>      uint32_t min_count, *l2_table;
>      bool zeroed = false;
> +    int64_t ret;
>  
>      if (m_data) {
>          m_data->valid = 0;
> @@ -1109,33 +1176,29 @@ static int get_cluster_offset(BlockDriverState *bs,
>              return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
>          }
>  
> -        /* Avoid the L2 tables update for the images that have snapshots. */
> -        *cluster_offset = bdrv_getlength(extent->file);
> -        if (!extent->compressed) {
> -            bdrv_truncate(
> -                extent->file,
> -                *cluster_offset + (extent->cluster_sectors << 9)
> -            );
> -        }
> +        *cluster_offset = extent->next_cluster_offset;
> +        extent->next_cluster_offset +=
> +            extent->cluster_sectors << BDRV_SECTOR_BITS;
>  
> -        *cluster_offset >>= 9;
> -        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
> +        l2_table[l2_index] = cpu_to_le32(*cluster_offset >> BDRV_SECTOR_BITS);

Something is fishy with the whole VMDK cluster allocation. The L2 table
entry is set here (to a cluster number), and later again to
m_data->offset, which is a byte offset now.

I don't quite understand why the L2 table is updated _twice_ and why we
have both *cluster_offset and m_data->offset, which should always both
be the same value (but aren't in practice with this patch applied)

>          /* First of all we write grain itself, to avoid race condition
>           * that may to corrupt the image.
>           * This problem may occur because of insufficient space on host disk
>           * or inappropriate VM shutdown.
>           */
> -        if (get_whole_cluster(
> -                bs, extent, *cluster_offset, offset, allocate) == -1) {
> -            return VMDK_ERROR;
> +        ret = get_whole_cluster(bs, extent,
> +                                *cluster_offset >> BDRV_SECTOR_BITS,
> +                                offset >> BDRV_SECTOR_BITS,
> +                                skip_start_sector, skip_end_sector);
> +        if (ret) {
> +            return ret;
>          }
>  
>          if (m_data) {
>              m_data->offset = *cluster_offset;
>          }
>      }
> -    *cluster_offset <<= 9;

In the case where the cluster was already allocated, *cluster_offset is
now a sector number. In the case of a newly allocated cluster, it is a
byte offset. Can't be right.

The bug is partly cancelled out because m_data->offset is also byte
allocated and the L2 table that is stored above gets overwritten with
it, so you end up writing byte offsets to the L2 table. This isn't VMDK
any more, obviously, but it explains why qemu-iotests couldn't catch it.
Perhaps we should add some binary sample image even for formats that we
implement r/w.

>      return VMDK_OK;
>  }

Kevin
Fam Zheng May 15, 2014, 5:10 a.m. UTC | #2
On Wed, 05/14 16:23, Kevin Wolf wrote:
> Am 08.05.2014 um 07:57 hat Fam Zheng geschrieben:
> > This drops the unnecessary bdrv_truncate() from, and also improves,
> > cluster allocation code path.
> > [...]
> > 
> > Tested that this passes qemu-iotests for all VMDK subformats.
> > 
> > Signed-off-by: Fam Zheng <famz@redhat.com>
> 
> Unfortunately, this is seriously broken and only compatible with itself,
> but appears not to work any more with real VMDKs.
> 
> > ---
> > V3: A new implementation following Kevin's suggestion.
> > 
> > Signed-off-by: Fam Zheng <famz@redhat.com>
> > ---
> >  block/vmdk.c | 184 +++++++++++++++++++++++++++++++++++++++--------------------
> >  1 file changed, 121 insertions(+), 63 deletions(-)
> > 
> > diff --git a/block/vmdk.c b/block/vmdk.c
> > index 06a1f9f..8c34d5e 100644
> > --- a/block/vmdk.c
> > +++ b/block/vmdk.c
> > @@ -106,6 +106,7 @@ typedef struct VmdkExtent {
> >      uint32_t l2_cache_counts[L2_CACHE_SIZE];
> >  
> >      int64_t cluster_sectors;
> > +    int64_t next_cluster_offset;
> >      char *type;
> >  } VmdkExtent;
> >  
> > @@ -397,6 +398,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
> >  {
> >      VmdkExtent *extent;
> >      BDRVVmdkState *s = bs->opaque;
> > +    int64_t ret;
> >  
> >      if (cluster_sectors > 0x200000) {
> >          /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
> > @@ -428,6 +430,13 @@ static int vmdk_add_extent(BlockDriverState *bs,
> >      extent->l2_size = l2_size;
> >      extent->cluster_sectors = flat ? sectors : cluster_sectors;
> >  
> > +    ret = bdrv_getlength(extent->file);
> > +
> 
> Why this empty line?

Removing.

> 
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +    extent->next_cluster_offset = ROUND_UP(ret, BDRV_SECTOR_SIZE);
> > +
> >      if (s->num_extents > 1) {
> >          extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
> >      } else {
> > @@ -954,42 +963,77 @@ static int vmdk_refresh_limits(BlockDriverState *bs)
> >      return 0;
> >  }
> >  
> > +/**
> > + * get_whole_cluster
> > + *
> > + * Copy backing file's cluster that covers @sector_num, otherwise write zero,
> > + * to the cluster at @cluster_sector_num.
> > + *
> > + * If @skip_start < @skip_end, the relative range [@skip_start, @skip_end) is
> > + * not copied or written, and leave it for call to write user data in the
> > + * request.
> > + */
> >  static int get_whole_cluster(BlockDriverState *bs,
> > -                VmdkExtent *extent,
> > -                uint64_t cluster_offset,
> > -                uint64_t offset,
> > -                bool allocate)
> > +                             VmdkExtent *extent,
> > +                             uint64_t cluster_sector_num,
> > +                             uint64_t sector_num,
> > +                             uint64_t skip_start, uint64_t skip_end)
> >  {
> >      int ret = VMDK_OK;
> > -    uint8_t *whole_grain = NULL;
> > +    int64_t cluster_bytes;
> > +    uint8_t *whole_grain;
> > +
> > +    /* For COW, align request sector_num to cluster start */
> > +    sector_num -= sector_num % extent->cluster_sectors;
> 
> QEMU_ALIGN_DOWN?

Yes.

> 
> > +    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
> > +    whole_grain = qemu_blockalign(bs, cluster_bytes);
> > +    memset(whole_grain, 0, cluster_bytes);
> 
> This is completely unnecessary for cases with backing files, and
> unnecessary for the skipped part in any case.

Yes.

> 
> > +    assert(skip_end <= sector_num + extent->cluster_sectors);
> >      /* we will be here if it's first write on non-exist grain(cluster).
> >       * try to read from parent image, if exist */
> > -    if (bs->backing_hd) {
> > -        whole_grain =
> > -            qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
> > -        if (!vmdk_is_cid_valid(bs)) {
> > -            ret = VMDK_ERROR;
> > -            goto exit;
> > -        }
> > +    if (bs->backing_hd && !vmdk_is_cid_valid(bs)) {
> > +        ret = VMDK_ERROR;
> > +        goto exit;
> > +    }
> >  
> > -        /* floor offset to cluster */
> > -        offset -= offset % (extent->cluster_sectors * 512);
> > -        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
> > -                extent->cluster_sectors);
> > +    /* Read backing data before skip range */
> > +    if (skip_start > 0) {
> > +        if (bs->backing_hd) {
> > +            ret = bdrv_read(bs->backing_hd, sector_num,
> > +                            whole_grain, skip_start);
> > +            if (ret < 0) {
> > +                ret = VMDK_ERROR;
> > +                goto exit;
> > +            }
> > +        }
> > +        ret = bdrv_write(extent->file, cluster_sector_num, whole_grain,
> > +                         skip_start);
> >          if (ret < 0) {
> >              ret = VMDK_ERROR;
> >              goto exit;
> >          }
> > -
> > -        /* Write grain only into the active image */
> > -        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
> > -                extent->cluster_sectors);
> > +    }
> > +    /* Read backing data after skip range */
> > +    if (skip_end < extent->cluster_sectors) {
> > +        if (bs->backing_hd) {
> > +            ret = bdrv_read(bs->backing_hd, sector_num + skip_end,
> > +                            whole_grain + (skip_end << BDRV_SECTOR_BITS),
> > +                            extent->cluster_sectors - skip_end);
> > +            if (ret < 0) {
> > +                ret = VMDK_ERROR;
> > +                goto exit;
> > +            }
> > +        }
> > +        ret = bdrv_write(extent->file, cluster_sector_num + skip_end,
> > +                         whole_grain + (skip_end << BDRV_SECTOR_BITS),
> > +                         extent->cluster_sectors - skip_end);
> >          if (ret < 0) {
> >              ret = VMDK_ERROR;
> >              goto exit;
> >          }
> >      }
> > +
> >  exit:
> >      qemu_vfree(whole_grain);
> >      return ret;
> > @@ -1026,17 +1070,40 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
> >      return VMDK_OK;
> >  }
> >  
> > +/**
> > + * get_cluster_offset
> > + *
> > + * Look up cluster offset in extent file by sector number, and stor in
> 
> store

Thanks.

> 
> > + * @cluster_offset.
> > + *
> > + * For flat extent, the start offset as parsed from the description file is
> > + * returned.
> > + *
> > + * For sparse extent, look up in L1, L2 table. If allocate is true, return an
> > + * offset for a new cluster and update L2 cache. If there is a backing file,
> > + * COW is done before returning; otherwise, zeroes are written to the allocated
> > + * cluster. Both COW and zero writting skips the sector range
> > + * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
> > + * has new data to write there.
> > + *
> > + * Returns: VMDK_OK if cluster exists and mapped in the image.
> > + *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
> > + *          VMDK_ERROR if failed.
> > + */
> >  static int get_cluster_offset(BlockDriverState *bs,
> > -                                    VmdkExtent *extent,
> > -                                    VmdkMetaData *m_data,
> > -                                    uint64_t offset,
> > -                                    int allocate,
> > -                                    uint64_t *cluster_offset)
> > +                              VmdkExtent *extent,
> > +                              VmdkMetaData *m_data,
> > +                              uint64_t offset,
> > +                              bool allocate,
> > +                              uint64_t *cluster_offset,
> > +                              uint64_t skip_start_sector,
> > +                              uint64_t skip_end_sector)
> >  {
> >      unsigned int l1_index, l2_offset, l2_index;
> >      int min_index, i, j;
> >      uint32_t min_count, *l2_table;
> >      bool zeroed = false;
> > +    int64_t ret;
> >  
> >      if (m_data) {
> >          m_data->valid = 0;
> > @@ -1109,33 +1176,29 @@ static int get_cluster_offset(BlockDriverState *bs,
> >              return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
> >          }
> >  
> > -        /* Avoid the L2 tables update for the images that have snapshots. */
> > -        *cluster_offset = bdrv_getlength(extent->file);
> > -        if (!extent->compressed) {
> > -            bdrv_truncate(
> > -                extent->file,
> > -                *cluster_offset + (extent->cluster_sectors << 9)
> > -            );
> > -        }
> > +        *cluster_offset = extent->next_cluster_offset;
> > +        extent->next_cluster_offset +=
> > +            extent->cluster_sectors << BDRV_SECTOR_BITS;
> >  
> > -        *cluster_offset >>= 9;
> > -        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
> > +        l2_table[l2_index] = cpu_to_le32(*cluster_offset >> BDRV_SECTOR_BITS);
> 
> Something is fishy with the whole VMDK cluster allocation. The L2 table
> entry is set here (to a cluster number), and later again to
> m_data->offset, which is a byte offset now.
> 
> I don't quite understand why the L2 table is updated _twice_ and why we
> have both *cluster_offset and m_data->offset, which should always both
> be the same value (but aren't in practice with this patch applied)

I'm dropping this assignment, and m_data->offset. Will use returned
*cluster_offset in vmdk_L2update.

> 
> >          /* First of all we write grain itself, to avoid race condition
> >           * that may to corrupt the image.
> >           * This problem may occur because of insufficient space on host disk
> >           * or inappropriate VM shutdown.
> >           */
> > -        if (get_whole_cluster(
> > -                bs, extent, *cluster_offset, offset, allocate) == -1) {
> > -            return VMDK_ERROR;
> > +        ret = get_whole_cluster(bs, extent,
> > +                                *cluster_offset >> BDRV_SECTOR_BITS,
> > +                                offset >> BDRV_SECTOR_BITS,
> > +                                skip_start_sector, skip_end_sector);
> > +        if (ret) {
> > +            return ret;
> >          }
> >  
> >          if (m_data) {
> >              m_data->offset = *cluster_offset;
> >          }
> >      }
> > -    *cluster_offset <<= 9;
> 
> In the case where the cluster was already allocated, *cluster_offset is
> now a sector number. In the case of a newly allocated cluster, it is a
> byte offset. Can't be right.
> 
> The bug is partly cancelled out because m_data->offset is also byte
> allocated and the L2 table that is stored above gets overwritten with
> it, so you end up writing byte offsets to the L2 table. This isn't VMDK
> any more, obviously, but it explains why qemu-iotests couldn't catch it.
> Perhaps we should add some binary sample image even for formats that we
> implement r/w.

Yes, good catch indeed! It's broken by removing the shifting down of
*cluster_offset above and shifting up here, but I missed the assignment to
m_data->offset just above here, then the metadata we write to image and update
in L2 tables are both wrong.

So you're completely right in general, things are doubled here so it's error
prone when being modified! Will fix, and add a sample image with data in
qemu-iotests.

Thanks for reviewing, Kevin!

Fam
diff mbox

Patch

diff --git a/block/vmdk.c b/block/vmdk.c
index 06a1f9f..8c34d5e 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -106,6 +106,7 @@  typedef struct VmdkExtent {
     uint32_t l2_cache_counts[L2_CACHE_SIZE];
 
     int64_t cluster_sectors;
+    int64_t next_cluster_offset;
     char *type;
 } VmdkExtent;
 
@@ -397,6 +398,7 @@  static int vmdk_add_extent(BlockDriverState *bs,
 {
     VmdkExtent *extent;
     BDRVVmdkState *s = bs->opaque;
+    int64_t ret;
 
     if (cluster_sectors > 0x200000) {
         /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
@@ -428,6 +430,13 @@  static int vmdk_add_extent(BlockDriverState *bs,
     extent->l2_size = l2_size;
     extent->cluster_sectors = flat ? sectors : cluster_sectors;
 
+    ret = bdrv_getlength(extent->file);
+
+    if (ret < 0) {
+        return ret;
+    }
+    extent->next_cluster_offset = ROUND_UP(ret, BDRV_SECTOR_SIZE);
+
     if (s->num_extents > 1) {
         extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
     } else {
@@ -954,42 +963,77 @@  static int vmdk_refresh_limits(BlockDriverState *bs)
     return 0;
 }
 
+/**
+ * get_whole_cluster
+ *
+ * Copy backing file's cluster that covers @sector_num, otherwise write zero,
+ * to the cluster at @cluster_sector_num.
+ *
+ * If @skip_start < @skip_end, the relative range [@skip_start, @skip_end) is
+ * not copied or written, and leave it for call to write user data in the
+ * request.
+ */
 static int get_whole_cluster(BlockDriverState *bs,
-                VmdkExtent *extent,
-                uint64_t cluster_offset,
-                uint64_t offset,
-                bool allocate)
+                             VmdkExtent *extent,
+                             uint64_t cluster_sector_num,
+                             uint64_t sector_num,
+                             uint64_t skip_start, uint64_t skip_end)
 {
     int ret = VMDK_OK;
-    uint8_t *whole_grain = NULL;
+    int64_t cluster_bytes;
+    uint8_t *whole_grain;
+
+    /* For COW, align request sector_num to cluster start */
+    sector_num -= sector_num % extent->cluster_sectors;
+    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
+    whole_grain = qemu_blockalign(bs, cluster_bytes);
+    memset(whole_grain, 0, cluster_bytes);
 
+    assert(skip_end <= sector_num + extent->cluster_sectors);
     /* we will be here if it's first write on non-exist grain(cluster).
      * try to read from parent image, if exist */
-    if (bs->backing_hd) {
-        whole_grain =
-            qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
-        if (!vmdk_is_cid_valid(bs)) {
-            ret = VMDK_ERROR;
-            goto exit;
-        }
+    if (bs->backing_hd && !vmdk_is_cid_valid(bs)) {
+        ret = VMDK_ERROR;
+        goto exit;
+    }
 
-        /* floor offset to cluster */
-        offset -= offset % (extent->cluster_sectors * 512);
-        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
-                extent->cluster_sectors);
+    /* Read backing data before skip range */
+    if (skip_start > 0) {
+        if (bs->backing_hd) {
+            ret = bdrv_read(bs->backing_hd, sector_num,
+                            whole_grain, skip_start);
+            if (ret < 0) {
+                ret = VMDK_ERROR;
+                goto exit;
+            }
+        }
+        ret = bdrv_write(extent->file, cluster_sector_num, whole_grain,
+                         skip_start);
         if (ret < 0) {
             ret = VMDK_ERROR;
             goto exit;
         }
-
-        /* Write grain only into the active image */
-        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
-                extent->cluster_sectors);
+    }
+    /* Read backing data after skip range */
+    if (skip_end < extent->cluster_sectors) {
+        if (bs->backing_hd) {
+            ret = bdrv_read(bs->backing_hd, sector_num + skip_end,
+                            whole_grain + (skip_end << BDRV_SECTOR_BITS),
+                            extent->cluster_sectors - skip_end);
+            if (ret < 0) {
+                ret = VMDK_ERROR;
+                goto exit;
+            }
+        }
+        ret = bdrv_write(extent->file, cluster_sector_num + skip_end,
+                         whole_grain + (skip_end << BDRV_SECTOR_BITS),
+                         extent->cluster_sectors - skip_end);
         if (ret < 0) {
             ret = VMDK_ERROR;
             goto exit;
         }
     }
+
 exit:
     qemu_vfree(whole_grain);
     return ret;
@@ -1026,17 +1070,40 @@  static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     return VMDK_OK;
 }
 
+/**
+ * get_cluster_offset
+ *
+ * Look up cluster offset in extent file by sector number, and stor in
+ * @cluster_offset.
+ *
+ * For flat extent, the start offset as parsed from the description file is
+ * returned.
+ *
+ * For sparse extent, look up in L1, L2 table. If allocate is true, return an
+ * offset for a new cluster and update L2 cache. If there is a backing file,
+ * COW is done before returning; otherwise, zeroes are written to the allocated
+ * cluster. Both COW and zero writting skips the sector range
+ * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
+ * has new data to write there.
+ *
+ * Returns: VMDK_OK if cluster exists and mapped in the image.
+ *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
+ *          VMDK_ERROR if failed.
+ */
 static int get_cluster_offset(BlockDriverState *bs,
-                                    VmdkExtent *extent,
-                                    VmdkMetaData *m_data,
-                                    uint64_t offset,
-                                    int allocate,
-                                    uint64_t *cluster_offset)
+                              VmdkExtent *extent,
+                              VmdkMetaData *m_data,
+                              uint64_t offset,
+                              bool allocate,
+                              uint64_t *cluster_offset,
+                              uint64_t skip_start_sector,
+                              uint64_t skip_end_sector)
 {
     unsigned int l1_index, l2_offset, l2_index;
     int min_index, i, j;
     uint32_t min_count, *l2_table;
     bool zeroed = false;
+    int64_t ret;
 
     if (m_data) {
         m_data->valid = 0;
@@ -1109,33 +1176,29 @@  static int get_cluster_offset(BlockDriverState *bs,
             return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
         }
 
-        /* Avoid the L2 tables update for the images that have snapshots. */
-        *cluster_offset = bdrv_getlength(extent->file);
-        if (!extent->compressed) {
-            bdrv_truncate(
-                extent->file,
-                *cluster_offset + (extent->cluster_sectors << 9)
-            );
-        }
+        *cluster_offset = extent->next_cluster_offset;
+        extent->next_cluster_offset +=
+            extent->cluster_sectors << BDRV_SECTOR_BITS;
 
-        *cluster_offset >>= 9;
-        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
+        l2_table[l2_index] = cpu_to_le32(*cluster_offset >> BDRV_SECTOR_BITS);
 
         /* First of all we write grain itself, to avoid race condition
          * that may to corrupt the image.
          * This problem may occur because of insufficient space on host disk
          * or inappropriate VM shutdown.
          */
-        if (get_whole_cluster(
-                bs, extent, *cluster_offset, offset, allocate) == -1) {
-            return VMDK_ERROR;
+        ret = get_whole_cluster(bs, extent,
+                                *cluster_offset >> BDRV_SECTOR_BITS,
+                                offset >> BDRV_SECTOR_BITS,
+                                skip_start_sector, skip_end_sector);
+        if (ret) {
+            return ret;
         }
 
         if (m_data) {
             m_data->offset = *cluster_offset;
         }
     }
-    *cluster_offset <<= 9;
     return VMDK_OK;
 }
 
@@ -1170,7 +1233,8 @@  static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
     }
     qemu_co_mutex_lock(&s->lock);
     ret = get_cluster_offset(bs, extent, NULL,
-                            sector_num * 512, 0, &offset);
+                             sector_num * 512, false, &offset,
+                             0, 0);
     qemu_co_mutex_unlock(&s->lock);
 
     switch (ret) {
@@ -1323,9 +1387,9 @@  static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
         if (!extent) {
             return -EIO;
         }
-        ret = get_cluster_offset(
-                            bs, extent, NULL,
-                            sector_num << 9, 0, &cluster_offset);
+        ret = get_cluster_offset(bs, extent, NULL,
+                                 sector_num << 9, false, &cluster_offset,
+                                 0, 0);
         extent_begin_sector = extent->end_sector - extent->sectors;
         extent_relative_sector_num = sector_num - extent_begin_sector;
         index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
@@ -1406,12 +1470,17 @@  static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
         if (!extent) {
             return -EIO;
         }
-        ret = get_cluster_offset(
-                                bs,
-                                extent,
-                                &m_data,
-                                sector_num << 9, !extent->compressed,
-                                &cluster_offset);
+        extent_begin_sector = extent->end_sector - extent->sectors;
+        extent_relative_sector_num = sector_num - extent_begin_sector;
+        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
+        n = extent->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors) {
+            n = nb_sectors;
+        }
+        ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+                                 !(extent->compressed || zeroed),
+                                 &cluster_offset,
+                                 index_in_cluster, index_in_cluster + n);
         if (extent->compressed) {
             if (ret == VMDK_OK) {
                 /* Refuse write to allocated cluster for streamOptimized */
@@ -1420,24 +1489,13 @@  static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                 return -EIO;
             } else {
                 /* allocate */
-                ret = get_cluster_offset(
-                                        bs,
-                                        extent,
-                                        &m_data,
-                                        sector_num << 9, 1,
-                                        &cluster_offset);
+                ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
+                                         true, &cluster_offset, 0, 0);
             }
         }
         if (ret == VMDK_ERROR) {
             return -EINVAL;
         }
-        extent_begin_sector = extent->end_sector - extent->sectors;
-        extent_relative_sector_num = sector_num - extent_begin_sector;
-        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
-        n = extent->cluster_sectors - index_in_cluster;
-        if (n > nb_sectors) {
-            n = nb_sectors;
-        }
         if (zeroed) {
             /* Do zeroed write, buf is ignored */
             if (extent->has_zero_grain &&
@@ -2012,7 +2070,7 @@  static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
         }
         ret = get_cluster_offset(bs, extent, NULL,
                                  sector_num << BDRV_SECTOR_BITS,
-                                 0, &cluster_offset);
+                                 false, &cluster_offset, 0, 0);
         if (ret == VMDK_ERROR) {
             fprintf(stderr,
                     "ERROR: could not get cluster_offset for sector %"