Patchwork [01/12] VMDK: Introduced VmdkExtent

login
register
mail settings
Submitter Feiran Zheng
Date June 4, 2011, 12:40 a.m.
Message ID <BANLkTimSy08r+3W_Zf5aiuLe-+Q-45qx=g@mail.gmail.com>
Download mbox | patch
Permalink /patch/98701/
State New
Headers show

Comments

Feiran Zheng - June 4, 2011, 12:40 a.m.
Added VmdkExtent array to hold image extent information, with the
exsiting functionalities fit into this structure.

Signed-off-by: Fam Zheng <famcool@gmail.com>
---
 block/vmdk.c |  301 ++++++++++++++++++++++++++++++++++++++--------------------
 1 files changed, 200 insertions(+), 101 deletions(-)

     }
@@ -363,6 +370,7 @@ static int vmdk_open(BlockDriverState *bs, int flags)
     BDRVVmdkState *s = bs->opaque;
     uint32_t magic;
     int l1_size, i;
+    VmdkExtent *extent = NULL;

     if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic))
         goto fail;
@@ -370,94 +378,126 @@ static int vmdk_open(BlockDriverState *bs, int flags)
     magic = be32_to_cpu(magic);
     if (magic == VMDK3_MAGIC) {
         VMDK3Header header;
-
-        if (bdrv_pread(bs->file, sizeof(magic), &header,
sizeof(header)) != sizeof(header))
+        s->extents = qemu_mallocz(sizeof(VmdkExtent));
+        s->num_extents = 1;
+        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
+            != sizeof(header))
             goto fail;
-        s->cluster_sectors = le32_to_cpu(header.granularity);
-        s->l2_size = 1 << 9;
-        s->l1_size = 1 << 6;
-        bs->total_sectors = le32_to_cpu(header.disk_sectors);
-        s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
-        s->l1_backup_table_offset = 0;
-        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
+        extent = s->extents;
+        extent->flat = false;
+        extent->file = bs->file;
+        extent->cluster_sectors = le32_to_cpu(header.granularity);
+        extent->l2_size = 1 << 9;
+        extent->l1_size = 1 << 6;
+        extent->sectors = le32_to_cpu(header.disk_sectors);
+        extent->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
+        extent->l1_backup_table_offset = 0;
+        extent->l1_entry_sectors = extent->l2_size * extent->cluster_sectors;
     } else if (magic == VMDK4_MAGIC) {
         VMDK4Header header;
-
-        if (bdrv_pread(bs->file, sizeof(magic), &header,
sizeof(header)) != sizeof(header))
+        s->extents = qemu_mallocz(sizeof(VmdkExtent));
+        s->num_extents = 1;
+        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
+            != sizeof(header))
             goto fail;
-        bs->total_sectors = le64_to_cpu(header.capacity);
-        s->cluster_sectors = le64_to_cpu(header.granularity);
-        s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
-        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
-        if (s->l1_entry_sectors <= 0)
+        extent = s->extents;
+        extent->file = bs->file;
+        extent->sectors = le64_to_cpu(header.capacity);
+        extent->cluster_sectors = le64_to_cpu(header.granularity);
+        extent->l2_size = le32_to_cpu(header.num_gtes_per_gte);
+        extent->l1_entry_sectors = extent->l2_size * extent->cluster_sectors;
+        if (extent->l1_entry_sectors <= 0)
             goto fail;
-        s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
-            / s->l1_entry_sectors;
-        s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
-        s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
+        extent->l1_size = (extent->sectors + extent->l1_entry_sectors - 1)
+            / extent->l1_entry_sectors;
+        extent->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
+        extent->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;

         // try to open parent images, if exist
         if (vmdk_parent_open(bs) != 0)
             goto fail;
         // write the CID once after the image creation
-        s->parent_cid = vmdk_read_cid(bs,1);
+        extent->parent_cid = vmdk_read_cid(bs,1);
     } else {
         goto fail;
     }

+    /* sum up the total sectors */
+    bs->total_sectors = 0;
+    for (i = 0; i < s->num_extents; i++)
+    {
+        bs->total_sectors += s->extents[i].sectors;
+    }
+
     /* read the L1 table */
-    l1_size = s->l1_size * sizeof(uint32_t);
-    s->l1_table = qemu_malloc(l1_size);
-    if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
l1_size) != l1_size)
+    l1_size = extent->l1_size * sizeof(uint32_t);
+    extent->l1_table = qemu_malloc(l1_size);
+    if (bdrv_pread(bs->file,
+            extent->l1_table_offset,
+            extent->l1_table,
+            l1_size)
+        != l1_size)
         goto fail;
-    for(i = 0; i < s->l1_size; i++) {
-        le32_to_cpus(&s->l1_table[i]);
+    for(i = 0; i < extent->l1_size; i++) {
+        le32_to_cpus(&extent->l1_table[i]);
     }

-    if (s->l1_backup_table_offset) {
-        s->l1_backup_table = qemu_malloc(l1_size);
-        if (bdrv_pread(bs->file, s->l1_backup_table_offset,
s->l1_backup_table, l1_size) != l1_size)
+    if (extent->l1_backup_table_offset) {
+        extent->l1_backup_table = qemu_malloc(l1_size);
+        if (bdrv_pread(bs->file,
+                extent->l1_backup_table_offset,
+                extent->l1_backup_table,
+                l1_size)
+            != l1_size)
             goto fail;
-        for(i = 0; i < s->l1_size; i++) {
-            le32_to_cpus(&s->l1_backup_table[i]);
+        for(i = 0; i < extent->l1_size; i++) {
+            le32_to_cpus(&extent->l1_backup_table[i]);
         }
     }

-    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
+    extent->l2_cache =
+        qemu_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
     return 0;
  fail:
-    qemu_free(s->l1_backup_table);
-    qemu_free(s->l1_table);
-    qemu_free(s->l2_cache);
+    if(s->extents) {
+        qemu_free(s->extents[0].l1_backup_table);
+        qemu_free(s->extents[0].l1_table);
+        qemu_free(s->extents[0].l2_cache);
+    }
     return -1;
 }

-static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
-                                   uint64_t offset, int allocate);
-
-static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
-                             uint64_t offset, int allocate)
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+                    VmdkExtent *extent,
+                    VmdkMetaData *m_data,
+                    uint64_t offset,
+                    int allocate);
+
+static int get_whole_cluster(BlockDriverState *bs,
+                VmdkExtent *extent,
+                uint64_t cluster_offset,
+                uint64_t offset,
+                bool allocate)
 {
-    BDRVVmdkState *s = bs->opaque;
-    uint8_t  whole_grain[s->cluster_sectors*512];        // 128
sectors * 512 bytes each = grain size 64KB
+    uint8_t  whole_grain[extent->cluster_sectors*512];        // 128
sectors * 512 bytes each = grain size 64KB

     // we will be here if it's first write on non-exist grain(cluster).
     // try to read from parent image, if exist
     if (bs->backing_hd) {
         int ret;

-        if (!vmdk_is_cid_valid(bs))
+        if (!vmdk_is_cid_valid(bs, extent))
             return -1;

         ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
-            s->cluster_sectors);
+                extent->cluster_sectors);
         if (ret < 0) {
             return -1;
         }

         //Write grain only into the active image
-        ret = bdrv_write(bs->file, cluster_offset, whole_grain,
-            s->cluster_sectors);
+        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
+                extent->cluster_sectors);
         if (ret < 0) {
             return -1;
         }
@@ -465,29 +505,36 @@ static int get_whole_cluster(BlockDriverState
*bs, uint64_t cluster_offset,
     return 0;
 }

-static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
+static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
 {
-    BDRVVmdkState *s = bs->opaque;
-
     /* update L2 table */
-    if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset * 512)
+ (m_data->l2_index * sizeof(m_data->offset)),
-                    &(m_data->offset), sizeof(m_data->offset)) < 0)
+    if (bdrv_pwrite_sync(
+            extent->file,
+            ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index *
sizeof(m_data->offset)),
+            &(m_data->offset),
+            sizeof(m_data->offset)
+        ) < 0)
         return -1;
     /* update backup L2 table */
-    if (s->l1_backup_table_offset != 0) {
-        m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
-        if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset *
512) + (m_data->l2_index * sizeof(m_data->offset)),
-                        &(m_data->offset), sizeof(m_data->offset)) < 0)
+    if (extent->l1_backup_table_offset != 0) {
+        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
+        if (bdrv_pwrite_sync(
+                extent->file,
+                ((int64_t)m_data->l2_offset * 512)
+                    + (m_data->l2_index * sizeof(m_data->offset)),
+                &(m_data->offset), sizeof(m_data->offset)
+            ) < 0)
             return -1;
     }

     return 0;
 }

-static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
-                                   uint64_t offset, int allocate)
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+                                    VmdkExtent *extent,
+                                    VmdkMetaData *m_data,
+                                    uint64_t offset, int allocate)
 {
-    BDRVVmdkState *s = bs->opaque;
     unsigned int l1_index, l2_offset, l2_index;
     int min_index, i, j;
     uint32_t min_count, *l2_table, tmp = 0;
@@ -496,21 +543,21 @@ static uint64_t
get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
     if (m_data)
         m_data->valid = 0;

-    l1_index = (offset >> 9) / s->l1_entry_sectors;
-    if (l1_index >= s->l1_size)
+    l1_index = (offset >> 9) / extent->l1_entry_sectors;
+    if (l1_index >= extent->l1_size)
         return 0;
-    l2_offset = s->l1_table[l1_index];
+    l2_offset = extent->l1_table[l1_index];
     if (!l2_offset)
         return 0;
     for(i = 0; i < L2_CACHE_SIZE; i++) {
-        if (l2_offset == s->l2_cache_offsets[i]) {
+        if (l2_offset == extent->l2_cache_offsets[i]) {
             /* increment the hit count */
-            if (++s->l2_cache_counts[i] == 0xffffffff) {
+            if (++extent->l2_cache_counts[i] == 0xffffffff) {
                 for(j = 0; j < L2_CACHE_SIZE; j++) {
-                    s->l2_cache_counts[j] >>= 1;
+                    extent->l2_cache_counts[j] >>= 1;
                 }
             }
-            l2_table = s->l2_cache + (i * s->l2_size);
+            l2_table = extent->l2_cache + (i * extent->l2_size);
             goto found;
         }
     }
@@ -518,20 +565,24 @@ static uint64_t
get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
     min_index = 0;
     min_count = 0xffffffff;
     for(i = 0; i < L2_CACHE_SIZE; i++) {
-        if (s->l2_cache_counts[i] < min_count) {
-            min_count = s->l2_cache_counts[i];
+        if (extent->l2_cache_counts[i] < min_count) {
+            min_count = extent->l2_cache_counts[i];
             min_index = i;
         }
     }
-    l2_table = s->l2_cache + (min_index * s->l2_size);
-    if (bdrv_pread(bs->file, (int64_t)l2_offset * 512, l2_table,
s->l2_size * sizeof(uint32_t)) !=
-
  s->l2_size * sizeof(uint32_t))
+    l2_table = extent->l2_cache + (min_index * extent->l2_size);
+    if (bdrv_pread(
+            extent->file,
+            (int64_t)l2_offset * 512,
+            l2_table,
+            extent->l2_size * sizeof(uint32_t)
+        ) != extent->l2_size * sizeof(uint32_t))
         return 0;

-    s->l2_cache_offsets[min_index] = l2_offset;
-    s->l2_cache_counts[min_index] = 1;
+    extent->l2_cache_offsets[min_index] = l2_offset;
+    extent->l2_cache_counts[min_index] = 1;
  found:
-    l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
+    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
     cluster_offset = le32_to_cpu(l2_table[l2_index]);

     if (!cluster_offset) {
@@ -539,8 +590,11 @@ static uint64_t
get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
             return 0;

         // Avoid the L2 tables update for the images that have snapshots.
-        cluster_offset = bdrv_getlength(bs->file);
-        bdrv_truncate(bs->file, cluster_offset + (s->cluster_sectors << 9));
+        cluster_offset = bdrv_getlength(extent->file);
+        bdrv_truncate(
+            extent->file,
+            cluster_offset + (extent->cluster_sectors << 9)
+        );

         cluster_offset >>= 9;
         tmp = cpu_to_le32(cluster_offset);
@@ -551,7 +605,8 @@ static uint64_t
get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
          * This problem may occur because of insufficient space on host disk
          * or inappropriate VM shutdown.
          */
-        if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
+        if (get_whole_cluster(
+                bs, extent, cluster_offset, offset, allocate) == -1)
             return 0;

         if (m_data) {
@@ -566,39 +621,72 @@ static uint64_t
get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
     return cluster_offset;
 }

+static int find_extent(BDRVVmdkState *s, int64_t sector_num, int start_idx)
+{
+    int ext_idx = start_idx;
+    while (ext_idx < s->num_extents
+            && sector_num >= s->extents[ext_idx].sectors) {
+        sector_num -= s->extents[ext_idx].sectors;
+        ext_idx++;
+    }
+    if (ext_idx == s->num_extents) return -1;
+    return ext_idx;
+}
+
 static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
 {
     BDRVVmdkState *s = bs->opaque;
-    int index_in_cluster, n;
-    uint64_t cluster_offset;

-    cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
-    index_in_cluster = sector_num % s->cluster_sectors;
-    n = s->cluster_sectors - index_in_cluster;
+    int index_in_cluster, n, ret;
+    uint64_t offset;
+    VmdkExtent *extent;
+    int ext_idx;
+
+    ext_idx = find_extent(s, sector_num, 0);
+    if (ext_idx == -1) return 0;
+    extent = &s->extents[ext_idx];
+    if (s->extents[ext_idx].flat) {
+        n = extent->sectors - sector_num;
+        ret = 1;
+    } else {
+        offset = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0);
+        index_in_cluster = sector_num % extent->cluster_sectors;
+        n = extent->cluster_sectors - index_in_cluster;
+        ret = offset ? 1 : 0;
+    }
     if (n > nb_sectors)
         n = nb_sectors;
     *pnum = n;
-    return (cluster_offset != 0);
+    return ret;
 }

 static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
 {
     BDRVVmdkState *s = bs->opaque;
-    int index_in_cluster, n, ret;
+    int ext_idx = 0;
+    int ret;
+    uint64_t n, index_in_cluster;
+    VmdkExtent *extent;
     uint64_t cluster_offset;

     while (nb_sectors > 0) {
-        cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
-        index_in_cluster = sector_num % s->cluster_sectors;
-        n = s->cluster_sectors - index_in_cluster;
+        ext_idx = find_extent(s, sector_num, ext_idx);
+        if (ext_idx == -1) {
+            return -1;
+        }
+        extent = &s->extents[ext_idx];
+        cluster_offset = get_cluster_offset(
+                            bs, extent, NULL, sector_num << 9, 0);
+        index_in_cluster = sector_num % extent->cluster_sectors;
+        n = extent->cluster_sectors - index_in_cluster;
         if (n > nb_sectors)
             n = nb_sectors;
         if (!cluster_offset) {
             // try to read from parent image, if exist
             if (bs->backing_hd) {
-                if (!vmdk_is_cid_valid(bs))
+                if (!vmdk_is_cid_valid(bs, extent))
                     return -1;
                 ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
                 if (ret < 0)
@@ -621,10 +709,12 @@ static int vmdk_write(BlockDriverState *bs,
int64_t sector_num,
                      const uint8_t *buf, int nb_sectors)
 {
     BDRVVmdkState *s = bs->opaque;
-    VmdkMetaData m_data;
-    int index_in_cluster, n;
+    VmdkExtent *extent;
+    int ext_idx = 0;
+    int n, index_in_cluster;
     uint64_t cluster_offset;
     static int cid_update = 0;
+    VmdkMetaData m_data;

     if (sector_num > bs->total_sectors) {
         fprintf(stderr,
@@ -635,19 +725,28 @@ static int vmdk_write(BlockDriverState *bs,
int64_t sector_num,
     }

     while (nb_sectors > 0) {
-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        n = s->cluster_sectors - index_in_cluster;
-        if (n > nb_sectors)
-            n = nb_sectors;
-        cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
+        ext_idx = find_extent(s, sector_num, ext_idx);
+        if (ext_idx == -1) {
+            return -1;
+        }
+        extent = &s->extents[ext_idx];
+        cluster_offset = get_cluster_offset(
+                                bs,
+                                extent,
+                                &m_data,
+                                sector_num << 9, 1);
         if (!cluster_offset)
             return -1;
+        index_in_cluster = sector_num & (extent->cluster_sectors - 1);
+        n = extent->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors)
+            n = nb_sectors;

         if (bdrv_pwrite(bs->file, cluster_offset + index_in_cluster *
512, buf, n * 512) != n * 512)
             return -1;
         if (m_data.valid) {
             /* update L2 tables */
-            if (vmdk_L2update(bs, &m_data) == -1)
+            if (vmdk_L2update(extent, &m_data) == -1)
                 return -1;
         }
         nb_sectors -= n;
@@ -818,8 +917,8 @@ static void vmdk_close(BlockDriverState *bs)
 {
     BDRVVmdkState *s = bs->opaque;

-    qemu_free(s->l1_table);
-    qemu_free(s->l2_cache);
+    qemu_free(s->extents[0].l1_table);
+    qemu_free(s->extents[0].l2_cache);
 }

 static int vmdk_flush(BlockDriverState *bs)
Stefan Hajnoczi - June 9, 2011, 7:58 a.m.
On Sat, Jun 04, 2011 at 08:40:22AM +0800, Fam Zheng wrote:
>   fail:
> -    qemu_free(s->l1_backup_table);
> -    qemu_free(s->l1_table);
> -    qemu_free(s->l2_cache);
> +    if(s->extents) {
> +        qemu_free(s->extents[0].l1_backup_table);
> +        qemu_free(s->extents[0].l1_table);
> +        qemu_free(s->extents[0].l2_cache);
> +    }

for (i = 0; i < s->num_extents; i++) {
    qemu_free(s->extents[i].l1_backup_table);
    qemu_free(s->extents[i].l1_table);
    qemu_free(s->extents[i].l2_cache);
}
qemu_free(s->extents);

> +static int find_extent(BDRVVmdkState *s, int64_t sector_num, int start_idx)
> +{
> +    int ext_idx = start_idx;
> +    while (ext_idx < s->num_extents
> +            && sector_num >= s->extents[ext_idx].sectors) {
> +        sector_num -= s->extents[ext_idx].sectors;
> +        ext_idx++;
> +    }
> +    if (ext_idx == s->num_extents) return -1;
> +    return ext_idx;
> +}

Callers don't really need the index, they just want the extent and an
optimized way of repeated calls to avoid O(N^2) find_extent() times:

static VmdkExtent *find_extent(BDRVVmdkState *s, int64_t sector_num,
                               VmdkExtent *start_extent)
{
    VmdkExtent *extent = start_extent;

    if (!start_extent) {
        extent = &s->extent[0];
    }

    while (extent < &s->extents[s->num_extents]) {
        if (sector_num < extent->end) {
	    return extent;
        }
	extent++;
    }
    return NULL;
}

I added the VmdkExtent.end field so that this function can be called
repeatedly for an increasing series of sector_num values.  It seems like
your code would fail when called with a non-0 index since sector_num -=
s->extents[ext_idx].sectors is incorrect when starting from an arbitrary
extent_idx.

> +
>  static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
>                               int nb_sectors, int *pnum)
>  {
>      BDRVVmdkState *s = bs->opaque;
> -    int index_in_cluster, n;
> -    uint64_t cluster_offset;
> 
> -    cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
> -    index_in_cluster = sector_num % s->cluster_sectors;
> -    n = s->cluster_sectors - index_in_cluster;
> +    int index_in_cluster, n, ret;
> +    uint64_t offset;
> +    VmdkExtent *extent;
> +    int ext_idx;
> +
> +    ext_idx = find_extent(s, sector_num, 0);
> +    if (ext_idx == -1) return 0;
> +    extent = &s->extents[ext_idx];
> +    if (s->extents[ext_idx].flat) {
> +        n = extent->sectors - sector_num;

If I have two flat extents:
Extent A: 0     - 1.5MB
Extent B: 1.5MB - 2MB

And I call vmdk_is_allocated(sector_num=1.5MB), then n = 512KB - 1.5MB
which is negative!  Also n is an int but it should be an int64_t (or
uint64_t) which can hold sector units.

Stefan
Feiran Zheng - June 9, 2011, 10:20 a.m.
On Thu, Jun 9, 2011 at 3:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Sat, Jun 04, 2011 at 08:40:22AM +0800, Fam Zheng wrote:
>>   fail:
>> -    qemu_free(s->l1_backup_table);
>> -    qemu_free(s->l1_table);
>> -    qemu_free(s->l2_cache);
>> +    if(s->extents) {
>> +        qemu_free(s->extents[0].l1_backup_table);
>> +        qemu_free(s->extents[0].l1_table);
>> +        qemu_free(s->extents[0].l2_cache);
>> +    }
>
> for (i = 0; i < s->num_extents; i++) {
>    qemu_free(s->extents[i].l1_backup_table);
>    qemu_free(s->extents[i].l1_table);
>    qemu_free(s->extents[i].l2_cache);
> }
> qemu_free(s->extents);
>

This looks better, although num_extents is 1 at most here.

>> +static int find_extent(BDRVVmdkState *s, int64_t sector_num, int start_idx)
>> +{
>> +    int ext_idx = start_idx;
>> +    while (ext_idx < s->num_extents
>> +            && sector_num >= s->extents[ext_idx].sectors) {
>> +        sector_num -= s->extents[ext_idx].sectors;
>> +        ext_idx++;
>> +    }
>> +    if (ext_idx == s->num_extents) return -1;
>> +    return ext_idx;
>> +}
>
> Callers don't really need the index, they just want the extent and an
> optimized way of repeated calls to avoid O(N^2) find_extent() times:
>
> static VmdkExtent *find_extent(BDRVVmdkState *s, int64_t sector_num,
>                               VmdkExtent *start_extent)
> {
>    VmdkExtent *extent = start_extent;
>
>    if (!start_extent) {
>        extent = &s->extent[0];
>    }
>
>    while (extent < &s->extents[s->num_extents]) {
>        if (sector_num < extent->end) {
>            return extent;
>        }
>        extent++;
>    }
>    return NULL;
> }
>
> I added the VmdkExtent.end field so that this function can be called
> repeatedly for an increasing series of sector_num values.  It seems like
> your code would fail when called with a non-0 index since sector_num -=
> s->extents[ext_idx].sectors is incorrect when starting from an arbitrary
> extent_idx.
>

Parameter sector_num which I meant was relative to start_idx, so
caller  passes a relative sector_num when calling with a non-0 index.

>> +
>>  static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
>>                               int nb_sectors, int *pnum)
>>  {
>>      BDRVVmdkState *s = bs->opaque;
>> -    int index_in_cluster, n;
>> -    uint64_t cluster_offset;
>>
>> -    cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
>> -    index_in_cluster = sector_num % s->cluster_sectors;
>> -    n = s->cluster_sectors - index_in_cluster;
>> +    int index_in_cluster, n, ret;
>> +    uint64_t offset;
>> +    VmdkExtent *extent;
>> +    int ext_idx;
>> +
>> +    ext_idx = find_extent(s, sector_num, 0);
>> +    if (ext_idx == -1) return 0;
>> +    extent = &s->extents[ext_idx];
>> +    if (s->extents[ext_idx].flat) {
>> +        n = extent->sectors - sector_num;
>
> If I have two flat extents:
> Extent A: 0     - 1.5MB
> Extent B: 1.5MB - 2MB
>
> And I call vmdk_is_allocated(sector_num=1.5MB), then n = 512KB - 1.5MB
> which is negative!  Also n is an int but it should be an int64_t (or
> uint64_t) which can hold sector units.

You are right. I did this because I wasn't considering multi extent
situations yet in this patch, and when introducing multi extents
support this problem is fixed.
Stefan Hajnoczi - June 9, 2011, 11:07 a.m.
On Thu, Jun 9, 2011 at 11:20 AM, Fam Zheng <famcool@gmail.com> wrote:
> On Thu, Jun 9, 2011 at 3:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Sat, Jun 04, 2011 at 08:40:22AM +0800, Fam Zheng wrote:
>>>   fail:
>>> -    qemu_free(s->l1_backup_table);
>>> -    qemu_free(s->l1_table);
>>> -    qemu_free(s->l2_cache);
>>> +    if(s->extents) {
>>> +        qemu_free(s->extents[0].l1_backup_table);
>>> +        qemu_free(s->extents[0].l1_table);
>>> +        qemu_free(s->extents[0].l2_cache);
>>> +    }
>>
>> for (i = 0; i < s->num_extents; i++) {
>>    qemu_free(s->extents[i].l1_backup_table);
>>    qemu_free(s->extents[i].l1_table);
>>    qemu_free(s->extents[i].l2_cache);
>> }
>> qemu_free(s->extents);
>>
>
> This looks better, although num_extents is 1 at most here.

The important thing is to free s->extents.

>>> +static int find_extent(BDRVVmdkState *s, int64_t sector_num, int start_idx)
>>> +{
>>> +    int ext_idx = start_idx;
>>> +    while (ext_idx < s->num_extents
>>> +            && sector_num >= s->extents[ext_idx].sectors) {
>>> +        sector_num -= s->extents[ext_idx].sectors;
>>> +        ext_idx++;
>>> +    }
>>> +    if (ext_idx == s->num_extents) return -1;
>>> +    return ext_idx;
>>> +}
>>
>> Callers don't really need the index, they just want the extent and an
>> optimized way of repeated calls to avoid O(N^2) find_extent() times:
>>
>> static VmdkExtent *find_extent(BDRVVmdkState *s, int64_t sector_num,
>>                               VmdkExtent *start_extent)
>> {
>>    VmdkExtent *extent = start_extent;
>>
>>    if (!start_extent) {
>>        extent = &s->extent[0];
>>    }
>>
>>    while (extent < &s->extents[s->num_extents]) {
>>        if (sector_num < extent->end) {
>>            return extent;
>>        }
>>        extent++;
>>    }
>>    return NULL;
>> }
>>
>> I added the VmdkExtent.end field so that this function can be called
>> repeatedly for an increasing series of sector_num values.  It seems like
>> your code would fail when called with a non-0 index since sector_num -=
>> s->extents[ext_idx].sectors is incorrect when starting from an arbitrary
>> extent_idx.
>>
>
> Parameter sector_num which I meant was relative to start_idx, so
> caller  passes a relative sector_num when calling with a non-0 index.

How can they calculate a relative sector_num?  It seems to me like
they need to do this manually by traversing the extents again.  I
think this exposes too much of the find_extents() iteration, it would
be simpler to switch to the VmdkExtent *find_extent(BDRVVmdkState *s,
int64_t sector_num, VmdkExtent *start_extent) function above.

Also, feel free to include doc comments to explain what function
arguments and return value do.

>>> +
>>>  static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
>>>                               int nb_sectors, int *pnum)
>>>  {
>>>      BDRVVmdkState *s = bs->opaque;
>>> -    int index_in_cluster, n;
>>> -    uint64_t cluster_offset;
>>>
>>> -    cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
>>> -    index_in_cluster = sector_num % s->cluster_sectors;
>>> -    n = s->cluster_sectors - index_in_cluster;
>>> +    int index_in_cluster, n, ret;
>>> +    uint64_t offset;
>>> +    VmdkExtent *extent;
>>> +    int ext_idx;
>>> +
>>> +    ext_idx = find_extent(s, sector_num, 0);
>>> +    if (ext_idx == -1) return 0;
>>> +    extent = &s->extents[ext_idx];
>>> +    if (s->extents[ext_idx].flat) {
>>> +        n = extent->sectors - sector_num;
>>
>> If I have two flat extents:
>> Extent A: 0     - 1.5MB
>> Extent B: 1.5MB - 2MB
>>
>> And I call vmdk_is_allocated(sector_num=1.5MB), then n = 512KB - 1.5MB
>> which is negative!  Also n is an int but it should be an int64_t (or
>> uint64_t) which can hold sector units.
>
> You are right. I did this because I wasn't considering multi extent
> situations yet in this patch, and when introducing multi extents
> support this problem is fixed.

Okay, I only looked at the first patch so far.  Please try to
structure patch series so they build on each other and do not
introduce intermediate build failures or bugs.  This is important so
that git-bisect(1) works and to make review straightforward (I can't
keep 12 patches in my head at the same time, I need to review one at a
time :)).

If it is necessary to introduce a temporary bug or weirdness in an
earlier patch, please include a comment or something in the commit
description to explain what is going to happen.

Stefan

Patch

diff --git a/block/vmdk.c b/block/vmdk.c
index 8fc9d67..9442794 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -60,7 +60,10 @@  typedef struct {

 #define L2_CACHE_SIZE 16

-typedef struct BDRVVmdkState {
+typedef struct VmdkExtent {
+    BlockDriverState *file;
+    bool flat;
+    int64_t sectors;
     int64_t l1_table_offset;
     int64_t l1_backup_table_offset;
     uint32_t *l1_table;
@@ -75,6 +78,11 @@  typedef struct BDRVVmdkState {

     unsigned int cluster_sectors;
     uint32_t parent_cid;
+} VmdkExtent;
+
+typedef struct BDRVVmdkState {
+    int num_extents;
+    VmdkExtent *extents;
 } BDRVVmdkState;

 typedef struct VmdkMetaData {
@@ -154,16 +162,15 @@  static int vmdk_write_cid(BlockDriverState *bs,
uint32_t cid)
     return 0;
 }

-static int vmdk_is_cid_valid(BlockDriverState *bs)
+static int vmdk_is_cid_valid(BlockDriverState *bs, VmdkExtent *extent)
 {
 #ifdef CHECK_CID
-    BDRVVmdkState *s = bs->opaque;
     BlockDriverState *p_bs = bs->backing_hd;
     uint32_t cur_pcid;

     if (p_bs) {
         cur_pcid = vmdk_read_cid(p_bs,0);
-        if (s->parent_cid != cur_pcid)
+        if (extent->parent_cid != cur_pcid)
             // CID not valid
             return 0;