Patchwork VMDK: add monolithic flat image support

login
register
mail settings
Submitter Feiran Zheng
Date May 30, 2011, 7:49 a.m.
Message ID <BANLkTikZkEXH_UEH0gHnoL88nQOUtmEiWg@mail.gmail.com>
Download mbox | patch
Permalink /patch/97894/
State New
Headers show

Comments

Feiran Zheng - May 30, 2011, 7:49 a.m.
VMDK multiple file images can not be recognized for now. This patch is
adding monolithic flat support to it, that is the image type with two
files, one text descriptor file and a plain data file. This type of
image can be created in VMWare, with the options "allocate all disk
space now" and "store virtual disk as a single file" checked.

A VmdkExtent structure is introduced to hold the image "extent"
information, which makes further adding multi extents support of VMDK
easy. An image creating option "flat" is added for creating flat
(preallocated) image.

Signed-off-by: Feiran (Fam) Zheng <famcool@gmail.com>
---
 block/vmdk.c |  686 +++++++++++++++++++++++++++++++++++++++++++++-------------
 block_int.h  |    2 +
 qemu-img.c   |   12 +-
 3 files changed, 542 insertions(+), 158 deletions(-)

GetProcAddress(GetModuleHandle("kernel32"), "GetCompressedFileSizeA");
     if (get_compressed) {
@@ -1000,9 +1003,12 @@ static int64_t get_allocated_file_size(const
char *filename)
     return st.st_size;
 }
 #else
-static int64_t get_allocated_file_size(const char *filename)
+static int64_t get_allocated_file_size(
+        BlockDriverState *bs, const char *filename)
 {
     struct stat st;
+    if (bs->disk_size)
+        return bs->disk_size;
     if (stat(filename, &st) < 0)
         return -1;
     return (int64_t)st.st_blocks * 512;
@@ -1067,7 +1073,7 @@ static int img_info(int argc, char **argv)
     bdrv_get_format(bs, fmt_name, sizeof(fmt_name));
     bdrv_get_geometry(bs, &total_sectors);
     get_human_readable_size(size_buf, sizeof(size_buf), total_sectors * 512);
-    allocated_size = get_allocated_file_size(filename);
+    allocated_size = get_allocated_file_size(bs, filename);
     if (allocated_size < 0) {
         snprintf(dsize_buf, sizeof(dsize_buf), "unavailable");
     } else {
Kevin Wolf - May 30, 2011, 11:41 a.m.
Am 30.05.2011 09:49, schrieb Fam Zheng:
> VMDK multiple file images can not be recognized for now. This patch is
> adding monolithic flat support to it, that is the image type with two
> files, one text descriptor file and a plain data file. This type of
> image can be created in VMWare, with the options "allocate all disk
> space now" and "store virtual disk as a single file" checked.
> 
> A VmdkExtent structure is introduced to hold the image "extent"
> information, which makes further adding multi extents support of VMDK
> easy. An image creating option "flat" is added for creating flat
> (preallocated) image.
> 
> Signed-off-by: Feiran (Fam) Zheng <famcool@gmail.com>
> ---
>  block/vmdk.c |  686 +++++++++++++++++++++++++++++++++++++++++++++-------------
>  block_int.h  |    2 +
>  qemu-img.c   |   12 +-
>  3 files changed, 542 insertions(+), 158 deletions(-)
> 
> diff --git a/block/vmdk.c b/block/vmdk.c
> index 8fc9d67..726ad3a 100644
> --- a/block/vmdk.c
> +++ b/block/vmdk.c
> @@ -30,6 +30,8 @@
>  #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
>  #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
> 
> +#define VMDK_FLAT_BACKING 0

Is there any specific reason why it's useful to enable the corresponding
code only conditionally? Generally no code that is #ifdefed out should
be committed.

> +
>  typedef struct {
>      uint32_t version;
>      uint32_t flags;
> @@ -60,7 +62,10 @@ typedef struct {
> 
>  #define L2_CACHE_SIZE 16
> 
> -typedef struct BDRVVmdkState {
> +typedef struct VmdkExtent {
> +    BlockDriverState *file;
> +    int flat;

bool?

And just to make sure I understand the terminology: flat means !sparse,
i.e. doesn't have L1/L2 tables?

> +    int64_t sectors;
>      int64_t l1_table_offset;
>      int64_t l1_backup_table_offset;
>      uint32_t *l1_table;
> @@ -73,8 +78,15 @@ typedef struct BDRVVmdkState {
>      uint32_t l2_cache_offsets[L2_CACHE_SIZE];
>      uint32_t l2_cache_counts[L2_CACHE_SIZE];
> 
> -    unsigned int cluster_sectors;
>      uint32_t parent_cid;
> +    unsigned int cluster_sectors;
> +} VmdkExtent;
> +
> +typedef struct BDRVVmdkState {
> +    int has_descriptor_file;

bool

> +    int extent_size;

Is this the array size of extents? If so, num_extents would be a better
name.

> +    int cid_update;
> +    VmdkExtent *extents;
>  } BDRVVmdkState;
> 
>  typedef struct VmdkMetaData {
> @@ -88,21 +100,30 @@ typedef struct VmdkMetaData {
>  static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
>  {
>      uint32_t magic;
> -
> +    char *cid_p, *ct_p, *extent_p;
> +    char cid_str[] = "CID";
> +    char ct_str[] = "createType";
> +    char extent_str[] = "RW";

Don't use variables for this but just use the literal strings below.

>      if (buf_size < 4)
>          return 0;
>      magic = be32_to_cpu(*(uint32_t *)buf);
>      if (magic == VMDK3_MAGIC ||
> -        magic == VMDK4_MAGIC)
> +        magic == VMDK4_MAGIC) {
>          return 100;
> -    else
> +    } else {
> +        cid_p = strstr((char *)buf, cid_str);
> +        ct_p = strstr((char *)buf, ct_str);
> +        extent_p = strstr((char *)buf, extent_str);
> +        if (cid_p && ct_p && extent_p)
> +            return 100;
> +    }
>          return 0;

The return 0; isn't correctly indented any more.

>  }
> 
>  #define CHECK_CID 1
> 
>  #define SECTOR_SIZE 512
> -#define DESC_SIZE 20*SECTOR_SIZE	// 20 sectors of 512 bytes each
> +#define DESC_SIZE (20 * SECTOR_SIZE)    // 20 sectors of 512 bytes each
>  #define HEADER_SIZE 512   			// first sector of 512 bytes
> 
>  static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
> @@ -111,11 +132,11 @@ static uint32_t vmdk_read_cid(BlockDriverState
> *bs, int parent)
>      uint32_t cid;
>      const char *p_name, *cid_str;
>      size_t cid_str_size;
> +    BDRVVmdkState *s = bs->opaque;
> +    int desc_offset = s->has_descriptor_file ? 0 : 0x200;
> 
> -    /* the descriptor offset = 0x200 */
> -    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
> +    if (bdrv_pread(bs->file, desc_offset, desc, DESC_SIZE) != DESC_SIZE)
>          return 0;
> -
>      if (parent) {
>          cid_str = "parentCID";
>          cid_str_size = sizeof("parentCID");
> @@ -128,19 +149,52 @@ static uint32_t vmdk_read_cid(BlockDriverState
> *bs, int parent)
>          p_name += cid_str_size;
>          sscanf(p_name,"%x",&cid);
>      }
> -
>      return cid;
>  }
> 
> +#ifdef _WIN32
> +static int64_t get_file_size(const char *filename)
> +{
> +    typedef DWORD (WINAPI * get_compressed_t)
> +                    (const char *filename, DWORD *high);
> +    get_compressed_t get_compressed;
> +    struct _stati64 st;
> +
> +    /* WinNT support GetCompressedFileSize to determine allocate size */
> +    get_compressed = (get_compressed_t)
> +        GetProcAddress(GetModuleHandle("kernel32"), "GetCompressedFileSizeA");
> +    if (get_compressed) {
> +        DWORD high, low;
> +        low = get_compressed(filename, &high);
> +        if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR)
> +        return (((int64_t) high) << 32) + low;
> +    }
> +
> +    if (_stati64(filename, &st) < 0)
> +        return -1;
> +    return st.st_size;
> +}
> +#else
> +static int64_t get_file_size(const char *filename)
> +{
> +    struct stat st;
> +    if (stat(filename, &st) < 0)
> +        return -1;
> +    return st.st_size;
> +}
> +#endif

You can't assume that the backend is a file, it might as well be an NBD
connection or whatever. Also there should be no reason for having
platform dependent code in a VMDK driver.

Use bdrv_getlength() instead.

> +
>  static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
>  {
>      char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
>      char *p_name, *tmp_str;
> -
> -    /* the descriptor offset = 0x200 */
> -    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
> +    BDRVVmdkState * s = bs->opaque;
> +    int desc_offset = s->has_descriptor_file ? 0 : 0x200;
> +    int desc_size = s->has_descriptor_file ?
> +        get_file_size(bs->file->filename) : DESC_SIZE;
> +    memset(desc, 0, DESC_SIZE);
> +    if (bdrv_pread(bs->file, desc_offset, desc, desc_size) != desc_size)
>          return -1;
> -
>      tmp_str = strstr(desc,"parentCID");
>      pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
>      if ((p_name = strstr(desc,"CID")) != NULL) {
> @@ -149,21 +203,20 @@ static int vmdk_write_cid(BlockDriverState *bs,
> uint32_t cid)
>          pstrcat(desc, sizeof(desc), tmp_desc);
>      }
> 
> -    if (bdrv_pwrite_sync(bs->file, 0x200, desc, DESC_SIZE) < 0)
> +    if (bdrv_pwrite_sync(bs->file, desc_offset, desc, desc_size) < 0)
>          return -1;
>      return 0;
>  }
> 
> -static int vmdk_is_cid_valid(BlockDriverState *bs)
> +static int vmdk_is_cid_valid(BlockDriverState *bs, VmdkExtent *extent)
>  {
>  #ifdef CHECK_CID
> -    BDRVVmdkState *s = bs->opaque;
>      BlockDriverState *p_bs = bs->backing_hd;
>      uint32_t cur_pcid;
> 
>      if (p_bs) {
> -        cur_pcid = vmdk_read_cid(p_bs,0);
> -        if (s->parent_cid != cur_pcid)
> +        cur_pcid = vmdk_read_cid(p_bs, 0);
> +        if (extent->parent_cid != cur_pcid)
>              // CID not valid
>              return 0;
>      }
> @@ -198,7 +251,9 @@ static int vmdk_snapshot_create(const char
> *filename, const char *backing_file)
>      "#DDB\n"
>      "\n";
> 
> -    snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY |
> O_LARGEFILE, 0644);
> +    snp_fd = open(filename,
> +                O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
> +                0644);

This hunk looks like it's not required for monolithic flat support.

>      if (snp_fd < 0)
>          return -errno;
>      p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
> @@ -277,7 +332,8 @@ static int vmdk_snapshot_create(const char
> *filename, const char *backing_file)
>       * Each GDE span 32M disk, means:
>       * 512 GTE per GT, each GTE points to grain
>       */
> -    gt_size = (int64_t)header.num_gtes_per_gte * header.granularity *
> SECTOR_SIZE;
> +    gt_size =
> +        (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;

Same here.

>      if (!gt_size) {
>          ret = -EINVAL;
>          goto fail;
> @@ -338,9 +394,9 @@ static int vmdk_parent_open(BlockDriverState *bs)
>  {
>      char *p_name;
>      char desc[DESC_SIZE];
> -
> -    /* the descriptor offset = 0x200 */
> -    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
> +    BDRVVmdkState *s = bs->opaque;
> +    int desc_offset = s->has_descriptor_file ? 0 : 0x200;
> +    if (bdrv_pread(bs->file, desc_offset, desc, DESC_SIZE) != DESC_SIZE)
>          return -1;

Please add braces.

> 
>      if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
> @@ -358,106 +414,242 @@ static int vmdk_parent_open(BlockDriverState *bs)
>      return 0;
>  }
> 
> +/* find an option value out of descriptor file */
> +static int vmdk_parse_description(const char *desc, const char *opt_name,
> +        char *buf, int buf_size)
> +{
> +    char *opt_pos = strstr(desc, opt_name);
> +    int r;
> +    if (!opt_pos) return -1;

if (!opt_pos) {
    return -1;
}

> +    opt_pos += strlen(opt_name) + 2;
> +    r = sscanf(opt_pos, "%[^\"]s", buf);
> +    assert(r <= buf_size);
> +    return r <= 0;
> +}
> +
> +static int vmdk_parse_extents(const char *desc, VmdkExtent extents[],
> +        const char *desc_file_path)
> +{
> +    const char *p = desc;
> +    int ret = 0;
> +    int r;
> +    while (*p) {
> +        if (strncmp(p, "RW", strlen("RW")) == 0) {
> +            /* parse extent line:
> +             * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
> +             * or
> +             * RW [size in sectors] SPARSE "file-name.vmdk"
> +             */
> +            char access[11] = "";
> +            int64_t sectors = 0;
> +            char type[11] = "";
> +            char fname[512] = "";
> +            sscanf(p, "%10s %lld %10s \"%[^\"]512s\"",
> +                    access, &sectors, type, fname);
> +            if (!(strlen(access) && sectors && strlen(type)
> +                        && strlen(fname)))
> +                goto cont;
> +            if (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) goto cont;
> +            if (strcmp(access, "RW")) goto cont;
> +
> +            ret++;
> +            if (!extents) goto cont;
> +
> +            /* save to extents array */
> +            if (!strcmp(type, "FLAT")) {
> +                /* FLAT extent */
> +                char extent_path[1024];
> +                path_combine(extent_path, sizeof(extent_path),
> +                        desc_file_path, fname);
> +                VmdkExtent *ext = &extents[ret - 1];
> +                ext->flat = 1;
> +                ext->sectors = sectors;
> +                ext->cluster_sectors = sectors;
> +                ext->file = bdrv_new("");
> +                if (!ext->file) return -1;
> +                r = bdrv_open(ext->file,
> +                    extent_path,
> +                    BDRV_O_RDWR | BDRV_O_NO_BACKING,
> +                    NULL);
> +                if (r) return -1;
> +            } else {
> +                /* SPARSE extent, should not be here */
> +                assert(0 && "not supported");
> +            }
> +
> +        }
> +cont:
> +        /* move to next line */
> +        while (*p && *p != '\n') p++;
> +        p++;
> +
> +    }
> +    return ret;
> +}

Please check the whole function for conformance with CODING_STYLE.

> +
>  static int vmdk_open(BlockDriverState *bs, int flags)
>  {
>      BDRVVmdkState *s = bs->opaque;
>      uint32_t magic;
>      int l1_size, i;
> -
> +    VmdkExtent *extent = NULL;
>      if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic))
>          goto fail;
> 
>      magic = be32_to_cpu(magic);
>      if (magic == VMDK3_MAGIC) {
>          VMDK3Header header;
> -
> -        if (bdrv_pread(bs->file, sizeof(magic), &header,
> sizeof(header)) != sizeof(header))
> +        s->extents = qemu_mallocz(sizeof(VmdkExtent));
> +        s->extent_size = 1;
> +        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
> +            != sizeof(header))
>              goto fail;
> -        s->cluster_sectors = le32_to_cpu(header.granularity);
> -        s->l2_size = 1 << 9;
> -        s->l1_size = 1 << 6;
> -        bs->total_sectors = le32_to_cpu(header.disk_sectors);
> -        s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
> -        s->l1_backup_table_offset = 0;
> -        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
> +        extent = s->extents;
> +        extent->flat = 0;
> +        extent->file = bs->file;
> +        extent->cluster_sectors = le32_to_cpu(header.granularity);
> +        extent->l2_size = 1 << 9;
> +        extent->l1_size = 1 << 6;
> +        extent->sectors = le32_to_cpu(header.disk_sectors);
> +        extent->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
> +        extent->l1_backup_table_offset = 0;
> +        extent->l1_entry_sectors = extent->l2_size * extent->cluster_sectors;
>      } else if (magic == VMDK4_MAGIC) {
>          VMDK4Header header;
> -
> -        if (bdrv_pread(bs->file, sizeof(magic), &header,
> sizeof(header)) != sizeof(header))
> +        s->extents = qemu_mallocz(sizeof(VmdkExtent));
> +        s->extent_size = 1;
> +        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
> +            != sizeof(header))
>              goto fail;
> -        bs->total_sectors = le64_to_cpu(header.capacity);
> -        s->cluster_sectors = le64_to_cpu(header.granularity);
> -        s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
> -        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
> -        if (s->l1_entry_sectors <= 0)
> +        extent = s->extents;
> +        extent->file = bs->file;
> +        extent->sectors = le64_to_cpu(header.capacity);
> +        extent->cluster_sectors = le64_to_cpu(header.granularity);
> +        extent->l2_size = le32_to_cpu(header.num_gtes_per_gte);
> +        extent->l1_entry_sectors = extent->l2_size * extent->cluster_sectors;
> +        if (extent->l1_entry_sectors <= 0)
>              goto fail;
> -        s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
> -            / s->l1_entry_sectors;
> -        s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
> -        s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
> +        extent->l1_size = (extent->sectors + extent->l1_entry_sectors - 1)
> +            / extent->l1_entry_sectors;
> +        extent->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
> +        extent->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
> 
>          // try to open parent images, if exist
>          if (vmdk_parent_open(bs) != 0)
>              goto fail;
>          // write the CID once after the image creation
> -        s->parent_cid = vmdk_read_cid(bs,1);
> +        extent->parent_cid = vmdk_read_cid(bs,1);
>      } else {
> +        char buf[2048];
> +        char ct[128];
> +        if (bdrv_pread(bs->file, 0, buf, sizeof(buf)) == 0)
> +            goto fail;
> +        if (0 != vmdk_parse_description(buf, "createType", ct, sizeof(ct)))
> +            goto fail;
> +        if (0 != strcmp(ct, "monolithicFlat"))
> +            goto fail;
> +        bs->disk_size += get_file_size(bs->file->filename);
> +        s->has_descriptor_file = 1;
> +        s->extent_size = vmdk_parse_extents(buf, NULL, NULL);
> +        if  (!s->extent_size)
> +            goto fail;
> +        s->extents = qemu_mallocz(s->extent_size * sizeof(VmdkExtent));
> +        extent = s->extents;
> +        vmdk_parse_extents(buf, s->extents, bs->file->filename);
> +        bs->total_sectors = 0;
> +
> +        // try to open parent images, if exist
> +        if (vmdk_parent_open(bs) != 0)
>          goto fail;
> +        // write the CID once after the image creation
> +        extent->parent_cid = vmdk_read_cid(bs,1);
> +
> +        for (i = 0; i < s->extent_size; i++)
> +        {
> +            bs->disk_size += get_file_size(s->extents[i].file->filename);
> +            bs->total_sectors += s->extents[i].sectors;
> +        }
> +        return 0;
> +    }

Would it make sense to split out opening the different image subformats
in separate functions?

> +
> +    bs->total_sectors = 0;
> +    for (i = 0; i < s->extent_size; i++)
> +    {
> +        bs->disk_size += get_file_size(s->extents[i].file->filename);
> +        bs->total_sectors += s->extents[i].sectors;
>      }
> 
>      /* read the L1 table */
> -    l1_size = s->l1_size * sizeof(uint32_t);
> -    s->l1_table = qemu_malloc(l1_size);
> -    if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
> l1_size) != l1_size)
> +    l1_size = extent->l1_size * sizeof(uint32_t);
> +    extent->l1_table = qemu_malloc(l1_size);
> +    if (bdrv_pread(bs->file,
> +            extent->l1_table_offset,
> +            extent->l1_table,
> +            l1_size)
> +        != l1_size)
>          goto fail;
> -    for(i = 0; i < s->l1_size; i++) {
> -        le32_to_cpus(&s->l1_table[i]);
> +    for(i = 0; i < extent->l1_size; i++) {
> +        le32_to_cpus(&extent->l1_table[i]);
>      }
> 
> -    if (s->l1_backup_table_offset) {
> -        s->l1_backup_table = qemu_malloc(l1_size);
> -        if (bdrv_pread(bs->file, s->l1_backup_table_offset,
> s->l1_backup_table, l1_size) != l1_size)
> +    if (extent->l1_backup_table_offset) {
> +        extent->l1_backup_table = qemu_malloc(l1_size);
> +        if (bdrv_pread(bs->file,
> +                extent->l1_backup_table_offset,
> +                extent->l1_backup_table,
> +                l1_size)
> +            != l1_size)
>              goto fail;
> -        for(i = 0; i < s->l1_size; i++) {
> -            le32_to_cpus(&s->l1_backup_table[i]);
> +        for(i = 0; i < extent->l1_size; i++) {
> +            le32_to_cpus(&extent->l1_backup_table[i]);
>          }
>      }
> 
> -    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
> +    extent->l2_cache =
> +        qemu_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
>      return 0;
>   fail:
> -    qemu_free(s->l1_backup_table);
> -    qemu_free(s->l1_table);
> -    qemu_free(s->l2_cache);
> +    if (extent) {
> +        qemu_free(extent->l1_backup_table);
> +        qemu_free(extent->l1_table);
> +        qemu_free(extent->l2_cache);
> +    }
>      return -1;
>  }
> 
> -static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
> -                                   uint64_t offset, int allocate);
> -
> -static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
> -                             uint64_t offset, int allocate)
> +static uint64_t get_cluster_offset(BlockDriverState *bs,
> +                    VmdkExtent *extent,
> +                    VmdkMetaData *m_data,
> +                    uint64_t offset,
> +                    int allocate);
> +
> +static int get_whole_cluster(BlockDriverState *bs,
> +                VmdkExtent *extent,
> +                uint64_t cluster_offset,
> +                uint64_t offset,
> +                int allocate)
>  {
> -    BDRVVmdkState *s = bs->opaque;
> -    uint8_t  whole_grain[s->cluster_sectors*512];        // 128
> sectors * 512 bytes each = grain size 64KB
> +    // 128 sectors * 512 bytes each = grain size 64KB
> +    uint8_t  whole_grain[extent->cluster_sectors * 512];
> 
>      // we will be here if it's first write on non-exist grain(cluster).
>      // try to read from parent image, if exist
>      if (bs->backing_hd) {
>          int ret;
> 
> -        if (!vmdk_is_cid_valid(bs))
> +        if (!vmdk_is_cid_valid(bs, extent))
>              return -1;
> -
> +        // floor offset to cluster
> +        offset -= offset % (extent->cluster_sectors * 512);

Hm, is this really related to monolithic flat?

>          ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
> -            s->cluster_sectors);
> +                extent->cluster_sectors);
>          if (ret < 0) {
>              return -1;
>          }
> 
>          //Write grain only into the active image
> -        ret = bdrv_write(bs->file, cluster_offset, whole_grain,
> -            s->cluster_sectors);
> +        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
> +                extent->cluster_sectors);
>          if (ret < 0) {
>              return -1;
>          }
> @@ -465,52 +657,62 @@ static int get_whole_cluster(BlockDriverState
> *bs, uint64_t cluster_offset,
>      return 0;
>  }
> 
> -static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
> +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>  {
> -    BDRVVmdkState *s = bs->opaque;
> -
> +    if (extent->flat) return -1;
>      /* update L2 table */
> -    if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset * 512)
> + (m_data->l2_index * sizeof(m_data->offset)),
> -                    &(m_data->offset), sizeof(m_data->offset)) < 0)
> +    if (bdrv_pwrite_sync(
> +            extent->file,
> +            ((int64_t)m_data->l2_offset * 512)
> +                + (m_data->l2_index * sizeof(m_data->offset)),
> +            &(m_data->offset),
> +            sizeof(m_data->offset)
> +        ) < 0)
>          return -1;
>      /* update backup L2 table */
> -    if (s->l1_backup_table_offset != 0) {
> -        m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
> -        if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset *
> 512) + (m_data->l2_index * sizeof(m_data->offset)),
> -                        &(m_data->offset), sizeof(m_data->offset)) < 0)
> +    if (extent->l1_backup_table_offset != 0) {
> +        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
> +        if (bdrv_pwrite_sync(
> +                extent->file,
> +                ((int64_t)m_data->l2_offset * 512)
> +                    + (m_data->l2_index * sizeof(m_data->offset)),
> +                &(m_data->offset), sizeof(m_data->offset)
> +            ) < 0)
>              return -1;
>      }
> 
>      return 0;
>  }
> 
> -static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
> +static uint64_t get_cluster_offset(BlockDriverState *bs,
> +                                    VmdkExtent *extent,
> +                                    VmdkMetaData *m_data,
>                                     uint64_t offset, int allocate)
>  {
> -    BDRVVmdkState *s = bs->opaque;
>      unsigned int l1_index, l2_offset, l2_index;
>      int min_index, i, j;
>      uint32_t min_count, *l2_table, tmp = 0;
>      uint64_t cluster_offset;
> 
> +    if (extent->flat) return 0;
>      if (m_data)
>          m_data->valid = 0;
> 
> -    l1_index = (offset >> 9) / s->l1_entry_sectors;
> -    if (l1_index >= s->l1_size)
> +    l1_index = (offset >> 9) / extent->l1_entry_sectors;
> +    if (l1_index >= extent->l1_size)
>          return 0;
> -    l2_offset = s->l1_table[l1_index];
> +    l2_offset = extent->l1_table[l1_index];
>      if (!l2_offset)
>          return 0;
>      for(i = 0; i < L2_CACHE_SIZE; i++) {
> -        if (l2_offset == s->l2_cache_offsets[i]) {
> +        if (l2_offset == extent->l2_cache_offsets[i]) {
>              /* increment the hit count */
> -            if (++s->l2_cache_counts[i] == 0xffffffff) {
> +            if (++extent->l2_cache_counts[i] == 0xffffffff) {
>                  for(j = 0; j < L2_CACHE_SIZE; j++) {
> -                    s->l2_cache_counts[j] >>= 1;
> +                    extent->l2_cache_counts[j] >>= 1;
>                  }
>              }
> -            l2_table = s->l2_cache + (i * s->l2_size);
> +            l2_table = extent->l2_cache + (i * extent->l2_size);
>              goto found;
>          }
>      }
> @@ -518,20 +720,24 @@ static uint64_t
> get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
>      min_index = 0;
>      min_count = 0xffffffff;
>      for(i = 0; i < L2_CACHE_SIZE; i++) {
> -        if (s->l2_cache_counts[i] < min_count) {
> -            min_count = s->l2_cache_counts[i];
> +        if (extent->l2_cache_counts[i] < min_count) {
> +            min_count = extent->l2_cache_counts[i];
>              min_index = i;
>          }
>      }
> -    l2_table = s->l2_cache + (min_index * s->l2_size);
> -    if (bdrv_pread(bs->file, (int64_t)l2_offset * 512, l2_table,
> s->l2_size * sizeof(uint32_t)) !=
> -
>   s->l2_size * sizeof(uint32_t))
> +    l2_table = extent->l2_cache + (min_index * extent->l2_size);
> +    if (bdrv_pread(
> +            extent->file,
> +            (int64_t)l2_offset * 512,
> +            l2_table,
> +            extent->l2_size * sizeof(uint32_t)
> +        ) != extent->l2_size * sizeof(uint32_t))
>          return 0;
> 
> -    s->l2_cache_offsets[min_index] = l2_offset;
> -    s->l2_cache_counts[min_index] = 1;
> +    extent->l2_cache_offsets[min_index] = l2_offset;
> +    extent->l2_cache_counts[min_index] = 1;
>   found:
> -    l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
> +    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
>      cluster_offset = le32_to_cpu(l2_table[l2_index]);
> 
>      if (!cluster_offset) {
> @@ -539,8 +745,11 @@ static uint64_t
> get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
>              return 0;
> 
>          // Avoid the L2 tables update for the images that have snapshots.
> -        cluster_offset = bdrv_getlength(bs->file);
> -        bdrv_truncate(bs->file, cluster_offset + (s->cluster_sectors << 9));
> +        cluster_offset = bdrv_getlength(extent->file);
> +        bdrv_truncate(
> +            extent->file,
> +            cluster_offset + (extent->cluster_sectors << 9)
> +        );
> 
>          cluster_offset >>= 9;
>          tmp = cpu_to_le32(cluster_offset);
> @@ -551,7 +760,8 @@ static uint64_t
> get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
>           * This problem may occur because of insufficient space on host disk
>           * or inappropriate VM shutdown.
>           */
> -        if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
> +        if (get_whole_cluster(
> +                bs, extent, cluster_offset, offset, allocate) == -1)
>              return 0;
> 
>          if (m_data) {
> @@ -570,35 +780,68 @@ static int vmdk_is_allocated(BlockDriverState
> *bs, int64_t sector_num,
>                               int nb_sectors, int *pnum)
>  {
>      BDRVVmdkState *s = bs->opaque;
> -    int index_in_cluster, n;
> -    uint64_t cluster_offset;
> -
> -    cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
> -    index_in_cluster = sector_num % s->cluster_sectors;
> -    n = s->cluster_sectors - index_in_cluster;
> +    int ext_idx = 0;
> +    int index_in_cluster, n, ret;
> +    uint64_t offset;
> +    VmdkExtent *extent;

> +    while (ext_idx < s->extent_size
> +            && sector_num >= s->extents[ext_idx].sectors) {
> +        sector_num -= s->extents[ext_idx].sectors;
> +        ext_idx++;
> +    }

This looks like a pattern that you'll need in more places. Probably
makes sense to turn it into a function.

> +    if (ext_idx == s->extent_size) return 0;
> +    extent = &s->extents[ext_idx];
> +    if (s->extents[ext_idx].flat) {
> +        n = extent->sectors - sector_num;
> +        ret = 1;
> +    } else {
> +        offset = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0);
> +        index_in_cluster = sector_num % extent->cluster_sectors;
> +        n = extent->cluster_sectors - index_in_cluster;
> +        ret = offset ? 1 : 0;
> +    }
>      if (n > nb_sectors)
>          n = nb_sectors;
>      *pnum = n;
> -    return (cluster_offset != 0);
> +    return ret;
>  }
> 
>  static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
>                      uint8_t *buf, int nb_sectors)
>  {
>      BDRVVmdkState *s = bs->opaque;
> -    int index_in_cluster, n, ret;
> +    int ext_idx = 0;
> +    int ret;
> +    uint64_t n, index_in_cluster;
> +    VmdkExtent *extent;
>      uint64_t cluster_offset;
> 
>      while (nb_sectors > 0) {
> -        cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
> -        index_in_cluster = sector_num % s->cluster_sectors;
> -        n = s->cluster_sectors - index_in_cluster;
> +        while (ext_idx < s->extent_size
> +                && sector_num >= s->extents[ext_idx].sectors) {
> +            sector_num -= s->extents[ext_idx].sectors;
> +            ext_idx++;
> +        }
> +        if (ext_idx == s->extent_size) {
> +            return -1;
> +        }
> +        extent = &s->extents[ext_idx];
> +        if (extent->flat)
> +            cluster_offset = 0;
> +        else
> +            cluster_offset = get_cluster_offset(
> +                                bs, extent, NULL, sector_num << 9, 0);

I think you should move this if into get_cluster_offset.

> +
> +        cluster_offset =
> +            get_cluster_offset(bs, extent, NULL, sector_num << 9, 0);

Huh, so you call get_cluster_offset twice?

> +        index_in_cluster = sector_num % extent->cluster_sectors;
> +        n = extent->cluster_sectors - index_in_cluster;
>          if (n > nb_sectors)
>              n = nb_sectors;
> -        if (!cluster_offset) {
> +        if (!extent->flat && !cluster_offset) {

Maybe this shows that cluster_offset == 0 for an unallocated cluster
isn't a good convention any more as you add more subformats where a
cluster_offset of 0 is perfectly valid. We could consider changing this
in another patch.

>              // try to read from parent image, if exist
>              if (bs->backing_hd) {
> -                if (!vmdk_is_cid_valid(bs))
> +                if (!vmdk_is_cid_valid(bs, extent))
>                      return -1;
>                  ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
>                  if (ret < 0)
> @@ -607,7 +850,12 @@ static int vmdk_read(BlockDriverState *bs,
> int64_t sector_num,
>                  memset(buf, 0, 512 * n);
>              }
>          } else {
> -            if(bdrv_pread(bs->file, cluster_offset + index_in_cluster
> * 512, buf, n * 512) != n * 512)
> +            if(bdrv_pread(
> +                    extent->file,
> +                    cluster_offset + index_in_cluster * 512,
> +                    buf,
> +                    n * 512
> +                ) != n * 512)
>                  return -1;
>          }
>          nb_sectors -= n;
> @@ -621,11 +869,11 @@ static int vmdk_write(BlockDriverState *bs,
> int64_t sector_num,
>                       const uint8_t *buf, int nb_sectors)
>  {
>      BDRVVmdkState *s = bs->opaque;
> -    VmdkMetaData m_data;
> -    int index_in_cluster, n;
> +    VmdkExtent *extent;
> +    int ext_idx = 0;
> +    int n, index_in_cluster;
>      uint64_t cluster_offset;
> -    static int cid_update = 0;
> -
> +    VmdkMetaData m_data;
>      if (sector_num > bs->total_sectors) {
>          fprintf(stderr,
>                  "(VMDK) Wrong offset: sector_num=0x%" PRIx64
> @@ -635,19 +883,40 @@ static int vmdk_write(BlockDriverState *bs,
> int64_t sector_num,
>      }
> 
>      while (nb_sectors > 0) {
> -        index_in_cluster = sector_num & (s->cluster_sectors - 1);
> -        n = s->cluster_sectors - index_in_cluster;
> +        while (ext_idx < s->extent_size
> +            && sector_num >= s->extents[ext_idx].sectors) {
> +            sector_num -= s->extents[ext_idx].sectors;
> +            ext_idx++;
> +        }
> +        if (ext_idx == s->extent_size) {
> +            return -1;
> +        }
> +        extent = &s->extents[ext_idx];
> +        if (extent->flat)
> +            cluster_offset = 0;
> +        else
> +            cluster_offset = get_cluster_offset(
> +                                bs,
> +                                extent,
> +                                &m_data,
> +                                sector_num << 9, 1);
> +
> +        if (!extent->flat && !cluster_offset)
> +            return -1;
> +        index_in_cluster = sector_num & (extent->cluster_sectors - 1);
> +        n = extent->cluster_sectors - index_in_cluster;
>          if (n > nb_sectors)
>              n = nb_sectors;
> -        cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
> -        if (!cluster_offset)
> -            return -1;
> 
> -        if (bdrv_pwrite(bs->file, cluster_offset + index_in_cluster *
> 512, buf, n * 512) != n * 512)
> +        if (bdrv_pwrite(
> +                extent->file,
> +                cluster_offset + index_in_cluster * 512,
> +                buf, n * 512
> +            ) != n * 512)
>              return -1;
> -        if (m_data.valid) {
> +        if (!extent->flat && m_data.valid) {
>              /* update L2 tables */
> -            if (vmdk_L2update(bs, &m_data) == -1)
> +            if (vmdk_L2update(extent, &m_data) == -1)
>                  return -1;
>          }
>          nb_sectors -= n;
> @@ -655,9 +924,9 @@ static int vmdk_write(BlockDriverState *bs,
> int64_t sector_num,
>          buf += n * 512;
> 
>          // update CID on the first write every time the virtual disk is opened
> -        if (!cid_update) {
> +        if (!s->cid_update) {
>              vmdk_write_cid(bs, time(NULL));
> -            cid_update++;
> +            s->cid_update++;
>          }
>      }
>      return 0;
> @@ -665,32 +934,17 @@ static int vmdk_write(BlockDriverState *bs,
> int64_t sector_num,
> 
>  static int vmdk_create(const char *filename, QEMUOptionParameter *options)
>  {
> -    int fd, i;
> +    int fd;
> +    int64_t i;
>      VMDK4Header header;
>      uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
> -    static const char desc_template[] =
> -        "# Disk DescriptorFile\n"
> -        "version=1\n"
> -        "CID=%x\n"
> -        "parentCID=ffffffff\n"
> -        "createType=\"monolithicSparse\"\n"
> -        "\n"
> -        "# Extent description\n"
> -        "RW %" PRId64 " SPARSE \"%s\"\n"
> -        "\n"
> -        "# The Disk Data Base \n"
> -        "#DDB\n"
> -        "\n"
> -        "ddb.virtualHWVersion = \"%d\"\n"
> -        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
> -        "ddb.geometry.heads = \"16\"\n"
> -        "ddb.geometry.sectors = \"63\"\n"
> -        "ddb.adapterType = \"ide\"\n";
> +
>      char desc[1024];
>      const char *real_filename, *temp_str;
>      int64_t total_size = 0;
>      const char *backing_file = NULL;
>      int flags = 0;
> +    int flat = 0;
>      int ret;
> 
>      // Read out options
> @@ -701,16 +955,125 @@ static int vmdk_create(const char *filename,
> QEMUOptionParameter *options)
>              backing_file = options->value.s;
>          } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
>              flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
> +        } else if (!strcmp(options->name, BLOCK_OPT_FLAT)) {
> +            flat = options->value.n;
>          }
>          options++;
>      }
> 
> +    if (flat) {

Make the whole following block a separate function.

> +        const char desc_template[] =
> +        "# Disk DescriptorFile\n"
> +        "version=1\n"
> +        "CID=%x\n"
> +        "parentCID=ffffffff\n"
> +        "createType=\"monolithicFlat\"\n"
> +        "\n"
> +        "# Extent description\n"
> +        "RW %" PRId64 " FLAT \"%s\" 0\n"
> +        "\n"
> +        "# The Disk Data Base \n"
> +        "#DDB\n"
> +        "\n"
> +        "ddb.virtualHWVersion = \"%d\"\n"
> +        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
> +        "ddb.geometry.heads = \"16\"\n"
> +        "ddb.geometry.sectors = \"63\"\n"
> +        "ddb.adapterType = \"ide\"\n";
> +        char ext_filename[1024];
> +        strncpy(ext_filename, filename, 1024);
> +        ext_filename[1023] = '\0';
> +        if (!strcmp(&ext_filename[strlen(ext_filename) - 5], ".vmdk"))
> +            strcpy(&ext_filename[strlen(ext_filename) - 5], "-flat.vmdk");
> +        else
> +            strcat(ext_filename, "-flat.vmdk");
> +        /* create extent first */
> +        fd = open(
> +                ext_filename,
> +                O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
> +                0644);
> +        if (fd < 0)
> +            return -errno;
> +        ret = ftruncate(fd, total_size * 512);
> +        if (ret) goto exit;
> +#if VMDK_FLAT_BACKING
> +        if (backing_file) {
> +            uint8_t buf[512];
> +            BlockDriverState *bf = bdrv_new("");
> +            if (!bf) {
> +                ret = -1;
> +                goto exit;
> +            }
> +            ret = bdrv_open(bf, backing_file, BDRV_O_RDWR, NULL);
> +            if (ret) {
> +                bdrv_delete(bf);
> +                goto exit;
> +            }
> +            for (i = 0; i < total_size; i++) {
> +                int num;
> +                if (bdrv_is_allocated(bf, i, 1, &num)) {
> +                    ret = bdrv_read(bf, i, buf, 1);
> +                    if (ret)
> +                    {
> +                        bdrv_delete(bf);
> +                        goto exit;
> +                    }
> +                    ret = pwrite(fd, buf, 512, i * 512);
> +                    if (ret != 512)
> +                    {
> +                        bdrv_delete(bf);
> +                        goto exit;
> +                    }
> +                }
> +            }
> +        }
> +#endif
> +        close(fd);
> +
> +        /* generate descriptor file */
> +        snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
> +                 total_size, ext_filename,
> +                 (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
> +                 total_size / (int64_t)(63 * 16));
> +        fd = open(
> +                filename,
> +                O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
> +                0644);
> +        if (fd < 0)
> +            return -errno;
> +        ret = qemu_write_full(fd, desc, strlen(desc));
> +        if (ret != strlen(desc)) {
> +            ret = -errno;
> +            goto exit;
> +        }
> +        ret = 0;
> +    } else {
> +        const char desc_template[] =
> +        "# Disk DescriptorFile\n"
> +        "version=1\n"
> +        "CID=%x\n"
> +        "parentCID=ffffffff\n"
> +        "createType=\"monolithicSparse\"\n"
> +        "\n"
> +        "# Extent description\n"
> +        "RW %" PRId64 " SPARSE \"%s\"\n"
> +        "\n"
> +        "# The Disk Data Base \n"
> +        "#DDB\n"
> +        "\n"
> +        "ddb.virtualHWVersion = \"%d\"\n"
> +        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
> +        "ddb.geometry.heads = \"16\"\n"
> +        "ddb.geometry.sectors = \"63\"\n"
> +        "ddb.adapterType = \"ide\"\n";
>      /* XXX: add support for backing file */
>      if (backing_file) {
>          return vmdk_snapshot_create(filename, backing_file);
>      }
> 
> -    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
> +        fd = open(
> +            filename,
> +            O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
>                0644);
>      if (fd < 0)
>          return -errno;
> @@ -724,7 +1087,8 @@ static int vmdk_create(const char *filename,
> QEMUOptionParameter *options)
> 
>      grains = (total_size + header.granularity - 1) / header.granularity;
>      gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
> -    gt_count = (grains + header.num_gtes_per_gte - 1) /
> header.num_gtes_per_gte;
> +        gt_count =
> +            (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
>      gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
> 
>      header.desc_offset = 1;
> @@ -807,8 +1171,8 @@ static int vmdk_create(const char *filename,
> QEMUOptionParameter *options)
>          ret = -errno;
>          goto exit;
>      }
> -
>      ret = 0;
> +    }
>  exit:
>      close(fd);
>      return ret;
> @@ -817,14 +1181,20 @@ exit:
>  static void vmdk_close(BlockDriverState *bs)
>  {
>      BDRVVmdkState *s = bs->opaque;
> -
> -    qemu_free(s->l1_table);
> -    qemu_free(s->l2_cache);
> +    if (s->extent_size)
> +        qemu_free(s->extents);

Can you even have a state without an allocated s->extents?

>  }
> 
>  static int vmdk_flush(BlockDriverState *bs)
>  {
> -    return bdrv_flush(bs->file);
> +    int ret = 0;
> +    int i;
> +    BDRVVmdkState *s = bs->opaque;
> +    ret = bdrv_flush(bs->file);
> +    for (i = 0; i <  s->extent_size; i++)
> +        ret |= bdrv_flush(s->extents[i].file);
> +    return ret;
> +
>  }
> 
> 
> @@ -844,6 +1214,11 @@ static QEMUOptionParameter vmdk_create_options[] = {
>          .type = OPT_FLAG,
>          .help = "VMDK version 6 image"
>      },
> +    {
> +        .name = BLOCK_OPT_FLAT,
> +        .type = OPT_FLAG,
> +        .help = "VMDK flat extent image"
> +    },

What are the other subformats that will be added? Do all or most of them
have something like both flat and sparse variants, so that this option
is universally applicable?

I'm asking to make sure that maybe three or four flags really make more
sense than just a subformat name.

Kevin
Kevin Wolf - May 30, 2011, 12:28 p.m.
Am 30.05.2011 09:49, schrieb Fam Zheng:
> VMDK multiple file images can not be recognized for now. This patch is
> adding monolithic flat support to it, that is the image type with two
> files, one text descriptor file and a plain data file. This type of
> image can be created in VMWare, with the options "allocate all disk
> space now" and "store virtual disk as a single file" checked.
> 
> A VmdkExtent structure is introduced to hold the image "extent"
> information, which makes further adding multi extents support of VMDK
> easy. An image creating option "flat" is added for creating flat
> (preallocated) image.
> 
> Signed-off-by: Feiran (Fam) Zheng <famcool@gmail.com>

Ok, seems I commented on so many details that in the end I forgot to add
the general comment. :-)

I think this patch is too big to be well reviewable. You should always
try to make only one logical change in one patch. I think you can split
this at least in two parts: First adding the VmdkExtent data structure
without adding any new functionality, and second adding the monolithic
flat support. Depending on how big the second patch is, you can split it
further into image creation and the rest (or any other split that you
feel is natural).

On two or three hunks I commented that they probably aren't required for
monolithic flat support. They should be separate bugfix or cleanup patches.

Kevin

Patch

diff --git a/block/vmdk.c b/block/vmdk.c
index 8fc9d67..726ad3a 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -30,6 +30,8 @@ 
 #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
 #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')

+#define VMDK_FLAT_BACKING 0
+
 typedef struct {
     uint32_t version;
     uint32_t flags;
@@ -60,7 +62,10 @@  typedef struct {

 #define L2_CACHE_SIZE 16

-typedef struct BDRVVmdkState {
+typedef struct VmdkExtent {
+    BlockDriverState *file;
+    int flat;
+    int64_t sectors;
     int64_t l1_table_offset;
     int64_t l1_backup_table_offset;
     uint32_t *l1_table;
@@ -73,8 +78,15 @@  typedef struct BDRVVmdkState {
     uint32_t l2_cache_offsets[L2_CACHE_SIZE];
     uint32_t l2_cache_counts[L2_CACHE_SIZE];

-    unsigned int cluster_sectors;
     uint32_t parent_cid;
+    unsigned int cluster_sectors;
+} VmdkExtent;
+
+typedef struct BDRVVmdkState {
+    int has_descriptor_file;
+    int extent_size;
+    int cid_update;
+    VmdkExtent *extents;
 } BDRVVmdkState;

 typedef struct VmdkMetaData {
@@ -88,21 +100,30 @@  typedef struct VmdkMetaData {
 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
     uint32_t magic;
-
+    char *cid_p, *ct_p, *extent_p;
+    char cid_str[] = "CID";
+    char ct_str[] = "createType";
+    char extent_str[] = "RW";
     if (buf_size < 4)
         return 0;
     magic = be32_to_cpu(*(uint32_t *)buf);
     if (magic == VMDK3_MAGIC ||
-        magic == VMDK4_MAGIC)
+        magic == VMDK4_MAGIC) {
         return 100;
-    else
+    } else {
+        cid_p = strstr((char *)buf, cid_str);
+        ct_p = strstr((char *)buf, ct_str);
+        extent_p = strstr((char *)buf, extent_str);
+        if (cid_p && ct_p && extent_p)
+            return 100;
+    }
         return 0;
 }

 #define CHECK_CID 1

 #define SECTOR_SIZE 512
-#define DESC_SIZE 20*SECTOR_SIZE	// 20 sectors of 512 bytes each
+#define DESC_SIZE (20 * SECTOR_SIZE)    // 20 sectors of 512 bytes each
 #define HEADER_SIZE 512   			// first sector of 512 bytes

 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
@@ -111,11 +132,11 @@  static uint32_t vmdk_read_cid(BlockDriverState
*bs, int parent)
     uint32_t cid;
     const char *p_name, *cid_str;
     size_t cid_str_size;
+    BDRVVmdkState *s = bs->opaque;
+    int desc_offset = s->has_descriptor_file ? 0 : 0x200;

-    /* the descriptor offset = 0x200 */
-    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+    if (bdrv_pread(bs->file, desc_offset, desc, DESC_SIZE) != DESC_SIZE)
         return 0;
-
     if (parent) {
         cid_str = "parentCID";
         cid_str_size = sizeof("parentCID");
@@ -128,19 +149,52 @@  static uint32_t vmdk_read_cid(BlockDriverState
*bs, int parent)
         p_name += cid_str_size;
         sscanf(p_name,"%x",&cid);
     }
-
     return cid;
 }

+#ifdef _WIN32
+static int64_t get_file_size(const char *filename)
+{
+    typedef DWORD (WINAPI * get_compressed_t)
+                    (const char *filename, DWORD *high);
+    get_compressed_t get_compressed;
+    struct _stati64 st;
+
+    /* WinNT support GetCompressedFileSize to determine allocate size */
+    get_compressed = (get_compressed_t)
+        GetProcAddress(GetModuleHandle("kernel32"), "GetCompressedFileSizeA");
+    if (get_compressed) {
+        DWORD high, low;
+        low = get_compressed(filename, &high);
+        if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR)
+        return (((int64_t) high) << 32) + low;
+    }
+
+    if (_stati64(filename, &st) < 0)
+        return -1;
+    return st.st_size;
+}
+#else
+static int64_t get_file_size(const char *filename)
+{
+    struct stat st;
+    if (stat(filename, &st) < 0)
+        return -1;
+    return st.st_size;
+}
+#endif
+
 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
 {
     char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
     char *p_name, *tmp_str;
-
-    /* the descriptor offset = 0x200 */
-    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+    BDRVVmdkState * s = bs->opaque;
+    int desc_offset = s->has_descriptor_file ? 0 : 0x200;
+    int desc_size = s->has_descriptor_file ?
+        get_file_size(bs->file->filename) : DESC_SIZE;
+    memset(desc, 0, DESC_SIZE);
+    if (bdrv_pread(bs->file, desc_offset, desc, desc_size) != desc_size)
         return -1;
-
     tmp_str = strstr(desc,"parentCID");
     pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
     if ((p_name = strstr(desc,"CID")) != NULL) {
@@ -149,21 +203,20 @@  static int vmdk_write_cid(BlockDriverState *bs,
uint32_t cid)
         pstrcat(desc, sizeof(desc), tmp_desc);
     }

-    if (bdrv_pwrite_sync(bs->file, 0x200, desc, DESC_SIZE) < 0)
+    if (bdrv_pwrite_sync(bs->file, desc_offset, desc, desc_size) < 0)
         return -1;
     return 0;
 }

-static int vmdk_is_cid_valid(BlockDriverState *bs)
+static int vmdk_is_cid_valid(BlockDriverState *bs, VmdkExtent *extent)
 {
 #ifdef CHECK_CID
-    BDRVVmdkState *s = bs->opaque;
     BlockDriverState *p_bs = bs->backing_hd;
     uint32_t cur_pcid;

     if (p_bs) {
-        cur_pcid = vmdk_read_cid(p_bs,0);
-        if (s->parent_cid != cur_pcid)
+        cur_pcid = vmdk_read_cid(p_bs, 0);
+        if (extent->parent_cid != cur_pcid)
             // CID not valid
             return 0;
     }
@@ -198,7 +251,9 @@  static int vmdk_snapshot_create(const char
*filename, const char *backing_file)
     "#DDB\n"
     "\n";

-    snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY |
O_LARGEFILE, 0644);
+    snp_fd = open(filename,
+                O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+                0644);
     if (snp_fd < 0)
         return -errno;
     p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
@@ -277,7 +332,8 @@  static int vmdk_snapshot_create(const char
*filename, const char *backing_file)
      * Each GDE span 32M disk, means:
      * 512 GTE per GT, each GTE points to grain
      */
-    gt_size = (int64_t)header.num_gtes_per_gte * header.granularity *
SECTOR_SIZE;
+    gt_size =
+        (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
     if (!gt_size) {
         ret = -EINVAL;
         goto fail;
@@ -338,9 +394,9 @@  static int vmdk_parent_open(BlockDriverState *bs)
 {
     char *p_name;
     char desc[DESC_SIZE];
-
-    /* the descriptor offset = 0x200 */
-    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+    BDRVVmdkState *s = bs->opaque;
+    int desc_offset = s->has_descriptor_file ? 0 : 0x200;
+    if (bdrv_pread(bs->file, desc_offset, desc, DESC_SIZE) != DESC_SIZE)
         return -1;

     if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
@@ -358,106 +414,242 @@  static int vmdk_parent_open(BlockDriverState *bs)
     return 0;
 }

+/* find an option value out of descriptor file */
+static int vmdk_parse_description(const char *desc, const char *opt_name,
+        char *buf, int buf_size)
+{
+    char *opt_pos = strstr(desc, opt_name);
+    int r;
+    if (!opt_pos) return -1;
+    opt_pos += strlen(opt_name) + 2;
+    r = sscanf(opt_pos, "%[^\"]s", buf);
+    assert(r <= buf_size);
+    return r <= 0;
+}
+
+static int vmdk_parse_extents(const char *desc, VmdkExtent extents[],
+        const char *desc_file_path)
+{
+    const char *p = desc;
+    int ret = 0;
+    int r;
+    while (*p) {
+        if (strncmp(p, "RW", strlen("RW")) == 0) {
+            /* parse extent line:
+             * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
+             * or
+             * RW [size in sectors] SPARSE "file-name.vmdk"
+             */
+            char access[11] = "";
+            int64_t sectors = 0;
+            char type[11] = "";
+            char fname[512] = "";
+            sscanf(p, "%10s %lld %10s \"%[^\"]512s\"",
+                    access, &sectors, type, fname);
+            if (!(strlen(access) && sectors && strlen(type)
+                        && strlen(fname)))
+                goto cont;
+            if (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) goto cont;
+            if (strcmp(access, "RW")) goto cont;
+
+            ret++;
+            if (!extents) goto cont;
+
+            /* save to extents array */
+            if (!strcmp(type, "FLAT")) {
+                /* FLAT extent */
+                char extent_path[1024];
+                path_combine(extent_path, sizeof(extent_path),
+                        desc_file_path, fname);
+                VmdkExtent *ext = &extents[ret - 1];
+                ext->flat = 1;
+                ext->sectors = sectors;
+                ext->cluster_sectors = sectors;
+                ext->file = bdrv_new("");
+                if (!ext->file) return -1;
+                r = bdrv_open(ext->file,
+                    extent_path,
+                    BDRV_O_RDWR | BDRV_O_NO_BACKING,
+                    NULL);
+                if (r) return -1;
+            } else {
+                /* SPARSE extent, should not be here */
+                assert(0 && "not supported");
+            }
+
+        }
+cont:
+        /* move to next line */
+        while (*p && *p != '\n') p++;
+        p++;
+
+    }
+    return ret;
+}
+
 static int vmdk_open(BlockDriverState *bs, int flags)
 {
     BDRVVmdkState *s = bs->opaque;
     uint32_t magic;
     int l1_size, i;
-
+    VmdkExtent *extent = NULL;
     if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic))
         goto fail;

     magic = be32_to_cpu(magic);
     if (magic == VMDK3_MAGIC) {
         VMDK3Header header;
-
-        if (bdrv_pread(bs->file, sizeof(magic), &header,
sizeof(header)) != sizeof(header))
+        s->extents = qemu_mallocz(sizeof(VmdkExtent));
+        s->extent_size = 1;
+        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
+            != sizeof(header))
             goto fail;
-        s->cluster_sectors = le32_to_cpu(header.granularity);
-        s->l2_size = 1 << 9;
-        s->l1_size = 1 << 6;
-        bs->total_sectors = le32_to_cpu(header.disk_sectors);
-        s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
-        s->l1_backup_table_offset = 0;
-        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
+        extent = s->extents;
+        extent->flat = 0;
+        extent->file = bs->file;
+        extent->cluster_sectors = le32_to_cpu(header.granularity);
+        extent->l2_size = 1 << 9;
+        extent->l1_size = 1 << 6;
+        extent->sectors = le32_to_cpu(header.disk_sectors);
+        extent->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
+        extent->l1_backup_table_offset = 0;
+        extent->l1_entry_sectors = extent->l2_size * extent->cluster_sectors;
     } else if (magic == VMDK4_MAGIC) {
         VMDK4Header header;
-
-        if (bdrv_pread(bs->file, sizeof(magic), &header,
sizeof(header)) != sizeof(header))
+        s->extents = qemu_mallocz(sizeof(VmdkExtent));
+        s->extent_size = 1;
+        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
+            != sizeof(header))
             goto fail;
-        bs->total_sectors = le64_to_cpu(header.capacity);
-        s->cluster_sectors = le64_to_cpu(header.granularity);
-        s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
-        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
-        if (s->l1_entry_sectors <= 0)
+        extent = s->extents;
+        extent->file = bs->file;
+        extent->sectors = le64_to_cpu(header.capacity);
+        extent->cluster_sectors = le64_to_cpu(header.granularity);
+        extent->l2_size = le32_to_cpu(header.num_gtes_per_gte);
+        extent->l1_entry_sectors = extent->l2_size * extent->cluster_sectors;
+        if (extent->l1_entry_sectors <= 0)
             goto fail;
-        s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
-            / s->l1_entry_sectors;
-        s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
-        s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
+        extent->l1_size = (extent->sectors + extent->l1_entry_sectors - 1)
+            / extent->l1_entry_sectors;
+        extent->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
+        extent->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;

         // try to open parent images, if exist
         if (vmdk_parent_open(bs) != 0)
             goto fail;
         // write the CID once after the image creation
-        s->parent_cid = vmdk_read_cid(bs,1);
+        extent->parent_cid = vmdk_read_cid(bs,1);
     } else {
+        char buf[2048];
+        char ct[128];
+        if (bdrv_pread(bs->file, 0, buf, sizeof(buf)) == 0)
+            goto fail;
+        if (0 != vmdk_parse_description(buf, "createType", ct, sizeof(ct)))
+            goto fail;
+        if (0 != strcmp(ct, "monolithicFlat"))
+            goto fail;
+        bs->disk_size += get_file_size(bs->file->filename);
+        s->has_descriptor_file = 1;
+        s->extent_size = vmdk_parse_extents(buf, NULL, NULL);
+        if  (!s->extent_size)
+            goto fail;
+        s->extents = qemu_mallocz(s->extent_size * sizeof(VmdkExtent));
+        extent = s->extents;
+        vmdk_parse_extents(buf, s->extents, bs->file->filename);
+        bs->total_sectors = 0;
+
+        // try to open parent images, if exist
+        if (vmdk_parent_open(bs) != 0)
         goto fail;
+        // write the CID once after the image creation
+        extent->parent_cid = vmdk_read_cid(bs,1);
+
+        for (i = 0; i < s->extent_size; i++)
+        {
+            bs->disk_size += get_file_size(s->extents[i].file->filename);
+            bs->total_sectors += s->extents[i].sectors;
+        }
+        return 0;
+    }
+
+    bs->total_sectors = 0;
+    for (i = 0; i < s->extent_size; i++)
+    {
+        bs->disk_size += get_file_size(s->extents[i].file->filename);
+        bs->total_sectors += s->extents[i].sectors;
     }

     /* read the L1 table */
-    l1_size = s->l1_size * sizeof(uint32_t);
-    s->l1_table = qemu_malloc(l1_size);
-    if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
l1_size) != l1_size)
+    l1_size = extent->l1_size * sizeof(uint32_t);
+    extent->l1_table = qemu_malloc(l1_size);
+    if (bdrv_pread(bs->file,
+            extent->l1_table_offset,
+            extent->l1_table,
+            l1_size)
+        != l1_size)
         goto fail;
-    for(i = 0; i < s->l1_size; i++) {
-        le32_to_cpus(&s->l1_table[i]);
+    for(i = 0; i < extent->l1_size; i++) {
+        le32_to_cpus(&extent->l1_table[i]);
     }

-    if (s->l1_backup_table_offset) {
-        s->l1_backup_table = qemu_malloc(l1_size);
-        if (bdrv_pread(bs->file, s->l1_backup_table_offset,
s->l1_backup_table, l1_size) != l1_size)
+    if (extent->l1_backup_table_offset) {
+        extent->l1_backup_table = qemu_malloc(l1_size);
+        if (bdrv_pread(bs->file,
+                extent->l1_backup_table_offset,
+                extent->l1_backup_table,
+                l1_size)
+            != l1_size)
             goto fail;
-        for(i = 0; i < s->l1_size; i++) {
-            le32_to_cpus(&s->l1_backup_table[i]);
+        for(i = 0; i < extent->l1_size; i++) {
+            le32_to_cpus(&extent->l1_backup_table[i]);
         }
     }

-    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
+    extent->l2_cache =
+        qemu_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
     return 0;
  fail:
-    qemu_free(s->l1_backup_table);
-    qemu_free(s->l1_table);
-    qemu_free(s->l2_cache);
+    if (extent) {
+        qemu_free(extent->l1_backup_table);
+        qemu_free(extent->l1_table);
+        qemu_free(extent->l2_cache);
+    }
     return -1;
 }

-static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
-                                   uint64_t offset, int allocate);
-
-static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
-                             uint64_t offset, int allocate)
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+                    VmdkExtent *extent,
+                    VmdkMetaData *m_data,
+                    uint64_t offset,
+                    int allocate);
+
+static int get_whole_cluster(BlockDriverState *bs,
+                VmdkExtent *extent,
+                uint64_t cluster_offset,
+                uint64_t offset,
+                int allocate)
 {
-    BDRVVmdkState *s = bs->opaque;
-    uint8_t  whole_grain[s->cluster_sectors*512];        // 128
sectors * 512 bytes each = grain size 64KB
+    // 128 sectors * 512 bytes each = grain size 64KB
+    uint8_t  whole_grain[extent->cluster_sectors * 512];

     // we will be here if it's first write on non-exist grain(cluster).
     // try to read from parent image, if exist
     if (bs->backing_hd) {
         int ret;

-        if (!vmdk_is_cid_valid(bs))
+        if (!vmdk_is_cid_valid(bs, extent))
             return -1;
-
+        // floor offset to cluster
+        offset -= offset % (extent->cluster_sectors * 512);
         ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
-            s->cluster_sectors);
+                extent->cluster_sectors);
         if (ret < 0) {
             return -1;
         }

         //Write grain only into the active image
-        ret = bdrv_write(bs->file, cluster_offset, whole_grain,
-            s->cluster_sectors);
+        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
+                extent->cluster_sectors);
         if (ret < 0) {
             return -1;
         }
@@ -465,52 +657,62 @@  static int get_whole_cluster(BlockDriverState
*bs, uint64_t cluster_offset,
     return 0;
 }

-static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
+static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
 {
-    BDRVVmdkState *s = bs->opaque;
-
+    if (extent->flat) return -1;
     /* update L2 table */
-    if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset * 512)
+ (m_data->l2_index * sizeof(m_data->offset)),
-                    &(m_data->offset), sizeof(m_data->offset)) < 0)
+    if (bdrv_pwrite_sync(
+            extent->file,
+            ((int64_t)m_data->l2_offset * 512)
+                + (m_data->l2_index * sizeof(m_data->offset)),
+            &(m_data->offset),
+            sizeof(m_data->offset)
+        ) < 0)
         return -1;
     /* update backup L2 table */
-    if (s->l1_backup_table_offset != 0) {
-        m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
-        if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset *
512) + (m_data->l2_index * sizeof(m_data->offset)),
-                        &(m_data->offset), sizeof(m_data->offset)) < 0)
+    if (extent->l1_backup_table_offset != 0) {
+        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
+        if (bdrv_pwrite_sync(
+                extent->file,
+                ((int64_t)m_data->l2_offset * 512)
+                    + (m_data->l2_index * sizeof(m_data->offset)),
+                &(m_data->offset), sizeof(m_data->offset)
+            ) < 0)
             return -1;
     }

     return 0;
 }

-static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+                                    VmdkExtent *extent,
+                                    VmdkMetaData *m_data,
                                    uint64_t offset, int allocate)
 {
-    BDRVVmdkState *s = bs->opaque;
     unsigned int l1_index, l2_offset, l2_index;
     int min_index, i, j;
     uint32_t min_count, *l2_table, tmp = 0;
     uint64_t cluster_offset;

+    if (extent->flat) return 0;
     if (m_data)
         m_data->valid = 0;

-    l1_index = (offset >> 9) / s->l1_entry_sectors;
-    if (l1_index >= s->l1_size)
+    l1_index = (offset >> 9) / extent->l1_entry_sectors;
+    if (l1_index >= extent->l1_size)
         return 0;
-    l2_offset = s->l1_table[l1_index];
+    l2_offset = extent->l1_table[l1_index];
     if (!l2_offset)
         return 0;
     for(i = 0; i < L2_CACHE_SIZE; i++) {
-        if (l2_offset == s->l2_cache_offsets[i]) {
+        if (l2_offset == extent->l2_cache_offsets[i]) {
             /* increment the hit count */
-            if (++s->l2_cache_counts[i] == 0xffffffff) {
+            if (++extent->l2_cache_counts[i] == 0xffffffff) {
                 for(j = 0; j < L2_CACHE_SIZE; j++) {
-                    s->l2_cache_counts[j] >>= 1;
+                    extent->l2_cache_counts[j] >>= 1;
                 }
             }
-            l2_table = s->l2_cache + (i * s->l2_size);
+            l2_table = extent->l2_cache + (i * extent->l2_size);
             goto found;
         }
     }
@@ -518,20 +720,24 @@  static uint64_t
get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
     min_index = 0;
     min_count = 0xffffffff;
     for(i = 0; i < L2_CACHE_SIZE; i++) {
-        if (s->l2_cache_counts[i] < min_count) {
-            min_count = s->l2_cache_counts[i];
+        if (extent->l2_cache_counts[i] < min_count) {
+            min_count = extent->l2_cache_counts[i];
             min_index = i;
         }
     }
-    l2_table = s->l2_cache + (min_index * s->l2_size);
-    if (bdrv_pread(bs->file, (int64_t)l2_offset * 512, l2_table,
s->l2_size * sizeof(uint32_t)) !=
-
  s->l2_size * sizeof(uint32_t))
+    l2_table = extent->l2_cache + (min_index * extent->l2_size);
+    if (bdrv_pread(
+            extent->file,
+            (int64_t)l2_offset * 512,
+            l2_table,
+            extent->l2_size * sizeof(uint32_t)
+        ) != extent->l2_size * sizeof(uint32_t))
         return 0;

-    s->l2_cache_offsets[min_index] = l2_offset;
-    s->l2_cache_counts[min_index] = 1;
+    extent->l2_cache_offsets[min_index] = l2_offset;
+    extent->l2_cache_counts[min_index] = 1;
  found:
-    l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
+    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
     cluster_offset = le32_to_cpu(l2_table[l2_index]);

     if (!cluster_offset) {
@@ -539,8 +745,11 @@  static uint64_t
get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
             return 0;

         // Avoid the L2 tables update for the images that have snapshots.
-        cluster_offset = bdrv_getlength(bs->file);
-        bdrv_truncate(bs->file, cluster_offset + (s->cluster_sectors << 9));
+        cluster_offset = bdrv_getlength(extent->file);
+        bdrv_truncate(
+            extent->file,
+            cluster_offset + (extent->cluster_sectors << 9)
+        );

         cluster_offset >>= 9;
         tmp = cpu_to_le32(cluster_offset);
@@ -551,7 +760,8 @@  static uint64_t
get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
          * This problem may occur because of insufficient space on host disk
          * or inappropriate VM shutdown.
          */
-        if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
+        if (get_whole_cluster(
+                bs, extent, cluster_offset, offset, allocate) == -1)
             return 0;

         if (m_data) {
@@ -570,35 +780,68 @@  static int vmdk_is_allocated(BlockDriverState
*bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
 {
     BDRVVmdkState *s = bs->opaque;
-    int index_in_cluster, n;
-    uint64_t cluster_offset;
-
-    cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
-    index_in_cluster = sector_num % s->cluster_sectors;
-    n = s->cluster_sectors - index_in_cluster;
+    int ext_idx = 0;
+    int index_in_cluster, n, ret;
+    uint64_t offset;
+    VmdkExtent *extent;
+    while (ext_idx < s->extent_size
+            && sector_num >= s->extents[ext_idx].sectors) {
+        sector_num -= s->extents[ext_idx].sectors;
+        ext_idx++;
+    }
+    if (ext_idx == s->extent_size) return 0;
+    extent = &s->extents[ext_idx];
+    if (s->extents[ext_idx].flat) {
+        n = extent->sectors - sector_num;
+        ret = 1;
+    } else {
+        offset = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0);
+        index_in_cluster = sector_num % extent->cluster_sectors;
+        n = extent->cluster_sectors - index_in_cluster;
+        ret = offset ? 1 : 0;
+    }
     if (n > nb_sectors)
         n = nb_sectors;
     *pnum = n;
-    return (cluster_offset != 0);
+    return ret;
 }

 static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
 {
     BDRVVmdkState *s = bs->opaque;
-    int index_in_cluster, n, ret;
+    int ext_idx = 0;
+    int ret;
+    uint64_t n, index_in_cluster;
+    VmdkExtent *extent;
     uint64_t cluster_offset;

     while (nb_sectors > 0) {
-        cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
-        index_in_cluster = sector_num % s->cluster_sectors;
-        n = s->cluster_sectors - index_in_cluster;
+        while (ext_idx < s->extent_size
+                && sector_num >= s->extents[ext_idx].sectors) {
+            sector_num -= s->extents[ext_idx].sectors;
+            ext_idx++;
+        }
+        if (ext_idx == s->extent_size) {
+            return -1;
+        }
+        extent = &s->extents[ext_idx];
+        if (extent->flat)
+            cluster_offset = 0;
+        else
+            cluster_offset = get_cluster_offset(
+                                bs, extent, NULL, sector_num << 9, 0);
+
+        cluster_offset =
+            get_cluster_offset(bs, extent, NULL, sector_num << 9, 0);
+        index_in_cluster = sector_num % extent->cluster_sectors;
+        n = extent->cluster_sectors - index_in_cluster;
         if (n > nb_sectors)
             n = nb_sectors;
-        if (!cluster_offset) {
+        if (!extent->flat && !cluster_offset) {
             // try to read from parent image, if exist
             if (bs->backing_hd) {
-                if (!vmdk_is_cid_valid(bs))
+                if (!vmdk_is_cid_valid(bs, extent))
                     return -1;
                 ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
                 if (ret < 0)
@@ -607,7 +850,12 @@  static int vmdk_read(BlockDriverState *bs,
int64_t sector_num,
                 memset(buf, 0, 512 * n);
             }
         } else {
-            if(bdrv_pread(bs->file, cluster_offset + index_in_cluster
* 512, buf, n * 512) != n * 512)
+            if(bdrv_pread(
+                    extent->file,
+                    cluster_offset + index_in_cluster * 512,
+                    buf,
+                    n * 512
+                ) != n * 512)
                 return -1;
         }
         nb_sectors -= n;
@@ -621,11 +869,11 @@  static int vmdk_write(BlockDriverState *bs,
int64_t sector_num,
                      const uint8_t *buf, int nb_sectors)
 {
     BDRVVmdkState *s = bs->opaque;
-    VmdkMetaData m_data;
-    int index_in_cluster, n;
+    VmdkExtent *extent;
+    int ext_idx = 0;
+    int n, index_in_cluster;
     uint64_t cluster_offset;
-    static int cid_update = 0;
-
+    VmdkMetaData m_data;
     if (sector_num > bs->total_sectors) {
         fprintf(stderr,
                 "(VMDK) Wrong offset: sector_num=0x%" PRIx64
@@ -635,19 +883,40 @@  static int vmdk_write(BlockDriverState *bs,
int64_t sector_num,
     }

     while (nb_sectors > 0) {
-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        n = s->cluster_sectors - index_in_cluster;
+        while (ext_idx < s->extent_size
+            && sector_num >= s->extents[ext_idx].sectors) {
+            sector_num -= s->extents[ext_idx].sectors;
+            ext_idx++;
+        }
+        if (ext_idx == s->extent_size) {
+            return -1;
+        }
+        extent = &s->extents[ext_idx];
+        if (extent->flat)
+            cluster_offset = 0;
+        else
+            cluster_offset = get_cluster_offset(
+                                bs,
+                                extent,
+                                &m_data,
+                                sector_num << 9, 1);
+
+        if (!extent->flat && !cluster_offset)
+            return -1;
+        index_in_cluster = sector_num & (extent->cluster_sectors - 1);
+        n = extent->cluster_sectors - index_in_cluster;
         if (n > nb_sectors)
             n = nb_sectors;
-        cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
-        if (!cluster_offset)
-            return -1;

-        if (bdrv_pwrite(bs->file, cluster_offset + index_in_cluster *
512, buf, n * 512) != n * 512)
+        if (bdrv_pwrite(
+                extent->file,
+                cluster_offset + index_in_cluster * 512,
+                buf, n * 512
+            ) != n * 512)
             return -1;
-        if (m_data.valid) {
+        if (!extent->flat && m_data.valid) {
             /* update L2 tables */
-            if (vmdk_L2update(bs, &m_data) == -1)
+            if (vmdk_L2update(extent, &m_data) == -1)
                 return -1;
         }
         nb_sectors -= n;
@@ -655,9 +924,9 @@  static int vmdk_write(BlockDriverState *bs,
int64_t sector_num,
         buf += n * 512;

         // update CID on the first write every time the virtual disk is opened
-        if (!cid_update) {
+        if (!s->cid_update) {
             vmdk_write_cid(bs, time(NULL));
-            cid_update++;
+            s->cid_update++;
         }
     }
     return 0;
@@ -665,32 +934,17 @@  static int vmdk_write(BlockDriverState *bs,
int64_t sector_num,

 static int vmdk_create(const char *filename, QEMUOptionParameter *options)
 {
-    int fd, i;
+    int fd;
+    int64_t i;
     VMDK4Header header;
     uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
-    static const char desc_template[] =
-        "# Disk DescriptorFile\n"
-        "version=1\n"
-        "CID=%x\n"
-        "parentCID=ffffffff\n"
-        "createType=\"monolithicSparse\"\n"
-        "\n"
-        "# Extent description\n"
-        "RW %" PRId64 " SPARSE \"%s\"\n"
-        "\n"
-        "# The Disk Data Base \n"
-        "#DDB\n"
-        "\n"
-        "ddb.virtualHWVersion = \"%d\"\n"
-        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
-        "ddb.geometry.heads = \"16\"\n"
-        "ddb.geometry.sectors = \"63\"\n"
-        "ddb.adapterType = \"ide\"\n";
+
     char desc[1024];
     const char *real_filename, *temp_str;
     int64_t total_size = 0;
     const char *backing_file = NULL;
     int flags = 0;
+    int flat = 0;
     int ret;

     // Read out options
@@ -701,16 +955,125 @@  static int vmdk_create(const char *filename,
QEMUOptionParameter *options)
             backing_file = options->value.s;
         } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
             flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
+        } else if (!strcmp(options->name, BLOCK_OPT_FLAT)) {
+            flat = options->value.n;
         }
         options++;
     }

+    if (flat) {
+        const char desc_template[] =
+        "# Disk DescriptorFile\n"
+        "version=1\n"
+        "CID=%x\n"
+        "parentCID=ffffffff\n"
+        "createType=\"monolithicFlat\"\n"
+        "\n"
+        "# Extent description\n"
+        "RW %" PRId64 " FLAT \"%s\" 0\n"
+        "\n"
+        "# The Disk Data Base \n"
+        "#DDB\n"
+        "\n"
+        "ddb.virtualHWVersion = \"%d\"\n"
+        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
+        "ddb.geometry.heads = \"16\"\n"
+        "ddb.geometry.sectors = \"63\"\n"
+        "ddb.adapterType = \"ide\"\n";
+        char ext_filename[1024];
+        strncpy(ext_filename, filename, 1024);
+        ext_filename[1023] = '\0';
+        if (!strcmp(&ext_filename[strlen(ext_filename) - 5], ".vmdk"))
+            strcpy(&ext_filename[strlen(ext_filename) - 5], "-flat.vmdk");
+        else
+            strcat(ext_filename, "-flat.vmdk");
+        /* create extent first */
+        fd = open(
+                ext_filename,
+                O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+                0644);
+        if (fd < 0)
+            return -errno;
+        ret = ftruncate(fd, total_size * 512);
+        if (ret) goto exit;
+#if VMDK_FLAT_BACKING
+        if (backing_file) {
+            uint8_t buf[512];
+            BlockDriverState *bf = bdrv_new("");
+            if (!bf) {
+                ret = -1;
+                goto exit;
+            }
+            ret = bdrv_open(bf, backing_file, BDRV_O_RDWR, NULL);
+            if (ret) {
+                bdrv_delete(bf);
+                goto exit;
+            }
+            for (i = 0; i < total_size; i++) {
+                int num;
+                if (bdrv_is_allocated(bf, i, 1, &num)) {
+                    ret = bdrv_read(bf, i, buf, 1);
+                    if (ret)
+                    {
+                        bdrv_delete(bf);
+                        goto exit;
+                    }
+                    ret = pwrite(fd, buf, 512, i * 512);
+                    if (ret != 512)
+                    {
+                        bdrv_delete(bf);
+                        goto exit;
+                    }
+                }
+            }
+        }
+#endif
+        close(fd);
+
+        /* generate descriptor file */
+        snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
+                 total_size, ext_filename,
+                 (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
+                 total_size / (int64_t)(63 * 16));
+        fd = open(
+                filename,
+                O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+                0644);
+        if (fd < 0)
+            return -errno;
+        ret = qemu_write_full(fd, desc, strlen(desc));
+        if (ret != strlen(desc)) {
+            ret = -errno;
+            goto exit;
+        }
+        ret = 0;
+    } else {
+        const char desc_template[] =
+        "# Disk DescriptorFile\n"
+        "version=1\n"
+        "CID=%x\n"
+        "parentCID=ffffffff\n"
+        "createType=\"monolithicSparse\"\n"
+        "\n"
+        "# Extent description\n"
+        "RW %" PRId64 " SPARSE \"%s\"\n"
+        "\n"
+        "# The Disk Data Base \n"
+        "#DDB\n"
+        "\n"
+        "ddb.virtualHWVersion = \"%d\"\n"
+        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
+        "ddb.geometry.heads = \"16\"\n"
+        "ddb.geometry.sectors = \"63\"\n"
+        "ddb.adapterType = \"ide\"\n";
     /* XXX: add support for backing file */
     if (backing_file) {
         return vmdk_snapshot_create(filename, backing_file);
     }

-    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+        fd = open(
+            filename,
+            O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
               0644);
     if (fd < 0)
         return -errno;
@@ -724,7 +1087,8 @@  static int vmdk_create(const char *filename,
QEMUOptionParameter *options)

     grains = (total_size + header.granularity - 1) / header.granularity;
     gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
-    gt_count = (grains + header.num_gtes_per_gte - 1) /
header.num_gtes_per_gte;
+        gt_count =
+            (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
     gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;

     header.desc_offset = 1;
@@ -807,8 +1171,8 @@  static int vmdk_create(const char *filename,
QEMUOptionParameter *options)
         ret = -errno;
         goto exit;
     }
-
     ret = 0;
+    }
 exit:
     close(fd);
     return ret;
@@ -817,14 +1181,20 @@  exit:
 static void vmdk_close(BlockDriverState *bs)
 {
     BDRVVmdkState *s = bs->opaque;
-
-    qemu_free(s->l1_table);
-    qemu_free(s->l2_cache);
+    if (s->extent_size)
+        qemu_free(s->extents);
 }

 static int vmdk_flush(BlockDriverState *bs)
 {
-    return bdrv_flush(bs->file);
+    int ret = 0;
+    int i;
+    BDRVVmdkState *s = bs->opaque;
+    ret = bdrv_flush(bs->file);
+    for (i = 0; i <  s->extent_size; i++)
+        ret |= bdrv_flush(s->extents[i].file);
+    return ret;
+
 }


@@ -844,6 +1214,11 @@  static QEMUOptionParameter vmdk_create_options[] = {
         .type = OPT_FLAG,
         .help = "VMDK version 6 image"
     },
+    {
+        .name = BLOCK_OPT_FLAT,
+        .type = OPT_FLAG,
+        .help = "VMDK flat extent image"
+    },
     { NULL }
 };

@@ -862,6 +1237,7 @@  static BlockDriver bdrv_vmdk = {
     .create_options = vmdk_create_options,
 };

+
 static void bdrv_vmdk_init(void)
 {
     bdrv_register(&bdrv_vmdk);
diff --git a/block_int.h b/block_int.h
index fa91337..8df11dd 100644
--- a/block_int.h
+++ b/block_int.h
@@ -34,6 +34,7 @@ 
 #define BLOCK_OPT_SIZE          "size"
 #define BLOCK_OPT_ENCRYPT       "encryption"
 #define BLOCK_OPT_COMPAT6       "compat6"
+#define BLOCK_OPT_FLAT          "flat"
 #define BLOCK_OPT_BACKING_FILE  "backing_file"
 #define BLOCK_OPT_BACKING_FMT   "backing_fmt"
 #define BLOCK_OPT_CLUSTER_SIZE  "cluster_size"
@@ -143,6 +144,7 @@  struct BlockDriver {
 struct BlockDriverState {
     int64_t total_sectors; /* if we are reading a disk image, give its
                               size in sectors */
+    int64_t disk_size; /* if non-zero, holds the total disk size */
     int read_only; /* if true, the media is read only */
     int keep_read_only; /* if true, the media was requested to stay
read only */
     int open_flags; /* flags used to open the file, re-used for re-open */
diff --git a/qemu-img.c b/qemu-img.c
index 4f162d1..0c6c0fb 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -980,12 +980,15 @@  out:
 }

 #ifdef _WIN32
-static int64_t get_allocated_file_size(const char *filename)
+static int64_t get_allocated_file_size(
+        BlockDriverState *bs, const char *filename)
 {
     typedef DWORD (WINAPI * get_compressed_t)(const char *filename,
DWORD *high);
     get_compressed_t get_compressed;
     struct _stati64 st;

+    if (bs->disk_size)
+        return bs->disk_size;
     /* WinNT support GetCompressedFileSize to determine allocate size */
     get_compressed = (get_compressed_t)