Patchwork [v2,3/5] block: initial VHDX driver support framework - supports open and probe

login
register
mail settings
Submitter Jeff Cody
Date April 23, 2013, 2:24 p.m.
Message ID <56a0e0347fde5396fb6b2a260de5b1a867a588d6.1366726446.git.jcody@redhat.com>
Download mbox | patch
Permalink /patch/238923/
State New
Headers show

Comments

Jeff Cody - April 23, 2013, 2:24 p.m.
This is the initial block driver framework for VHDX image support (
i.e. Hyper-V image file formats), that supports opening VHDX files, and
parsing the headers.

This commit does not yet enable:
    - reading
    - writing
    - updating the header
    - differencing files (images with parents)
    - log replay / dirty logs (only clean images)

This is based on Microsoft's VHDX specification:
    "VHDX Format Specification v0.95", published 4/12/2012
    https://www.microsoft.com/en-us/download/details.aspx?id=29681

Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/Makefile.objs |   1 +
 block/vhdx.c        | 769 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/vhdx.h        |  25 ++
 3 files changed, 795 insertions(+)
 create mode 100644 block/vhdx.c
Kevin Wolf - April 23, 2013, 3:46 p.m.
Am 23.04.2013 um 16:24 hat Jeff Cody geschrieben:
> This is the initial block driver framework for VHDX image support (
> i.e. Hyper-V image file formats), that supports opening VHDX files, and
> parsing the headers.
> 
> This commit does not yet enable:
>     - reading
>     - writing
>     - updating the header
>     - differencing files (images with parents)
>     - log replay / dirty logs (only clean images)
> 
> This is based on Microsoft's VHDX specification:
>     "VHDX Format Specification v0.95", published 4/12/2012
>     https://www.microsoft.com/en-us/download/details.aspx?id=29681
> 
> Signed-off-by: Jeff Cody <jcody@redhat.com>

Okay, I need more time actually reading the spec and comparing this code
to it before I can really say anything about the logic (I hope I'll get
to it tomorrow), but for a start, let me point out the obvious things.

> ---
>  block/Makefile.objs |   1 +
>  block/vhdx.c        | 769 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/vhdx.h        |  25 ++
>  3 files changed, 795 insertions(+)
>  create mode 100644 block/vhdx.c
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 6c4b5bc..5f0358a 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -2,6 +2,7 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
>  block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-obj-y += qed-check.o
> +block-obj-y += vhdx.o
>  block-obj-y += parallels.o blkdebug.o blkverify.o
>  block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
>  block-obj-$(CONFIG_POSIX) += raw-posix.o
> diff --git a/block/vhdx.c b/block/vhdx.c
> new file mode 100644
> index 0000000..b0ea2ba
> --- /dev/null
> +++ b/block/vhdx.c
> @@ -0,0 +1,769 @@
> +/*
> + * Block driver for Hyper-V VHDX Images
> + *
> + * Copyright (c) 2013 Red Hat, Inc.,
> + *
> + * Authors:
> + *  Jeff Cody <jcody@redhat.com>
> + *
> + *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
> + *  by Microsoft:
> + *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "block/block_int.h"
> +#include "qemu/module.h"
> +#include "qemu/crc32c.h"
> +#include "block/vhdx.h"
> +
> +
> +/* Several metadata and region table data entries are identified by
> + * guids in  a MS-specific GUID format. */
> +
> +
> +/* ------- Known Region Table GUIDs ---------------------- */
> +static const ms_guid bat_guid =      { .data1 = 0x2dc27766,
> +                                       .data2 = 0xf623,
> +                                       .data3 = 0x4200,
> +                                       .data4 = { 0x9d, 0x64, 0x11, 0x5e,
> +                                                  0x9b, 0xfd, 0x4a, 0x08} };
> +
> +static const ms_guid metadata_guid = { .data1 = 0x8b7ca206,
> +                                       .data2 = 0x4790,
> +                                       .data3 = 0x4b9a,
> +                                       .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
> +                                                  0x05, 0x0f, 0x88, 0x6e} };
> +
> +
> +
> +/* ------- Known Metadata Entry GUIDs ---------------------- */
> +static const ms_guid file_param_guid =   { .data1 = 0xcaa16737,
> +                                           .data2 = 0xfa36,
> +                                           .data3 = 0x4d43,
> +                                           .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
> +                                                      0xaa, 0x44, 0xe7, 0x6b} };
> +
> +static const ms_guid virtual_size_guid = { .data1 = 0x2FA54224,
> +                                           .data2 = 0xcd1b,
> +                                           .data3 = 0x4876,
> +                                           .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
> +                                                      0xd8, 0x3b, 0xf4, 0xb8} };
> +
> +static const ms_guid page83_guid =       { .data1 = 0xbeca12ab,
> +                                           .data2 = 0xb2e6,
> +                                           .data3 = 0x4523,
> +                                           .data4 = { 0x93, 0xef, 0xc3, 0x09,
> +                                                      0xe0, 0x00, 0xc7, 0x46} };
> +
> +
> +static const ms_guid phys_sector_guid =  { .data1 = 0xcda348c7,
> +                                           .data2 = 0x445d,
> +                                           .data3 = 0x4471,
> +                                           .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
> +                                                      0x52, 0x51, 0xc5, 0x56} };
> +
> +static const ms_guid parent_locator_guid = { .data1 = 0xa8d35f2d,
> +                                             .data2 = 0xb30b,
> +                                             .data3 = 0x454d,
> +                                             .data4 = { 0xab, 0xf7, 0xd3,
> +                                                        0xd8, 0x48, 0x34,
> +                                                        0xab, 0x0c} };
> +
> +static const ms_guid logical_sector_guid = { .data1 = 0x8141bf1d,
> +                                             .data2 = 0xa96f,
> +                                             .data3 = 0x4709,
> +                                             .data4 = { 0xba, 0x47, 0xf2,
> +                                                        0x33, 0xa8, 0xfa,
> +                                                        0xab, 0x5f} };
> +
> +/* Each parent type must have a valid GUID; this is for parent images
> + * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
> + * need to make up our own QCOW2 GUID type */
> +static const ms_guid parent_vhdx_guid = { .data1 = 0xb04aefb7,
> +                                          .data2 = 0xd19e,
> +                                          .data3 = 0x4a81,
> +                                          .data4 = { 0xb7, 0x89, 0x25, 0xb8,
> +                                                     0xe9, 0x44, 0x59, 0x13} };
> +
> +
> +#define META_FILE_PARAMETER_PRESENT      0x01
> +#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
> +#define META_PAGE_83_PRESENT             0x04
> +#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
> +#define META_PHYS_SECTOR_SIZE_PRESENT    0x10
> +#define META_PARENT_LOCATOR_PRESENT      0x20
> +
> +#define META_ALL_PRESENT    \
> +    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
> +     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
> +     META_PHYS_SECTOR_SIZE_PRESENT)
> +
> +typedef struct vhdx_metadata_entries {
> +    vhdx_metadata_table_entry file_parameters_entry;
> +    vhdx_metadata_table_entry virtual_disk_size_entry;
> +    vhdx_metadata_table_entry page83_data_entry;
> +    vhdx_metadata_table_entry logical_sector_size_entry;
> +    vhdx_metadata_table_entry phys_sector_size_entry;
> +    vhdx_metadata_table_entry parent_locator_entry;
> +    uint16_t present;
> +} vhdx_metadata_entries;

Should everything before this line actually be in the header?

> +
> +typedef struct BDRVVHDXState {
> +    CoMutex lock;
> +
> +    int curr_header;
> +    vhdx_header *headers[2];
> +
> +    vhdx_region_table_header rt;
> +    vhdx_region_table_entry bat_rt;         /* region table for the BAT */
> +    vhdx_region_table_entry metadata_rt;    /* region table for the metadata */
> +
> +    vhdx_metadata_table_header  metadata_hdr;
> +    vhdx_metadata_entries metadata_entries;
> +
> +    vhdx_file_parameters params;
> +    uint32_t block_size;
> +    uint32_t block_size_bits;
> +    uint32_t sectors_per_block;
> +    uint32_t sectors_per_block_bits;
> +
> +    uint64_t virtual_disk_size;
> +    uint32_t logical_sector_size;
> +    uint32_t physical_sector_size;
> +
> +    uint64_t chunk_ratio;
> +    uint32_t chunk_ratio_bits;
> +    uint32_t logical_sector_size_bits;
> +
> +    uint32_t bat_entries;
> +    vhdx_bat_entry *bat;
> +    uint64_t bat_offset;
> +
> +    vhdx_parent_locator_header parent_header;
> +    vhdx_parent_locator_entry *parent_entries;
> +
> +} BDRVVHDXState;
> +
> +uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
> +                            int crc_offset)
> +{
> +    uint32_t crc_new;
> +    uint32_t crc_orig;
> +    assert(buf != NULL);
> +
> +    if (crc_offset > 0) {
> +        memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));

Please add spaces before and after operators. I saw this a lot in this
series. I'm relatively sure that you can use checkpatch.pl to get all
instances pointed out.

> +        memset(buf+crc_offset, 0, sizeof(crc_orig));
> +    }
> +
> +    crc_new = crc32c(crc, buf, size);
> +    if (crc_offset > 0) {
> +        memcpy(buf+crc_offset, &crc_orig, sizeof(crc_orig));
> +    }
> +
> +    return crc_new;
> +}
> +
> +/* Validates the checksum of the buffer, with an in-place CRC.
> + *
> + * Zero is substituted during crc calculation for the original crc field,
> + * and the crc field is restored afterwards.  But the buffer will be modifed
> + * during the calculation, so this may not be not suitable for multi-threaded
> + * use.
> + *
> + * crc_offset: byte offset in buf of the buffer crc
> + * buf: buffer pointer
> + * size: size of buffer (must be > crc_offset+4)
> + *
> + * returns true if checksum is valid, false otherwise
> + */
> +bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
> +{
> +    uint32_t crc_orig;
> +    uint32_t crc;
> +
> +    assert(buf != NULL);
> +    assert(size > (crc_offset+4));
> +
> +    memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
> +    crc_orig = le32_to_cpu(crc_orig);
> +
> +    crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);
> +
> +    return crc == crc_orig;
> +}
> +
> +
> +/*
> + * Per the MS VHDX Specification, for every VHDX file:
> + *      - The header section is fixed size - 1 MB
> + *      - The header section is always the first "object"
> + *      - The first 64KB of the header is the File Identifier
> + *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
> + *      - The following 512 bytes constitute a UTF-16 string identifiying the
> + *        software that created the file, and is optional and diagnostic only.
> + *
> + *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
> + */
> +static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
> +{
> +    if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) {

There's a VHDX_FILE_ID_MAGIC constant in the header. Don't you want to
use it?

> +        return 100;
> +    }
> +    return 0;
> +}
> +
> +/* All VHDX structures on disk are little endian */
> +static void vhdx_header_le_import(vhdx_header *h)
> +{
> +    assert(h != NULL);
> +
> +    le32_to_cpus(&h->signature);
> +    le32_to_cpus(&h->checksum);
> +    le64_to_cpus(&h->sequence_number);
> +
> +    leguid_to_cpus(&h->file_write_guid);
> +    leguid_to_cpus(&h->data_write_guid);
> +    leguid_to_cpus(&h->log_guid);
> +
> +    le16_to_cpus(&h->log_version);
> +    le16_to_cpus(&h->version);
> +    le32_to_cpus(&h->log_length);
> +    le64_to_cpus(&h->log_offset);
> +}
> +
> +
> +/* opens the specified header block from the VHDX file header section */
> +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    vhdx_header *header1;
> +    vhdx_header *header2;
> +    uint64_t h1_seq = 0;
> +    uint64_t h2_seq = 0;
> +    uint8_t *buffer;
> +
> +    header1 = qemu_blockalign(bs, sizeof(vhdx_header));
> +    header2 = qemu_blockalign(bs, sizeof(vhdx_header));
> +
> +    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
> +
> +    s->headers[0] = header1;
> +    s->headers[1] = header2;
> +
> +    /* We have to read the whole VHDX_HEADER_SIZE instead of
> +     * sizeof(vhdx_header), because the checksum is over the whole
> +     * region */
> +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    /* copy over just the relevant portion that we need */
> +    memcpy(header1, buffer, sizeof(vhdx_header));
> +    vhdx_header_le_import(header1);
> +
> +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> +        header1->signature == VHDX_HDR_MAGIC) {
> +        h1_seq = header1->sequence_number;
> +    }
> +
> +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    /* copy over just the relevant portion that we need */
> +    memcpy(header2, buffer, sizeof(vhdx_header));
> +    vhdx_header_le_import(header2);
> +
> +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> +        header2->signature == VHDX_HDR_MAGIC) {
> +        h2_seq = header2->sequence_number;
> +    }
> +
> +    if (h1_seq > h2_seq) {
> +        s->curr_header = 0;
> +    } else if (h2_seq > h1_seq) {
> +        s->curr_header = 1;
> +    } else {
> +        printf("NO VALID HEADER\n");

Make that an qerror_report() at least.

> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
> +    ret = 0;
> +
> +    goto exit;
> +
> +fail:
> +    qemu_vfree(header1);
> +    qemu_vfree(header2);
> +    s->headers[0] = NULL;
> +    s->headers[1] = NULL;
> +exit:
> +    qemu_vfree(buffer);
> +    return ret;
> +}
> +
> +
> +static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    uint8_t *buffer;
> +    int offset = 0;
> +    vhdx_region_table_entry rt_entry;
> +    int i;
> +
> +    /* We have to read the whole 64KB block, because the crc32 is over the
> +     * whole block */
> +    buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);
> +
> +    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
> +                    VHDX_HEADER_BLOCK_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    memcpy(&s->rt, buffer, sizeof(s->rt));
> +    le32_to_cpus(&s->rt.signature);
> +    le32_to_cpus(&s->rt.checksum);
> +    le32_to_cpus(&s->rt.entry_count);
> +    le32_to_cpus(&s->rt.reserved);
> +    offset += sizeof(s->rt);
> +
> +    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
> +        s->rt.signature != VHDX_RT_MAGIC) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
> +    for (i = 0; i < s->rt.entry_count; i++) {
> +        memcpy(&rt_entry, buffer+offset, sizeof(rt_entry));
> +        offset += sizeof(rt_entry);
> +
> +        leguid_to_cpus(&rt_entry.guid);
> +        le64_to_cpus(&rt_entry.file_offset);
> +        le32_to_cpus(&rt_entry.length);
> +        le32_to_cpus(&rt_entry.data_bits);
> +
> +        /* see if we recognize the entry */
> +        if (guid_eq(rt_entry.guid, bat_guid)) {
> +            s->bat_rt = rt_entry;
> +            continue;
> +        }
> +
> +        if (guid_eq(rt_entry.guid, metadata_guid)) {
> +            s->metadata_rt = rt_entry;
> +            continue;
> +        }
> +
> +        if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
> +            /* cannot read vhdx file - required region table entry that
> +             * we do not understand.  per spec, we must fail to open */
> +            ret = -ENOTSUP;
> +            goto fail;
> +        }
> +    }
> +    ret = 0;
> +
> +fail:
> +    qemu_vfree(buffer);
> +    return ret;
> +}
> +
> +
> +
> +/* Metadata initial parser
> + *
> + * This loads all the metadata entry fields.  This may cause additional
> + * fields to be processed (e.g. parent locator, etc..).
> + *
> + * There are 5 Metadata items that are always required:
> + *      - File Parameters (block size, has a parent)
> + *      - Virtual Disk Size (size, in bytes, of the virtual drive)
> + *      - Page 83 Data (scsi page 83 guid)
> + *      - Logical Sector Size (logical sector size in bytes, either 512 or
> + *                             4096.  We only support 512 currently)
> + *      - Physical Sector Size (512 or 4096)
> + *
> + * Also, if the File Parameters indicate this is a differencing file,
> + * we must also look for the Parent Locator metadata item.
> + */
> +static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    uint8_t *buffer;
> +    int offset = 0;
> +    int i = 0;
> +    uint32_t block_size, sectors_per_block, logical_sector_size;
> +    uint64_t chunk_ratio;
> +    vhdx_metadata_table_entry md_entry;
> +
> +    buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
> +
> +    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
> +                     VHDX_METADATA_TABLE_MAX_SIZE);
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +    memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
> +    offset += sizeof(s->metadata_hdr);
> +
> +    le64_to_cpus(&s->metadata_hdr.signature);
> +    le16_to_cpus(&s->metadata_hdr.reserved);
> +    le16_to_cpus(&s->metadata_hdr.entry_count);
> +
> +    if (s->metadata_hdr.signature != VHDX_METADATA_MAGIC) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    s->metadata_entries.present = 0;
> +
> +    if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
> +        (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
> +        memcpy(&md_entry, buffer+offset, sizeof(md_entry));
> +        offset += sizeof(md_entry);
> +
> +        leguid_to_cpus(&md_entry.item_id);
> +        le32_to_cpus(&md_entry.offset);
> +        le32_to_cpus(&md_entry.length);
> +        le32_to_cpus(&md_entry.data_bits);
> +        le32_to_cpus(&md_entry.reserved2);
> +
> +        if (guid_eq(md_entry.item_id, file_param_guid)) {
> +            s->metadata_entries.file_parameters_entry = md_entry;
> +            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_eq(md_entry.item_id, virtual_size_guid)) {
> +            s->metadata_entries.virtual_disk_size_entry = md_entry;
> +            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_eq(md_entry.item_id, page83_guid)) {
> +            s->metadata_entries.page83_data_entry = md_entry;
> +            s->metadata_entries.present |= META_PAGE_83_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_eq(md_entry.item_id, logical_sector_guid)) {
> +            s->metadata_entries.logical_sector_size_entry = md_entry;
> +            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_eq(md_entry.item_id, phys_sector_guid)) {
> +            s->metadata_entries.phys_sector_size_entry = md_entry;
> +            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_eq(md_entry.item_id, parent_locator_guid)) {
> +            s->metadata_entries.parent_locator_entry = md_entry;
> +            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
> +            continue;
> +        }
> +
> +        if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
> +            /* cannot read vhdx file - required region table entry that
> +             * we do not understand.  per spec, we must fail to open */
> +            ret = -ENOTSUP;
> +            goto exit;
> +        }
> +    }
> +
> +    if (s->metadata_entries.present != META_ALL_PRESENT) {
> +        ret = -ENOTSUP;
> +        goto exit;
> +    }
> +
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.file_parameters_entry.offset
> +                                         + s->metadata_rt.file_offset,
> +                     &s->params,
> +                     sizeof(s->params));
> +
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +
> +    le32_to_cpus(&s->params.block_size);
> +    le32_to_cpus(&s->params.data_bits);
> +
> +
> +    /* We now have the file parameters, so we can tell if this is a
> +     * differencing file (i.e.. has_parent), is dynamic or fixed
> +     * sized (leave_blocks_allocated), and the block size */
> +
> +    /* The parent locator required iff the file parameters has_parent set */
> +    if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
> +        if (s->metadata_entries.present & ~META_PARENT_LOCATOR_PRESENT) {
> +            /* TODO: parse  parent locator fields */
> +            ret = -ENOTSUP; /* temp, until differencing files are supported */
> +            goto exit;
> +        } else {
> +            /* if has_parent is set, but there is not parent locator present,
> +             * then that is an invalid combination */
> +            ret = -EINVAL;
> +            goto exit;
> +        }
> +    }
> +
> +    /* determine virtual disk size, logical sector size,
> +     * and phys sector size */
> +
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.virtual_disk_size_entry.offset
> +                                           + s->metadata_rt.file_offset,
> +                     &s->virtual_disk_size,
> +                     sizeof(uint64_t));
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.logical_sector_size_entry.offset
> +                                             + s->metadata_rt.file_offset,
> +                     &s->logical_sector_size,
> +                     sizeof(uint32_t));
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.phys_sector_size_entry.offset
> +                                          + s->metadata_rt.file_offset,
> +                     &s->physical_sector_size,
> +                     sizeof(uint32_t));
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +
> +    le64_to_cpus(&s->virtual_disk_size);
> +    le32_to_cpus(&s->logical_sector_size);
> +    le32_to_cpus(&s->physical_sector_size);
> +
> +    if (s->logical_sector_size == 0 || s->params.block_size == 0) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    /* both block_size and sector_size are guaranteed powers of 2 */
> +    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
> +    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
> +                     (uint64_t)s->logical_sector_size /
> +                     (uint64_t)s->params.block_size;
> +
> +    /* These values are ones we will want to use for division / multiplication
> +     * later on, and they are all guaranteed (per the spec) to be powers of 2,
> +     * so we can take advantage of that for shift operations during
> +     * reads/writes */
> +    logical_sector_size = s->logical_sector_size;
> +    if (logical_sector_size & (logical_sector_size - 1)) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    sectors_per_block = s->sectors_per_block;
> +    if (sectors_per_block & (sectors_per_block - 1)) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +    chunk_ratio = s->chunk_ratio;
> +    if (chunk_ratio & (chunk_ratio - 1)) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +    block_size = s->params.block_size;
> +    if (block_size & (block_size - 1)) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    while (logical_sector_size >>= 1) {
> +        s->logical_sector_size_bits++;
> +    }
> +    while (sectors_per_block >>= 1) {
> +        s->sectors_per_block_bits++;
> +    }
> +    while (chunk_ratio >>= 1) {
> +        s->chunk_ratio_bits++;
> +    }
> +    while (block_size >>= 1) {
> +        s->block_size_bits++;
> +    }
> +
> +    if (s->logical_sector_size != BDRV_SECTOR_SIZE) {
> +        printf("VHDX error - QEMU only supports 512 byte sector sizes\n");
> +        ret = -ENOTSUP;
> +        goto exit;
> +    }
> +
> +    ret = 0;
> +
> +exit:
> +    qemu_vfree(buffer);
> +    return ret;
> +}
> +
> +/* Parse the replay log.  Per the VHDX spec, if the log is present
> + * it must be replayed prior to opening the file, even read-only.
> + *
> + * If read-only, we must replay the log in RAM (or refuse to open
> + * a dirty VHDX file read-only */
> +static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    int i;
> +    vhdx_header *hdr;
> +
> +    hdr = s->headers[s->curr_header];
> +
> +    /* either either the log guid, or log length is zero,
> +     * then a replay log is present */
> +    for (i = 0; i < sizeof(hdr->log_guid.data4); i++) {
> +        ret |= hdr->log_guid.data4[i];
> +    }
> +    if (hdr->log_guid.data1 == 0 &&
> +        hdr->log_guid.data2 == 0 &&
> +        hdr->log_guid.data3 == 0 &&
> +        ret == 0) {
> +        goto exit;
> +    }
> +
> +    /* per spec, only log version of 0 is supported */
> +    if (hdr->log_version != 0) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    if (hdr->log_length == 0) {
> +        goto exit;
> +    }
> +
> +    /* We currently do not support images with logs to replay */
> +    ret = -ENOTSUP;
> +
> +exit:
> +    return ret;
> +}
> +
> +
> +static int vhdx_open(BlockDriverState *bs, QDict *options, int flags)
> +{
> +    BDRVVHDXState *s = bs->opaque;
> +    int ret = 0;
> +    int i;
> +
> +    s->bat = NULL;
> +
> +    qemu_co_mutex_init(&s->lock);
> +
> +    ret = vhdx_parse_header(bs, s);
> +    if (ret) {
> +        goto fail;
> +    }
> +
> +    ret = vhdx_parse_log(bs, s);
> +    if (ret) {
> +        goto fail;
> +    }
> +
> +    ret = vhdx_open_region_tables(bs, s);
> +    if (ret) {
> +        goto fail;
> +    }
> +
> +    ret = vhdx_parse_metadata(bs, s);
> +    if (ret) {
> +        goto fail;
> +    }
> +    s->block_size = s->params.block_size;
> +
> +    /* the VHDX spec dictates that virtual_disk_size is always a multiple of
> +     * logical_sector_size */
> +    bs->total_sectors = s->virtual_disk_size / s->logical_sector_size;
> +
> +    s->bat_offset = s->bat_rt.file_offset;
> +    s->bat_entries = s->bat_rt.length / sizeof(vhdx_bat_entry);
> +    s->bat = qemu_blockalign(bs, s->bat_rt.length);
> +
> +    ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);

No error checking?

> +    for (i = 0; i < s->bat_entries; i++) {
> +        le64_to_cpus(&s->bat[i]);
> +    }
> +
> +    if (flags & BDRV_O_RDWR) {
> +        ret = -ENOTSUP;
> +        goto fail;
> +    }
> +
> +    /* TODO: differencing files, read, write */
> +
> +    return 0;
> +fail:
> +    qemu_vfree(s->bat);

This doesn't consider all the structs that were allocated by functions
called in vhdx_open(). Here the same things should be freed as in
vhdx_close().

> +    return ret;
> +}
> +
> +static int vhdx_reopen_prepare(BDRVReopenState *state,
> +                               BlockReopenQueue *queue, Error **errp)
> +{
> +    return 0;
> +}
> +
> +
> +static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
> +                                      int nb_sectors, QEMUIOVector *qiov)
> +{
> +    return -ENOTSUP;
> +}
> +
> +
> +
> +static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
> +                                      int nb_sectors, QEMUIOVector *qiov)
> +{
> +    return -ENOTSUP;
> +}
> +
> +
> +static void vhdx_close(BlockDriverState *bs)
> +{
> +    BDRVVHDXState *s = bs->opaque;
> +
> +    qemu_vfree(s->headers[0]);
> +    qemu_vfree(s->headers[1]);
> +    qemu_vfree(s->bat);
> +    qemu_vfree(s->parent_entries);
> +}
> +
> +static BlockDriver bdrv_vhdx = {
> +    .format_name    = "vhdx",
> +    .instance_size  = sizeof(BDRVVHDXState),

Use the same alignment for = as below?

> +    .bdrv_probe             = vhdx_probe,
> +    .bdrv_open              = vhdx_open,
> +    .bdrv_close             = vhdx_close,
> +    .bdrv_reopen_prepare    = vhdx_reopen_prepare,
> +    .bdrv_co_readv          = vhdx_co_readv,
> +    .bdrv_co_writev         = vhdx_co_writev,
> +};

Kevin
Jeff Cody - April 23, 2013, 4:11 p.m.
On Tue, Apr 23, 2013 at 05:46:28PM +0200, Kevin Wolf wrote:
> Am 23.04.2013 um 16:24 hat Jeff Cody geschrieben:
> > This is the initial block driver framework for VHDX image support (
> > i.e. Hyper-V image file formats), that supports opening VHDX files, and
> > parsing the headers.
> > 
> > This commit does not yet enable:
> >     - reading
> >     - writing
> >     - updating the header
> >     - differencing files (images with parents)
> >     - log replay / dirty logs (only clean images)
> > 
> > This is based on Microsoft's VHDX specification:
> >     "VHDX Format Specification v0.95", published 4/12/2012
> >     https://www.microsoft.com/en-us/download/details.aspx?id=29681
> > 
> > Signed-off-by: Jeff Cody <jcody@redhat.com>
> 
> Okay, I need more time actually reading the spec and comparing this code
> to it before I can really say anything about the logic (I hope I'll get
> to it tomorrow), but for a start, let me point out the obvious things.
> 
> > ---
> >  block/Makefile.objs |   1 +
> >  block/vhdx.c        | 769 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  block/vhdx.h        |  25 ++
> >  3 files changed, 795 insertions(+)
> >  create mode 100644 block/vhdx.c
> > 
> > diff --git a/block/Makefile.objs b/block/Makefile.objs
> > index 6c4b5bc..5f0358a 100644
> > --- a/block/Makefile.objs
> > +++ b/block/Makefile.objs
> > @@ -2,6 +2,7 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
> >  block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
> >  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
> >  block-obj-y += qed-check.o
> > +block-obj-y += vhdx.o
> >  block-obj-y += parallels.o blkdebug.o blkverify.o
> >  block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
> >  block-obj-$(CONFIG_POSIX) += raw-posix.o
> > diff --git a/block/vhdx.c b/block/vhdx.c
> > new file mode 100644
> > index 0000000..b0ea2ba
> > --- /dev/null
> > +++ b/block/vhdx.c
> > @@ -0,0 +1,769 @@
> > +/*
> > + * Block driver for Hyper-V VHDX Images
> > + *
> > + * Copyright (c) 2013 Red Hat, Inc.,
> > + *
> > + * Authors:
> > + *  Jeff Cody <jcody@redhat.com>
> > + *
> > + *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
> > + *  by Microsoft:
> > + *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
> > + *
> > + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> > + * See the COPYING.LIB file in the top-level directory.
> > + *
> > + */
> > +
> > +#include "qemu-common.h"
> > +#include "block/block_int.h"
> > +#include "qemu/module.h"
> > +#include "qemu/crc32c.h"
> > +#include "block/vhdx.h"
> > +
> > +
> > +/* Several metadata and region table data entries are identified by
> > + * guids in  a MS-specific GUID format. */
> > +
> > +
> > +/* ------- Known Region Table GUIDs ---------------------- */
> > +static const ms_guid bat_guid =      { .data1 = 0x2dc27766,
> > +                                       .data2 = 0xf623,
> > +                                       .data3 = 0x4200,
> > +                                       .data4 = { 0x9d, 0x64, 0x11, 0x5e,
> > +                                                  0x9b, 0xfd, 0x4a, 0x08} };
> > +
> > +static const ms_guid metadata_guid = { .data1 = 0x8b7ca206,
> > +                                       .data2 = 0x4790,
> > +                                       .data3 = 0x4b9a,
> > +                                       .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
> > +                                                  0x05, 0x0f, 0x88, 0x6e} };
> > +
> > +
> > +
> > +/* ------- Known Metadata Entry GUIDs ---------------------- */
> > +static const ms_guid file_param_guid =   { .data1 = 0xcaa16737,
> > +                                           .data2 = 0xfa36,
> > +                                           .data3 = 0x4d43,
> > +                                           .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
> > +                                                      0xaa, 0x44, 0xe7, 0x6b} };
> > +
> > +static const ms_guid virtual_size_guid = { .data1 = 0x2FA54224,
> > +                                           .data2 = 0xcd1b,
> > +                                           .data3 = 0x4876,
> > +                                           .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
> > +                                                      0xd8, 0x3b, 0xf4, 0xb8} };
> > +
> > +static const ms_guid page83_guid =       { .data1 = 0xbeca12ab,
> > +                                           .data2 = 0xb2e6,
> > +                                           .data3 = 0x4523,
> > +                                           .data4 = { 0x93, 0xef, 0xc3, 0x09,
> > +                                                      0xe0, 0x00, 0xc7, 0x46} };
> > +
> > +
> > +static const ms_guid phys_sector_guid =  { .data1 = 0xcda348c7,
> > +                                           .data2 = 0x445d,
> > +                                           .data3 = 0x4471,
> > +                                           .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
> > +                                                      0x52, 0x51, 0xc5, 0x56} };
> > +
> > +static const ms_guid parent_locator_guid = { .data1 = 0xa8d35f2d,
> > +                                             .data2 = 0xb30b,
> > +                                             .data3 = 0x454d,
> > +                                             .data4 = { 0xab, 0xf7, 0xd3,
> > +                                                        0xd8, 0x48, 0x34,
> > +                                                        0xab, 0x0c} };
> > +
> > +static const ms_guid logical_sector_guid = { .data1 = 0x8141bf1d,
> > +                                             .data2 = 0xa96f,
> > +                                             .data3 = 0x4709,
> > +                                             .data4 = { 0xba, 0x47, 0xf2,
> > +                                                        0x33, 0xa8, 0xfa,
> > +                                                        0xab, 0x5f} };
> > +
> > +/* Each parent type must have a valid GUID; this is for parent images
> > + * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
> > + * need to make up our own QCOW2 GUID type */
> > +static const ms_guid parent_vhdx_guid = { .data1 = 0xb04aefb7,
> > +                                          .data2 = 0xd19e,
> > +                                          .data3 = 0x4a81,
> > +                                          .data4 = { 0xb7, 0x89, 0x25, 0xb8,
> > +                                                     0xe9, 0x44, 0x59, 0x13} };
> > +
> > +
> > +#define META_FILE_PARAMETER_PRESENT      0x01
> > +#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
> > +#define META_PAGE_83_PRESENT             0x04
> > +#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
> > +#define META_PHYS_SECTOR_SIZE_PRESENT    0x10
> > +#define META_PARENT_LOCATOR_PRESENT      0x20
> > +
> > +#define META_ALL_PRESENT    \
> > +    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
> > +     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
> > +     META_PHYS_SECTOR_SIZE_PRESENT)
> > +
> > +typedef struct vhdx_metadata_entries {
> > +    vhdx_metadata_table_entry file_parameters_entry;
> > +    vhdx_metadata_table_entry virtual_disk_size_entry;
> > +    vhdx_metadata_table_entry page83_data_entry;
> > +    vhdx_metadata_table_entry logical_sector_size_entry;
> > +    vhdx_metadata_table_entry phys_sector_size_entry;
> > +    vhdx_metadata_table_entry parent_locator_entry;
> > +    uint16_t present;
> > +} vhdx_metadata_entries;
> 
> Should everything before this line actually be in the header?
> 
> > +
> > +typedef struct BDRVVHDXState {
> > +    CoMutex lock;
> > +
> > +    int curr_header;
> > +    vhdx_header *headers[2];
> > +
> > +    vhdx_region_table_header rt;
> > +    vhdx_region_table_entry bat_rt;         /* region table for the BAT */
> > +    vhdx_region_table_entry metadata_rt;    /* region table for the metadata */
> > +
> > +    vhdx_metadata_table_header  metadata_hdr;
> > +    vhdx_metadata_entries metadata_entries;
> > +
> > +    vhdx_file_parameters params;
> > +    uint32_t block_size;
> > +    uint32_t block_size_bits;
> > +    uint32_t sectors_per_block;
> > +    uint32_t sectors_per_block_bits;
> > +
> > +    uint64_t virtual_disk_size;
> > +    uint32_t logical_sector_size;
> > +    uint32_t physical_sector_size;
> > +
> > +    uint64_t chunk_ratio;
> > +    uint32_t chunk_ratio_bits;
> > +    uint32_t logical_sector_size_bits;
> > +
> > +    uint32_t bat_entries;
> > +    vhdx_bat_entry *bat;
> > +    uint64_t bat_offset;
> > +
> > +    vhdx_parent_locator_header parent_header;
> > +    vhdx_parent_locator_entry *parent_entries;
> > +
> > +} BDRVVHDXState;
> > +
> > +uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
> > +                            int crc_offset)
> > +{
> > +    uint32_t crc_new;
> > +    uint32_t crc_orig;
> > +    assert(buf != NULL);
> > +
> > +    if (crc_offset > 0) {
> > +        memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
> 
> Please add spaces before and after operators. I saw this a lot in this
> series. I'm relatively sure that you can use checkpatch.pl to get all
> instances pointed out.
>
Sure, I will seek those out and change them.  BTW, I ran it
checkpatch.pl, and it apparently missed the one above & below.

> > +        memset(buf+crc_offset, 0, sizeof(crc_orig));
> > +    }
> > +
> > +    crc_new = crc32c(crc, buf, size);
> > +    if (crc_offset > 0) {
> > +        memcpy(buf+crc_offset, &crc_orig, sizeof(crc_orig));
> > +    }
> > +
> > +    return crc_new;
> > +}
> > +
> > +/* Validates the checksum of the buffer, with an in-place CRC.
> > + *
> > + * Zero is substituted during crc calculation for the original crc field,
> > + * and the crc field is restored afterwards.  But the buffer will be modifed
> > + * during the calculation, so this may not be not suitable for multi-threaded
> > + * use.
> > + *
> > + * crc_offset: byte offset in buf of the buffer crc
> > + * buf: buffer pointer
> > + * size: size of buffer (must be > crc_offset+4)
> > + *
> > + * returns true if checksum is valid, false otherwise
> > + */
> > +bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
> > +{
> > +    uint32_t crc_orig;
> > +    uint32_t crc;
> > +
> > +    assert(buf != NULL);
> > +    assert(size > (crc_offset+4));
> > +
> > +    memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
> > +    crc_orig = le32_to_cpu(crc_orig);
> > +
> > +    crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);
> > +
> > +    return crc == crc_orig;
> > +}
> > +
> > +
> > +/*
> > + * Per the MS VHDX Specification, for every VHDX file:
> > + *      - The header section is fixed size - 1 MB
> > + *      - The header section is always the first "object"
> > + *      - The first 64KB of the header is the File Identifier
> > + *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
> > + *      - The following 512 bytes constitute a UTF-16 string identifiying the
> > + *        software that created the file, and is optional and diagnostic only.
> > + *
> > + *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
> > + */
> > +static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
> > +{
> > +    if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) {
> 
> There's a VHDX_FILE_ID_MAGIC constant in the header. Don't you want to
> use it?
> 

Maybe I should remove all the magics from the header.  I could use it,
but I think this is more readable and perhaps easier to follow against
the spec.  For the upcoming log patches, there are other magic items
that are compared against strings rather than the magic defines as
well.

If you prefer I compared against the _MAGIC defines in the header, I
have no problem with that.

> > +        return 100;
> > +    }
> > +    return 0;
> > +}
> > +
> > +/* All VHDX structures on disk are little endian */
> > +static void vhdx_header_le_import(vhdx_header *h)
> > +{
> > +    assert(h != NULL);
> > +
> > +    le32_to_cpus(&h->signature);
> > +    le32_to_cpus(&h->checksum);
> > +    le64_to_cpus(&h->sequence_number);
> > +
> > +    leguid_to_cpus(&h->file_write_guid);
> > +    leguid_to_cpus(&h->data_write_guid);
> > +    leguid_to_cpus(&h->log_guid);
> > +
> > +    le16_to_cpus(&h->log_version);
> > +    le16_to_cpus(&h->version);
> > +    le32_to_cpus(&h->log_length);
> > +    le64_to_cpus(&h->log_offset);
> > +}
> > +
> > +
> > +/* opens the specified header block from the VHDX file header section */
> > +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    vhdx_header *header1;
> > +    vhdx_header *header2;
> > +    uint64_t h1_seq = 0;
> > +    uint64_t h2_seq = 0;
> > +    uint8_t *buffer;
> > +
> > +    header1 = qemu_blockalign(bs, sizeof(vhdx_header));
> > +    header2 = qemu_blockalign(bs, sizeof(vhdx_header));
> > +
> > +    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
> > +
> > +    s->headers[0] = header1;
> > +    s->headers[1] = header2;
> > +
> > +    /* We have to read the whole VHDX_HEADER_SIZE instead of
> > +     * sizeof(vhdx_header), because the checksum is over the whole
> > +     * region */
> > +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    /* copy over just the relevant portion that we need */
> > +    memcpy(header1, buffer, sizeof(vhdx_header));
> > +    vhdx_header_le_import(header1);
> > +
> > +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> > +        header1->signature == VHDX_HDR_MAGIC) {
> > +        h1_seq = header1->sequence_number;
> > +    }
> > +
> > +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    /* copy over just the relevant portion that we need */
> > +    memcpy(header2, buffer, sizeof(vhdx_header));
> > +    vhdx_header_le_import(header2);
> > +
> > +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> > +        header2->signature == VHDX_HDR_MAGIC) {
> > +        h2_seq = header2->sequence_number;
> > +    }
> > +
> > +    if (h1_seq > h2_seq) {
> > +        s->curr_header = 0;
> > +    } else if (h2_seq > h1_seq) {
> > +        s->curr_header = 1;
> > +    } else {
> > +        printf("NO VALID HEADER\n");
> 
> Make that an qerror_report() at least.
>

OK

> > +        ret = -EINVAL;
> > +        goto fail;
> > +    }
> > +
> > +    ret = 0;
> > +
> > +    goto exit;
> > +
> > +fail:
> > +    qemu_vfree(header1);
> > +    qemu_vfree(header2);
> > +    s->headers[0] = NULL;
> > +    s->headers[1] = NULL;
> > +exit:
> > +    qemu_vfree(buffer);
> > +    return ret;
> > +}
> > +
> > +
> > +static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    uint8_t *buffer;
> > +    int offset = 0;
> > +    vhdx_region_table_entry rt_entry;
> > +    int i;
> > +
> > +    /* We have to read the whole 64KB block, because the crc32 is over the
> > +     * whole block */
> > +    buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);
> > +
> > +    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
> > +                    VHDX_HEADER_BLOCK_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    memcpy(&s->rt, buffer, sizeof(s->rt));
> > +    le32_to_cpus(&s->rt.signature);
> > +    le32_to_cpus(&s->rt.checksum);
> > +    le32_to_cpus(&s->rt.entry_count);
> > +    le32_to_cpus(&s->rt.reserved);
> > +    offset += sizeof(s->rt);
> > +
> > +    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
> > +        s->rt.signature != VHDX_RT_MAGIC) {
> > +        ret = -EINVAL;
> > +        goto fail;
> > +    }
> > +
> > +    for (i = 0; i < s->rt.entry_count; i++) {
> > +        memcpy(&rt_entry, buffer+offset, sizeof(rt_entry));
> > +        offset += sizeof(rt_entry);
> > +
> > +        leguid_to_cpus(&rt_entry.guid);
> > +        le64_to_cpus(&rt_entry.file_offset);
> > +        le32_to_cpus(&rt_entry.length);
> > +        le32_to_cpus(&rt_entry.data_bits);
> > +
> > +        /* see if we recognize the entry */
> > +        if (guid_eq(rt_entry.guid, bat_guid)) {
> > +            s->bat_rt = rt_entry;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(rt_entry.guid, metadata_guid)) {
> > +            s->metadata_rt = rt_entry;
> > +            continue;
> > +        }
> > +
> > +        if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
> > +            /* cannot read vhdx file - required region table entry that
> > +             * we do not understand.  per spec, we must fail to open */
> > +            ret = -ENOTSUP;
> > +            goto fail;
> > +        }
> > +    }
> > +    ret = 0;
> > +
> > +fail:
> > +    qemu_vfree(buffer);
> > +    return ret;
> > +}
> > +
> > +
> > +
> > +/* Metadata initial parser
> > + *
> > + * This loads all the metadata entry fields.  This may cause additional
> > + * fields to be processed (e.g. parent locator, etc..).
> > + *
> > + * There are 5 Metadata items that are always required:
> > + *      - File Parameters (block size, has a parent)
> > + *      - Virtual Disk Size (size, in bytes, of the virtual drive)
> > + *      - Page 83 Data (scsi page 83 guid)
> > + *      - Logical Sector Size (logical sector size in bytes, either 512 or
> > + *                             4096.  We only support 512 currently)
> > + *      - Physical Sector Size (512 or 4096)
> > + *
> > + * Also, if the File Parameters indicate this is a differencing file,
> > + * we must also look for the Parent Locator metadata item.
> > + */
> > +static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    uint8_t *buffer;
> > +    int offset = 0;
> > +    int i = 0;
> > +    uint32_t block_size, sectors_per_block, logical_sector_size;
> > +    uint64_t chunk_ratio;
> > +    vhdx_metadata_table_entry md_entry;
> > +
> > +    buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
> > +
> > +    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
> > +                     VHDX_METADATA_TABLE_MAX_SIZE);
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +    memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
> > +    offset += sizeof(s->metadata_hdr);
> > +
> > +    le64_to_cpus(&s->metadata_hdr.signature);
> > +    le16_to_cpus(&s->metadata_hdr.reserved);
> > +    le16_to_cpus(&s->metadata_hdr.entry_count);
> > +
> > +    if (s->metadata_hdr.signature != VHDX_METADATA_MAGIC) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    s->metadata_entries.present = 0;
> > +
> > +    if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
> > +        (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
> > +        memcpy(&md_entry, buffer+offset, sizeof(md_entry));
> > +        offset += sizeof(md_entry);
> > +
> > +        leguid_to_cpus(&md_entry.item_id);
> > +        le32_to_cpus(&md_entry.offset);
> > +        le32_to_cpus(&md_entry.length);
> > +        le32_to_cpus(&md_entry.data_bits);
> > +        le32_to_cpus(&md_entry.reserved2);
> > +
> > +        if (guid_eq(md_entry.item_id, file_param_guid)) {
> > +            s->metadata_entries.file_parameters_entry = md_entry;
> > +            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(md_entry.item_id, virtual_size_guid)) {
> > +            s->metadata_entries.virtual_disk_size_entry = md_entry;
> > +            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(md_entry.item_id, page83_guid)) {
> > +            s->metadata_entries.page83_data_entry = md_entry;
> > +            s->metadata_entries.present |= META_PAGE_83_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(md_entry.item_id, logical_sector_guid)) {
> > +            s->metadata_entries.logical_sector_size_entry = md_entry;
> > +            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(md_entry.item_id, phys_sector_guid)) {
> > +            s->metadata_entries.phys_sector_size_entry = md_entry;
> > +            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(md_entry.item_id, parent_locator_guid)) {
> > +            s->metadata_entries.parent_locator_entry = md_entry;
> > +            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
> > +            /* cannot read vhdx file - required region table entry that
> > +             * we do not understand.  per spec, we must fail to open */
> > +            ret = -ENOTSUP;
> > +            goto exit;
> > +        }
> > +    }
> > +
> > +    if (s->metadata_entries.present != META_ALL_PRESENT) {
> > +        ret = -ENOTSUP;
> > +        goto exit;
> > +    }
> > +
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.file_parameters_entry.offset
> > +                                         + s->metadata_rt.file_offset,
> > +                     &s->params,
> > +                     sizeof(s->params));
> > +
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +
> > +    le32_to_cpus(&s->params.block_size);
> > +    le32_to_cpus(&s->params.data_bits);
> > +
> > +
> > +    /* We now have the file parameters, so we can tell if this is a
> > +     * differencing file (i.e.. has_parent), is dynamic or fixed
> > +     * sized (leave_blocks_allocated), and the block size */
> > +
> > +    /* The parent locator required iff the file parameters has_parent set */
> > +    if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
> > +        if (s->metadata_entries.present & ~META_PARENT_LOCATOR_PRESENT) {
> > +            /* TODO: parse  parent locator fields */
> > +            ret = -ENOTSUP; /* temp, until differencing files are supported */
> > +            goto exit;
> > +        } else {
> > +            /* if has_parent is set, but there is not parent locator present,
> > +             * then that is an invalid combination */
> > +            ret = -EINVAL;
> > +            goto exit;
> > +        }
> > +    }
> > +
> > +    /* determine virtual disk size, logical sector size,
> > +     * and phys sector size */
> > +
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.virtual_disk_size_entry.offset
> > +                                           + s->metadata_rt.file_offset,
> > +                     &s->virtual_disk_size,
> > +                     sizeof(uint64_t));
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.logical_sector_size_entry.offset
> > +                                             + s->metadata_rt.file_offset,
> > +                     &s->logical_sector_size,
> > +                     sizeof(uint32_t));
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.phys_sector_size_entry.offset
> > +                                          + s->metadata_rt.file_offset,
> > +                     &s->physical_sector_size,
> > +                     sizeof(uint32_t));
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +
> > +    le64_to_cpus(&s->virtual_disk_size);
> > +    le32_to_cpus(&s->logical_sector_size);
> > +    le32_to_cpus(&s->physical_sector_size);
> > +
> > +    if (s->logical_sector_size == 0 || s->params.block_size == 0) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    /* both block_size and sector_size are guaranteed powers of 2 */
> > +    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
> > +    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
> > +                     (uint64_t)s->logical_sector_size /
> > +                     (uint64_t)s->params.block_size;
> > +
> > +    /* These values are ones we will want to use for division / multiplication
> > +     * later on, and they are all guaranteed (per the spec) to be powers of 2,
> > +     * so we can take advantage of that for shift operations during
> > +     * reads/writes */
> > +    logical_sector_size = s->logical_sector_size;
> > +    if (logical_sector_size & (logical_sector_size - 1)) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    sectors_per_block = s->sectors_per_block;
> > +    if (sectors_per_block & (sectors_per_block - 1)) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +    chunk_ratio = s->chunk_ratio;
> > +    if (chunk_ratio & (chunk_ratio - 1)) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +    block_size = s->params.block_size;
> > +    if (block_size & (block_size - 1)) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    while (logical_sector_size >>= 1) {
> > +        s->logical_sector_size_bits++;
> > +    }
> > +    while (sectors_per_block >>= 1) {
> > +        s->sectors_per_block_bits++;
> > +    }
> > +    while (chunk_ratio >>= 1) {
> > +        s->chunk_ratio_bits++;
> > +    }
> > +    while (block_size >>= 1) {
> > +        s->block_size_bits++;
> > +    }
> > +
> > +    if (s->logical_sector_size != BDRV_SECTOR_SIZE) {
> > +        printf("VHDX error - QEMU only supports 512 byte sector sizes\n");
> > +        ret = -ENOTSUP;
> > +        goto exit;
> > +    }
> > +
> > +    ret = 0;
> > +
> > +exit:
> > +    qemu_vfree(buffer);
> > +    return ret;
> > +}
> > +
> > +/* Parse the replay log.  Per the VHDX spec, if the log is present
> > + * it must be replayed prior to opening the file, even read-only.
> > + *
> > + * If read-only, we must replay the log in RAM (or refuse to open
> > + * a dirty VHDX file read-only */
> > +static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    int i;
> > +    vhdx_header *hdr;
> > +
> > +    hdr = s->headers[s->curr_header];
> > +
> > +    /* either either the log guid, or log length is zero,
> > +     * then a replay log is present */
> > +    for (i = 0; i < sizeof(hdr->log_guid.data4); i++) {
> > +        ret |= hdr->log_guid.data4[i];
> > +    }
> > +    if (hdr->log_guid.data1 == 0 &&
> > +        hdr->log_guid.data2 == 0 &&
> > +        hdr->log_guid.data3 == 0 &&
> > +        ret == 0) {
> > +        goto exit;
> > +    }
> > +
> > +    /* per spec, only log version of 0 is supported */
> > +    if (hdr->log_version != 0) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    if (hdr->log_length == 0) {
> > +        goto exit;
> > +    }
> > +
> > +    /* We currently do not support images with logs to replay */
> > +    ret = -ENOTSUP;
> > +
> > +exit:
> > +    return ret;
> > +}
> > +
> > +
> > +static int vhdx_open(BlockDriverState *bs, QDict *options, int flags)
> > +{
> > +    BDRVVHDXState *s = bs->opaque;
> > +    int ret = 0;
> > +    int i;
> > +
> > +    s->bat = NULL;
> > +
> > +    qemu_co_mutex_init(&s->lock);
> > +
> > +    ret = vhdx_parse_header(bs, s);
> > +    if (ret) {
> > +        goto fail;
> > +    }
> > +
> > +    ret = vhdx_parse_log(bs, s);
> > +    if (ret) {
> > +        goto fail;
> > +    }
> > +
> > +    ret = vhdx_open_region_tables(bs, s);
> > +    if (ret) {
> > +        goto fail;
> > +    }
> > +
> > +    ret = vhdx_parse_metadata(bs, s);
> > +    if (ret) {
> > +        goto fail;
> > +    }
> > +    s->block_size = s->params.block_size;
> > +
> > +    /* the VHDX spec dictates that virtual_disk_size is always a multiple of
> > +     * logical_sector_size */
> > +    bs->total_sectors = s->virtual_disk_size / s->logical_sector_size;
> > +
> > +    s->bat_offset = s->bat_rt.file_offset;
> > +    s->bat_entries = s->bat_rt.length / sizeof(vhdx_bat_entry);
> > +    s->bat = qemu_blockalign(bs, s->bat_rt.length);
> > +
> > +    ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
> 
> No error checking?
> 

Thanks, missed that, adding it in.

> > +    for (i = 0; i < s->bat_entries; i++) {
> > +        le64_to_cpus(&s->bat[i]);
> > +    }
> > +
> > +    if (flags & BDRV_O_RDWR) {
> > +        ret = -ENOTSUP;
> > +        goto fail;
> > +    }
> > +
> > +    /* TODO: differencing files, read, write */
> > +
> > +    return 0;
> > +fail:
> > +    qemu_vfree(s->bat);
> 
> This doesn't consider all the structs that were allocated by functions
> called in vhdx_open(). Here the same things should be freed as in
> vhdx_close().
> 

OK.  Those functions clean up after themselves, but you are right, if
a failure happens later they should be cleaned up here as well.

> > +    return ret;
> > +}
> > +
> > +static int vhdx_reopen_prepare(BDRVReopenState *state,
> > +                               BlockReopenQueue *queue, Error **errp)
> > +{
> > +    return 0;
> > +}
> > +
> > +
> > +static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
> > +                                      int nb_sectors, QEMUIOVector *qiov)
> > +{
> > +    return -ENOTSUP;
> > +}
> > +
> > +
> > +
> > +static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
> > +                                      int nb_sectors, QEMUIOVector *qiov)
> > +{
> > +    return -ENOTSUP;
> > +}
> > +
> > +
> > +static void vhdx_close(BlockDriverState *bs)
> > +{
> > +    BDRVVHDXState *s = bs->opaque;
> > +
> > +    qemu_vfree(s->headers[0]);
> > +    qemu_vfree(s->headers[1]);
> > +    qemu_vfree(s->bat);
> > +    qemu_vfree(s->parent_entries);
> > +}
> > +
> > +static BlockDriver bdrv_vhdx = {
> > +    .format_name    = "vhdx",
> > +    .instance_size  = sizeof(BDRVVHDXState),
> 
> Use the same alignment for = as below?
> 

OK

> > +    .bdrv_probe             = vhdx_probe,
> > +    .bdrv_open              = vhdx_open,
> > +    .bdrv_close             = vhdx_close,
> > +    .bdrv_reopen_prepare    = vhdx_reopen_prepare,
> > +    .bdrv_co_readv          = vhdx_co_readv,
> > +    .bdrv_co_writev         = vhdx_co_writev,
> > +};
> 
> Kevin
Kevin Wolf - April 23, 2013, 4:18 p.m.
Am 23.04.2013 um 18:11 hat Jeff Cody geschrieben:
> On Tue, Apr 23, 2013 at 05:46:28PM +0200, Kevin Wolf wrote:
> > Am 23.04.2013 um 16:24 hat Jeff Cody geschrieben:
> > > +/*
> > > + * Per the MS VHDX Specification, for every VHDX file:
> > > + *      - The header section is fixed size - 1 MB
> > > + *      - The header section is always the first "object"
> > > + *      - The first 64KB of the header is the File Identifier
> > > + *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
> > > + *      - The following 512 bytes constitute a UTF-16 string identifiying the
> > > + *        software that created the file, and is optional and diagnostic only.
> > > + *
> > > + *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
> > > + */
> > > +static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
> > > +{
> > > +    if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) {
> > 
> > There's a VHDX_FILE_ID_MAGIC constant in the header. Don't you want to
> > use it?
> > 
> 
> Maybe I should remove all the magics from the header.  I could use it,
> but I think this is more readable and perhaps easier to follow against
> the spec.  For the upcoming log patches, there are other magic items
> that are compared against strings rather than the magic defines as
> well.
> 
> If you prefer I compared against the _MAGIC defines in the header, I
> have no problem with that.

I don't mind using the strings, but then I think removing the numeric
magic from the header is the right thing indeed.

> > > +    for (i = 0; i < s->bat_entries; i++) {
> > > +        le64_to_cpus(&s->bat[i]);
> > > +    }
> > > +
> > > +    if (flags & BDRV_O_RDWR) {
> > > +        ret = -ENOTSUP;
> > > +        goto fail;
> > > +    }
> > > +
> > > +    /* TODO: differencing files, read, write */
> > > +
> > > +    return 0;
> > > +fail:
> > > +    qemu_vfree(s->bat);
> > 
> > This doesn't consider all the structs that were allocated by functions
> > called in vhdx_open(). Here the same things should be freed as in
> > vhdx_close().
> > 
> 
> OK.  Those functions clean up after themselves, but you are right, if
> a failure happens later they should be cleaned up here as well.

Yes, you can probably get away with not freeing whatever the last
function call could have allocated if it cleans up after itself, but
doing the full thing here is more obviously correct. (You may have to
move the cleanup here instead of duplicating it, or make the functions'
cleanup reset the pointers to NULL)

Kevin
Stefan Hajnoczi - April 24, 2013, 1:21 p.m.
On Tue, Apr 23, 2013 at 10:24:22AM -0400, Jeff Cody wrote:
> +    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
> +        s->rt.signature != VHDX_RT_MAGIC) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
> +    for (i = 0; i < s->rt.entry_count; i++) {

It's nice to avoid signed/unsigned comparisons.  i should be uint32_t
just like entry_count.

> +        memcpy(&rt_entry, buffer+offset, sizeof(rt_entry));
> +        offset += sizeof(rt_entry);

Looks like we're trusting rt.entry_count to be a sane value?  Need to
prevent offset from exceeding buffer size.

> +    while (logical_sector_size >>= 1) {
> +        s->logical_sector_size_bits++;
> +    }
> +    while (sectors_per_block >>= 1) {
> +        s->sectors_per_block_bits++;
> +    }
> +    while (chunk_ratio >>= 1) {
> +        s->chunk_ratio_bits++;
> +    }
> +    while (block_size >>= 1) {
> +        s->block_size_bits++;
> +    }

ctz()/clo() do this.

> +static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    int i;
> +    vhdx_header *hdr;
> +
> +    hdr = s->headers[s->curr_header];
> +
> +    /* either either the log guid, or log length is zero,

either either

> +    s->bat_offset = s->bat_rt.file_offset;
> +    s->bat_entries = s->bat_rt.length / sizeof(vhdx_bat_entry);
> +    s->bat = qemu_blockalign(bs, s->bat_rt.length);

No sanity check was done on bat_rt.length.  If this allocation fails
QEMU will exit.  Could be used as a DoS if you can get someone to attach
a malicious VHDX to their VM?
Jeff Cody - April 24, 2013, 1:40 p.m.
On Wed, Apr 24, 2013 at 03:21:10PM +0200, Stefan Hajnoczi wrote:
> On Tue, Apr 23, 2013 at 10:24:22AM -0400, Jeff Cody wrote:
> > +    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
> > +        s->rt.signature != VHDX_RT_MAGIC) {
> > +        ret = -EINVAL;
> > +        goto fail;
> > +    }
> > +
> > +    for (i = 0; i < s->rt.entry_count; i++) {
> 
> It's nice to avoid signed/unsigned comparisons.  i should be uint32_t
> just like entry_count.

I agree.  I will also double check the other parsing routines (e.g.
vhdx_parse_metadata()).

> 
> > +        memcpy(&rt_entry, buffer+offset, sizeof(rt_entry));
> > +        offset += sizeof(rt_entry);
> 
> Looks like we're trusting rt.entry_count to be a sane value?  Need to
> prevent offset from exceeding buffer size.
> 

Agree again, and I will also check the other parsers as well.


> > +    while (logical_sector_size >>= 1) {
> > +        s->logical_sector_size_bits++;
> > +    }
> > +    while (sectors_per_block >>= 1) {
> > +        s->sectors_per_block_bits++;
> > +    }
> > +    while (chunk_ratio >>= 1) {
> > +        s->chunk_ratio_bits++;
> > +    }
> > +    while (block_size >>= 1) {
> > +        s->block_size_bits++;
> > +    }
> 
> ctz()/clo() do this.
>

Ah, yes! I will switch over to using those.

> > +static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    int i;
> > +    vhdx_header *hdr;
> > +
> > +    hdr = s->headers[s->curr_header];
> > +
> > +    /* either either the log guid, or log length is zero,
> 
> either either
> 

Thanks

> > +    s->bat_offset = s->bat_rt.file_offset;
> > +    s->bat_entries = s->bat_rt.length / sizeof(vhdx_bat_entry);
> > +    s->bat = qemu_blockalign(bs, s->bat_rt.length);
> 
> No sanity check was done on bat_rt.length.  If this allocation fails
> QEMU will exit.  Could be used as a DoS if you can get someone to attach
> a malicious VHDX to their VM?

Yes, bat_rt.length needs to be verified here as well.  I will add that
in.
Kevin Wolf - April 25, 2013, 1:04 p.m.
Am 23.04.2013 um 16:24 hat Jeff Cody geschrieben:
> This is the initial block driver framework for VHDX image support (
> i.e. Hyper-V image file formats), that supports opening VHDX files, and
> parsing the headers.
> 
> This commit does not yet enable:
>     - reading
>     - writing
>     - updating the header
>     - differencing files (images with parents)
>     - log replay / dirty logs (only clean images)
> 
> This is based on Microsoft's VHDX specification:
>     "VHDX Format Specification v0.95", published 4/12/2012
>     https://www.microsoft.com/en-us/download/details.aspx?id=29681
> 
> Signed-off-by: Jeff Cody <jcody@redhat.com>

Okay, now I'm starting to actually compare the spec to your
implementation. Comments inline.

>  block/Makefile.objs |   1 +
>  block/vhdx.c        | 769 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/vhdx.h        |  25 ++
>  3 files changed, 795 insertions(+)
>  create mode 100644 block/vhdx.c
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 6c4b5bc..5f0358a 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -2,6 +2,7 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
>  block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-obj-y += qed-check.o
> +block-obj-y += vhdx.o
>  block-obj-y += parallels.o blkdebug.o blkverify.o
>  block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
>  block-obj-$(CONFIG_POSIX) += raw-posix.o
> diff --git a/block/vhdx.c b/block/vhdx.c
> new file mode 100644
> index 0000000..b0ea2ba
> --- /dev/null
> +++ b/block/vhdx.c
> @@ -0,0 +1,769 @@
> +/*
> + * Block driver for Hyper-V VHDX Images
> + *
> + * Copyright (c) 2013 Red Hat, Inc.,
> + *
> + * Authors:
> + *  Jeff Cody <jcody@redhat.com>
> + *
> + *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
> + *  by Microsoft:
> + *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "block/block_int.h"
> +#include "qemu/module.h"
> +#include "qemu/crc32c.h"
> +#include "block/vhdx.h"
> +
> +
> +/* Several metadata and region table data entries are identified by
> + * guids in  a MS-specific GUID format. */
> +
> +
> +/* ------- Known Region Table GUIDs ---------------------- */
> +static const ms_guid bat_guid =      { .data1 = 0x2dc27766,
> +                                       .data2 = 0xf623,
> +                                       .data3 = 0x4200,
> +                                       .data4 = { 0x9d, 0x64, 0x11, 0x5e,
> +                                                  0x9b, 0xfd, 0x4a, 0x08} };
> +
> +static const ms_guid metadata_guid = { .data1 = 0x8b7ca206,
> +                                       .data2 = 0x4790,
> +                                       .data3 = 0x4b9a,
> +                                       .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
> +                                                  0x05, 0x0f, 0x88, 0x6e} };
> +
> +
> +
> +/* ------- Known Metadata Entry GUIDs ---------------------- */
> +static const ms_guid file_param_guid =   { .data1 = 0xcaa16737,
> +                                           .data2 = 0xfa36,
> +                                           .data3 = 0x4d43,
> +                                           .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
> +                                                      0xaa, 0x44, 0xe7, 0x6b} };
> +
> +static const ms_guid virtual_size_guid = { .data1 = 0x2FA54224,
> +                                           .data2 = 0xcd1b,
> +                                           .data3 = 0x4876,
> +                                           .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
> +                                                      0xd8, 0x3b, 0xf4, 0xb8} };
> +
> +static const ms_guid page83_guid =       { .data1 = 0xbeca12ab,
> +                                           .data2 = 0xb2e6,
> +                                           .data3 = 0x4523,
> +                                           .data4 = { 0x93, 0xef, 0xc3, 0x09,
> +                                                      0xe0, 0x00, 0xc7, 0x46} };
> +
> +
> +static const ms_guid phys_sector_guid =  { .data1 = 0xcda348c7,
> +                                           .data2 = 0x445d,
> +                                           .data3 = 0x4471,
> +                                           .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
> +                                                      0x52, 0x51, 0xc5, 0x56} };
> +
> +static const ms_guid parent_locator_guid = { .data1 = 0xa8d35f2d,
> +                                             .data2 = 0xb30b,
> +                                             .data3 = 0x454d,
> +                                             .data4 = { 0xab, 0xf7, 0xd3,
> +                                                        0xd8, 0x48, 0x34,
> +                                                        0xab, 0x0c} };
> +
> +static const ms_guid logical_sector_guid = { .data1 = 0x8141bf1d,
> +                                             .data2 = 0xa96f,
> +                                             .data3 = 0x4709,
> +                                             .data4 = { 0xba, 0x47, 0xf2,
> +                                                        0x33, 0xa8, 0xfa,
> +                                                        0xab, 0x5f} };
> +
> +/* Each parent type must have a valid GUID; this is for parent images
> + * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
> + * need to make up our own QCOW2 GUID type */
> +static const ms_guid parent_vhdx_guid = { .data1 = 0xb04aefb7,
> +                                          .data2 = 0xd19e,
> +                                          .data3 = 0x4a81,
> +                                          .data4 = { 0xb7, 0x89, 0x25, 0xb8,
> +                                                     0xe9, 0x44, 0x59, 0x13} };
> +
> +
> +#define META_FILE_PARAMETER_PRESENT      0x01
> +#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
> +#define META_PAGE_83_PRESENT             0x04
> +#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
> +#define META_PHYS_SECTOR_SIZE_PRESENT    0x10
> +#define META_PARENT_LOCATOR_PRESENT      0x20
> +
> +#define META_ALL_PRESENT    \
> +    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
> +     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
> +     META_PHYS_SECTOR_SIZE_PRESENT)
> +
> +typedef struct vhdx_metadata_entries {
> +    vhdx_metadata_table_entry file_parameters_entry;
> +    vhdx_metadata_table_entry virtual_disk_size_entry;
> +    vhdx_metadata_table_entry page83_data_entry;
> +    vhdx_metadata_table_entry logical_sector_size_entry;
> +    vhdx_metadata_table_entry phys_sector_size_entry;
> +    vhdx_metadata_table_entry parent_locator_entry;
> +    uint16_t present;
> +} vhdx_metadata_entries;
> +
> +
> +typedef struct BDRVVHDXState {
> +    CoMutex lock;
> +
> +    int curr_header;
> +    vhdx_header *headers[2];
> +
> +    vhdx_region_table_header rt;
> +    vhdx_region_table_entry bat_rt;         /* region table for the BAT */
> +    vhdx_region_table_entry metadata_rt;    /* region table for the metadata */
> +
> +    vhdx_metadata_table_header  metadata_hdr;
> +    vhdx_metadata_entries metadata_entries;
> +
> +    vhdx_file_parameters params;
> +    uint32_t block_size;
> +    uint32_t block_size_bits;
> +    uint32_t sectors_per_block;
> +    uint32_t sectors_per_block_bits;
> +
> +    uint64_t virtual_disk_size;
> +    uint32_t logical_sector_size;
> +    uint32_t physical_sector_size;
> +
> +    uint64_t chunk_ratio;
> +    uint32_t chunk_ratio_bits;
> +    uint32_t logical_sector_size_bits;
> +
> +    uint32_t bat_entries;
> +    vhdx_bat_entry *bat;
> +    uint64_t bat_offset;
> +
> +    vhdx_parent_locator_header parent_header;
> +    vhdx_parent_locator_entry *parent_entries;
> +
> +} BDRVVHDXState;
> +
> +uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
> +                            int crc_offset)
> +{
> +    uint32_t crc_new;
> +    uint32_t crc_orig;
> +    assert(buf != NULL);
> +
> +    if (crc_offset > 0) {
> +        memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
> +        memset(buf+crc_offset, 0, sizeof(crc_orig));
> +    }
> +
> +    crc_new = crc32c(crc, buf, size);
> +    if (crc_offset > 0) {
> +        memcpy(buf+crc_offset, &crc_orig, sizeof(crc_orig));
> +    }
> +
> +    return crc_new;
> +}
> +
> +/* Validates the checksum of the buffer, with an in-place CRC.
> + *
> + * Zero is substituted during crc calculation for the original crc field,
> + * and the crc field is restored afterwards.  But the buffer will be modifed
> + * during the calculation, so this may not be not suitable for multi-threaded
> + * use.
> + *
> + * crc_offset: byte offset in buf of the buffer crc
> + * buf: buffer pointer
> + * size: size of buffer (must be > crc_offset+4)
> + *
> + * returns true if checksum is valid, false otherwise
> + */
> +bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
> +{
> +    uint32_t crc_orig;
> +    uint32_t crc;
> +
> +    assert(buf != NULL);
> +    assert(size > (crc_offset+4));
> +
> +    memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
> +    crc_orig = le32_to_cpu(crc_orig);
> +
> +    crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);
> +
> +    return crc == crc_orig;
> +}
> +
> +
> +/*
> + * Per the MS VHDX Specification, for every VHDX file:
> + *      - The header section is fixed size - 1 MB
> + *      - The header section is always the first "object"
> + *      - The first 64KB of the header is the File Identifier
> + *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
> + *      - The following 512 bytes constitute a UTF-16 string identifiying the
> + *        software that created the file, and is optional and diagnostic only.
> + *
> + *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
> + */
> +static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
> +{
> +    if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) {
> +        return 100;
> +    }
> +    return 0;
> +}
> +
> +/* All VHDX structures on disk are little endian */
> +static void vhdx_header_le_import(vhdx_header *h)
> +{
> +    assert(h != NULL);
> +
> +    le32_to_cpus(&h->signature);
> +    le32_to_cpus(&h->checksum);
> +    le64_to_cpus(&h->sequence_number);
> +
> +    leguid_to_cpus(&h->file_write_guid);
> +    leguid_to_cpus(&h->data_write_guid);
> +    leguid_to_cpus(&h->log_guid);
> +
> +    le16_to_cpus(&h->log_version);
> +    le16_to_cpus(&h->version);
> +    le32_to_cpus(&h->log_length);
> +    le64_to_cpus(&h->log_offset);
> +}
> +
> +
> +/* opens the specified header block from the VHDX file header section */
> +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    vhdx_header *header1;
> +    vhdx_header *header2;
> +    uint64_t h1_seq = 0;
> +    uint64_t h2_seq = 0;
> +    uint8_t *buffer;

Makes sense to use uint8_t* (or possible rather void*) here, but it
means that struct vhdx_header_padded is unused and could be dropped from
the header file.

> +    header1 = qemu_blockalign(bs, sizeof(vhdx_header));
> +    header2 = qemu_blockalign(bs, sizeof(vhdx_header));
> +
> +    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
> +
> +    s->headers[0] = header1;
> +    s->headers[1] = header2;

The spec requires that the signature field of the File Type Identifier
must be validated when loading a VHDX file. We only have the check in
vhdx_probe(), but not in vhdx_open().

(This means that our struct vhdx_file_identifier is unused.)

> +    /* We have to read the whole VHDX_HEADER_SIZE instead of
> +     * sizeof(vhdx_header), because the checksum is over the whole
> +     * region */
> +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    /* copy over just the relevant portion that we need */
> +    memcpy(header1, buffer, sizeof(vhdx_header));
> +    vhdx_header_le_import(header1);
> +
> +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> +        header1->signature == VHDX_HDR_MAGIC) {
> +        h1_seq = header1->sequence_number;
> +    }
> +
> +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    /* copy over just the relevant portion that we need */
> +    memcpy(header2, buffer, sizeof(vhdx_header));
> +    vhdx_header_le_import(header2);
> +
> +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> +        header2->signature == VHDX_HDR_MAGIC) {
> +        h2_seq = header2->sequence_number;
> +    }
> +
> +    if (h1_seq > h2_seq) {
> +        s->curr_header = 0;
> +    } else if (h2_seq > h1_seq) {
> +        s->curr_header = 1;
> +    } else {
> +        printf("NO VALID HEADER\n");
> +        ret = -EINVAL;
> +        goto fail;
> +    }

This may be nit-picking, but according to the spec, I think a header
with a sequence number of 0 can be valid (if the other header is
invalid). This code would get 0 for both and error out.

What does a new VHDX file look like in practice? Sequence numbers 1
and 2?

> +
> +    ret = 0;
> +
> +    goto exit;
> +
> +fail:
> +    qemu_vfree(header1);
> +    qemu_vfree(header2);
> +    s->headers[0] = NULL;
> +    s->headers[1] = NULL;
> +exit:
> +    qemu_vfree(buffer);
> +    return ret;
> +}
> +
> +
> +static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    uint8_t *buffer;
> +    int offset = 0;
> +    vhdx_region_table_entry rt_entry;
> +    int i;
> +
> +    /* We have to read the whole 64KB block, because the crc32 is over the
> +     * whole block */
> +    buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);
> +
> +    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
> +                    VHDX_HEADER_BLOCK_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    memcpy(&s->rt, buffer, sizeof(s->rt));
> +    le32_to_cpus(&s->rt.signature);
> +    le32_to_cpus(&s->rt.checksum);
> +    le32_to_cpus(&s->rt.entry_count);
> +    le32_to_cpus(&s->rt.reserved);
> +    offset += sizeof(s->rt);

Maybe it would safer with respect to forgotting endianness conversion if
we didn't copy and then convert fields, but only copy converted values.
Like:

vhdx_region_table_header *rt = buffer;
s->rt = (vhdx_region_table_header) {
    .signature  = le32_to_cpu(rt->signature),
    .checksum   = le32_to_cpu(rt->checksum),
    ...
};

Matter of taste, I guess, so this is just a suggestion, not a request.
You could do this is quite a few places, I think.

> +
> +    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
> +        s->rt.signature != VHDX_RT_MAGIC) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
> +    for (i = 0; i < s->rt.entry_count; i++) {
> +        memcpy(&rt_entry, buffer+offset, sizeof(rt_entry));
> +        offset += sizeof(rt_entry);
> +
> +        leguid_to_cpus(&rt_entry.guid);
> +        le64_to_cpus(&rt_entry.file_offset);
> +        le32_to_cpus(&rt_entry.length);
> +        le32_to_cpus(&rt_entry.data_bits);
> +
> +        /* see if we recognize the entry */
> +        if (guid_eq(rt_entry.guid, bat_guid)) {
> +            s->bat_rt = rt_entry;
> +            continue;
> +        }
> +
> +        if (guid_eq(rt_entry.guid, metadata_guid)) {
> +            s->metadata_rt = rt_entry;
> +            continue;
> +        }

If the same regions occurs multiple times, the latest wins. Such images
aren't valid, but what should we do with such cases? Is this good enough
or should we detect it?

> +        if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
> +            /* cannot read vhdx file - required region table entry that
> +             * we do not understand.  per spec, we must fail to open */
> +            ret = -ENOTSUP;
> +            goto fail;
> +        }
> +    }
> +    ret = 0;
> +
> +fail:
> +    qemu_vfree(buffer);
> +    return ret;
> +}
> +
> +
> +
> +/* Metadata initial parser
> + *
> + * This loads all the metadata entry fields.  This may cause additional
> + * fields to be processed (e.g. parent locator, etc..).
> + *
> + * There are 5 Metadata items that are always required:
> + *      - File Parameters (block size, has a parent)
> + *      - Virtual Disk Size (size, in bytes, of the virtual drive)
> + *      - Page 83 Data (scsi page 83 guid)
> + *      - Logical Sector Size (logical sector size in bytes, either 512 or
> + *                             4096.  We only support 512 currently)
> + *      - Physical Sector Size (512 or 4096)
> + *
> + * Also, if the File Parameters indicate this is a differencing file,
> + * we must also look for the Parent Locator metadata item.
> + */
> +static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    uint8_t *buffer;
> +    int offset = 0;
> +    int i = 0;
> +    uint32_t block_size, sectors_per_block, logical_sector_size;
> +    uint64_t chunk_ratio;
> +    vhdx_metadata_table_entry md_entry;
> +
> +    buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
> +
> +    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
> +                     VHDX_METADATA_TABLE_MAX_SIZE);
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +    memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
> +    offset += sizeof(s->metadata_hdr);
> +
> +    le64_to_cpus(&s->metadata_hdr.signature);
> +    le16_to_cpus(&s->metadata_hdr.reserved);
> +    le16_to_cpus(&s->metadata_hdr.entry_count);
> +
> +    if (s->metadata_hdr.signature != VHDX_METADATA_MAGIC) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    s->metadata_entries.present = 0;
> +
> +    if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
> +        (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
> +        memcpy(&md_entry, buffer+offset, sizeof(md_entry));
> +        offset += sizeof(md_entry);
> +
> +        leguid_to_cpus(&md_entry.item_id);
> +        le32_to_cpus(&md_entry.offset);
> +        le32_to_cpus(&md_entry.length);
> +        le32_to_cpus(&md_entry.data_bits);
> +        le32_to_cpus(&md_entry.reserved2);
> +
> +        if (guid_eq(md_entry.item_id, file_param_guid)) {
> +            s->metadata_entries.file_parameters_entry = md_entry;
> +            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_eq(md_entry.item_id, virtual_size_guid)) {
> +            s->metadata_entries.virtual_disk_size_entry = md_entry;
> +            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_eq(md_entry.item_id, page83_guid)) {
> +            s->metadata_entries.page83_data_entry = md_entry;
> +            s->metadata_entries.present |= META_PAGE_83_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_eq(md_entry.item_id, logical_sector_guid)) {
> +            s->metadata_entries.logical_sector_size_entry = md_entry;
> +            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_eq(md_entry.item_id, phys_sector_guid)) {
> +            s->metadata_entries.phys_sector_size_entry = md_entry;
> +            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_eq(md_entry.item_id, parent_locator_guid)) {
> +            s->metadata_entries.parent_locator_entry = md_entry;
> +            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
> +            continue;
> +        }
> +
> +        if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
> +            /* cannot read vhdx file - required region table entry that
> +             * we do not understand.  per spec, we must fail to open */
> +            ret = -ENOTSUP;
> +            goto exit;
> +        }
> +    }
> +
> +    if (s->metadata_entries.present != META_ALL_PRESENT) {
> +        ret = -ENOTSUP;
> +        goto exit;
> +    }
> +
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.file_parameters_entry.offset
> +                                         + s->metadata_rt.file_offset,
> +                     &s->params,
> +                     sizeof(s->params));
> +
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +
> +    le32_to_cpus(&s->params.block_size);
> +    le32_to_cpus(&s->params.data_bits);
> +
> +
> +    /* We now have the file parameters, so we can tell if this is a
> +     * differencing file (i.e.. has_parent), is dynamic or fixed
> +     * sized (leave_blocks_allocated), and the block size */
> +
> +    /* The parent locator required iff the file parameters has_parent set */
> +    if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
> +        if (s->metadata_entries.present & ~META_PARENT_LOCATOR_PRESENT) {

Not sure what you're trying to achieve here. We get -ENOTSUP if any
metadata entry except the parent locator is present, and -EINVAL if
there is none?

Of course, this doesn't matter currently, because above we've verified
that s->metadata_entries.present == META_ALL_PRESENT, so we'll always
get the -ENOTSUP case.

> +            /* TODO: parse  parent locator fields */
> +            ret = -ENOTSUP; /* temp, until differencing files are supported */
> +            goto exit;
> +        } else {
> +            /* if has_parent is set, but there is not parent locator present,
> +             * then that is an invalid combination */
> +            ret = -EINVAL;
> +            goto exit;
> +        }
> +    }
> +
> +    /* determine virtual disk size, logical sector size,
> +     * and phys sector size */
> +
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.virtual_disk_size_entry.offset
> +                                           + s->metadata_rt.file_offset,
> +                     &s->virtual_disk_size,
> +                     sizeof(uint64_t));
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.logical_sector_size_entry.offset
> +                                             + s->metadata_rt.file_offset,
> +                     &s->logical_sector_size,
> +                     sizeof(uint32_t));
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.phys_sector_size_entry.offset
> +                                          + s->metadata_rt.file_offset,
> +                     &s->physical_sector_size,
> +                     sizeof(uint32_t));
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +
> +    le64_to_cpus(&s->virtual_disk_size);
> +    le32_to_cpus(&s->logical_sector_size);
> +    le32_to_cpus(&s->physical_sector_size);
> +
> +    if (s->logical_sector_size == 0 || s->params.block_size == 0) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    /* both block_size and sector_size are guaranteed powers of 2 */
> +    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
> +    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
> +                     (uint64_t)s->logical_sector_size /
> +                     (uint64_t)s->params.block_size;
> +
> +    /* These values are ones we will want to use for division / multiplication
> +     * later on, and they are all guaranteed (per the spec) to be powers of 2,
> +     * so we can take advantage of that for shift operations during
> +     * reads/writes */
> +    logical_sector_size = s->logical_sector_size;
> +    if (logical_sector_size & (logical_sector_size - 1)) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    sectors_per_block = s->sectors_per_block;
> +    if (sectors_per_block & (sectors_per_block - 1)) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +    chunk_ratio = s->chunk_ratio;
> +    if (chunk_ratio & (chunk_ratio - 1)) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +    block_size = s->params.block_size;
> +    if (block_size & (block_size - 1)) {
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
> +    while (logical_sector_size >>= 1) {
> +        s->logical_sector_size_bits++;
> +    }
> +    while (sectors_per_block >>= 1) {
> +        s->sectors_per_block_bits++;
> +    }
> +    while (chunk_ratio >>= 1) {
> +        s->chunk_ratio_bits++;
> +    }
> +    while (block_size >>= 1) {
> +        s->block_size_bits++;
> +    }
> +
> +    if (s->logical_sector_size != BDRV_SECTOR_SIZE) {
> +        printf("VHDX error - QEMU only supports 512 byte sector sizes\n");

This is not true. It depends on the device model, and in particular the
qdev properties of it. The block layer doesn't have access to them, so
checking that it matches the image file isn't trivial. It would have to
be the device model that checks that the BlockDriverState is valid for
the device it emulates.

I suggest dropping this check; otherwise make it (q)error_report at
least.

> +        ret = -ENOTSUP;
> +        goto exit;
> +    }
> +
> +    ret = 0;
> +
> +exit:
> +    qemu_vfree(buffer);
> +    return ret;
> +}

Kevin
Jeff Cody - April 25, 2013, 3:03 p.m.
On Thu, Apr 25, 2013 at 03:04:23PM +0200, Kevin Wolf wrote:
> Am 23.04.2013 um 16:24 hat Jeff Cody geschrieben:
> > This is the initial block driver framework for VHDX image support (
> > i.e. Hyper-V image file formats), that supports opening VHDX files, and
> > parsing the headers.
> > 
> > This commit does not yet enable:
> >     - reading
> >     - writing
> >     - updating the header
> >     - differencing files (images with parents)
> >     - log replay / dirty logs (only clean images)
> > 
> > This is based on Microsoft's VHDX specification:
> >     "VHDX Format Specification v0.95", published 4/12/2012
> >     https://www.microsoft.com/en-us/download/details.aspx?id=29681
> > 
> > Signed-off-by: Jeff Cody <jcody@redhat.com>
> 
> Okay, now I'm starting to actually compare the spec to your
> implementation. Comments inline.
> 
> >  block/Makefile.objs |   1 +
> >  block/vhdx.c        | 769 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  block/vhdx.h        |  25 ++
> >  3 files changed, 795 insertions(+)
> >  create mode 100644 block/vhdx.c
> > 
> > diff --git a/block/Makefile.objs b/block/Makefile.objs
> > index 6c4b5bc..5f0358a 100644
> > --- a/block/Makefile.objs
> > +++ b/block/Makefile.objs
> > @@ -2,6 +2,7 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
> >  block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
> >  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
> >  block-obj-y += qed-check.o
> > +block-obj-y += vhdx.o
> >  block-obj-y += parallels.o blkdebug.o blkverify.o
> >  block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
> >  block-obj-$(CONFIG_POSIX) += raw-posix.o
> > diff --git a/block/vhdx.c b/block/vhdx.c
> > new file mode 100644
> > index 0000000..b0ea2ba
> > --- /dev/null
> > +++ b/block/vhdx.c
> > @@ -0,0 +1,769 @@
> > +/*
> > + * Block driver for Hyper-V VHDX Images
> > + *
> > + * Copyright (c) 2013 Red Hat, Inc.,
> > + *
> > + * Authors:
> > + *  Jeff Cody <jcody@redhat.com>
> > + *
> > + *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
> > + *  by Microsoft:
> > + *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
> > + *
> > + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> > + * See the COPYING.LIB file in the top-level directory.
> > + *
> > + */
> > +
> > +#include "qemu-common.h"
> > +#include "block/block_int.h"
> > +#include "qemu/module.h"
> > +#include "qemu/crc32c.h"
> > +#include "block/vhdx.h"
> > +
> > +
> > +/* Several metadata and region table data entries are identified by
> > + * guids in  a MS-specific GUID format. */
> > +
> > +
> > +/* ------- Known Region Table GUIDs ---------------------- */
> > +static const ms_guid bat_guid =      { .data1 = 0x2dc27766,
> > +                                       .data2 = 0xf623,
> > +                                       .data3 = 0x4200,
> > +                                       .data4 = { 0x9d, 0x64, 0x11, 0x5e,
> > +                                                  0x9b, 0xfd, 0x4a, 0x08} };
> > +
> > +static const ms_guid metadata_guid = { .data1 = 0x8b7ca206,
> > +                                       .data2 = 0x4790,
> > +                                       .data3 = 0x4b9a,
> > +                                       .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
> > +                                                  0x05, 0x0f, 0x88, 0x6e} };
> > +
> > +
> > +
> > +/* ------- Known Metadata Entry GUIDs ---------------------- */
> > +static const ms_guid file_param_guid =   { .data1 = 0xcaa16737,
> > +                                           .data2 = 0xfa36,
> > +                                           .data3 = 0x4d43,
> > +                                           .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
> > +                                                      0xaa, 0x44, 0xe7, 0x6b} };
> > +
> > +static const ms_guid virtual_size_guid = { .data1 = 0x2FA54224,
> > +                                           .data2 = 0xcd1b,
> > +                                           .data3 = 0x4876,
> > +                                           .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
> > +                                                      0xd8, 0x3b, 0xf4, 0xb8} };
> > +
> > +static const ms_guid page83_guid =       { .data1 = 0xbeca12ab,
> > +                                           .data2 = 0xb2e6,
> > +                                           .data3 = 0x4523,
> > +                                           .data4 = { 0x93, 0xef, 0xc3, 0x09,
> > +                                                      0xe0, 0x00, 0xc7, 0x46} };
> > +
> > +
> > +static const ms_guid phys_sector_guid =  { .data1 = 0xcda348c7,
> > +                                           .data2 = 0x445d,
> > +                                           .data3 = 0x4471,
> > +                                           .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
> > +                                                      0x52, 0x51, 0xc5, 0x56} };
> > +
> > +static const ms_guid parent_locator_guid = { .data1 = 0xa8d35f2d,
> > +                                             .data2 = 0xb30b,
> > +                                             .data3 = 0x454d,
> > +                                             .data4 = { 0xab, 0xf7, 0xd3,
> > +                                                        0xd8, 0x48, 0x34,
> > +                                                        0xab, 0x0c} };
> > +
> > +static const ms_guid logical_sector_guid = { .data1 = 0x8141bf1d,
> > +                                             .data2 = 0xa96f,
> > +                                             .data3 = 0x4709,
> > +                                             .data4 = { 0xba, 0x47, 0xf2,
> > +                                                        0x33, 0xa8, 0xfa,
> > +                                                        0xab, 0x5f} };
> > +
> > +/* Each parent type must have a valid GUID; this is for parent images
> > + * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
> > + * need to make up our own QCOW2 GUID type */
> > +static const ms_guid parent_vhdx_guid = { .data1 = 0xb04aefb7,
> > +                                          .data2 = 0xd19e,
> > +                                          .data3 = 0x4a81,
> > +                                          .data4 = { 0xb7, 0x89, 0x25, 0xb8,
> > +                                                     0xe9, 0x44, 0x59, 0x13} };
> > +
> > +
> > +#define META_FILE_PARAMETER_PRESENT      0x01
> > +#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
> > +#define META_PAGE_83_PRESENT             0x04
> > +#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
> > +#define META_PHYS_SECTOR_SIZE_PRESENT    0x10
> > +#define META_PARENT_LOCATOR_PRESENT      0x20
> > +
> > +#define META_ALL_PRESENT    \
> > +    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
> > +     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
> > +     META_PHYS_SECTOR_SIZE_PRESENT)
> > +
> > +typedef struct vhdx_metadata_entries {
> > +    vhdx_metadata_table_entry file_parameters_entry;
> > +    vhdx_metadata_table_entry virtual_disk_size_entry;
> > +    vhdx_metadata_table_entry page83_data_entry;
> > +    vhdx_metadata_table_entry logical_sector_size_entry;
> > +    vhdx_metadata_table_entry phys_sector_size_entry;
> > +    vhdx_metadata_table_entry parent_locator_entry;
> > +    uint16_t present;
> > +} vhdx_metadata_entries;
> > +
> > +
> > +typedef struct BDRVVHDXState {
> > +    CoMutex lock;
> > +
> > +    int curr_header;
> > +    vhdx_header *headers[2];
> > +
> > +    vhdx_region_table_header rt;
> > +    vhdx_region_table_entry bat_rt;         /* region table for the BAT */
> > +    vhdx_region_table_entry metadata_rt;    /* region table for the metadata */
> > +
> > +    vhdx_metadata_table_header  metadata_hdr;
> > +    vhdx_metadata_entries metadata_entries;
> > +
> > +    vhdx_file_parameters params;
> > +    uint32_t block_size;
> > +    uint32_t block_size_bits;
> > +    uint32_t sectors_per_block;
> > +    uint32_t sectors_per_block_bits;
> > +
> > +    uint64_t virtual_disk_size;
> > +    uint32_t logical_sector_size;
> > +    uint32_t physical_sector_size;
> > +
> > +    uint64_t chunk_ratio;
> > +    uint32_t chunk_ratio_bits;
> > +    uint32_t logical_sector_size_bits;
> > +
> > +    uint32_t bat_entries;
> > +    vhdx_bat_entry *bat;
> > +    uint64_t bat_offset;
> > +
> > +    vhdx_parent_locator_header parent_header;
> > +    vhdx_parent_locator_entry *parent_entries;
> > +
> > +} BDRVVHDXState;
> > +
> > +uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
> > +                            int crc_offset)
> > +{
> > +    uint32_t crc_new;
> > +    uint32_t crc_orig;
> > +    assert(buf != NULL);
> > +
> > +    if (crc_offset > 0) {
> > +        memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
> > +        memset(buf+crc_offset, 0, sizeof(crc_orig));
> > +    }
> > +
> > +    crc_new = crc32c(crc, buf, size);
> > +    if (crc_offset > 0) {
> > +        memcpy(buf+crc_offset, &crc_orig, sizeof(crc_orig));
> > +    }
> > +
> > +    return crc_new;
> > +}
> > +
> > +/* Validates the checksum of the buffer, with an in-place CRC.
> > + *
> > + * Zero is substituted during crc calculation for the original crc field,
> > + * and the crc field is restored afterwards.  But the buffer will be modifed
> > + * during the calculation, so this may not be not suitable for multi-threaded
> > + * use.
> > + *
> > + * crc_offset: byte offset in buf of the buffer crc
> > + * buf: buffer pointer
> > + * size: size of buffer (must be > crc_offset+4)
> > + *
> > + * returns true if checksum is valid, false otherwise
> > + */
> > +bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
> > +{
> > +    uint32_t crc_orig;
> > +    uint32_t crc;
> > +
> > +    assert(buf != NULL);
> > +    assert(size > (crc_offset+4));
> > +
> > +    memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
> > +    crc_orig = le32_to_cpu(crc_orig);
> > +
> > +    crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);
> > +
> > +    return crc == crc_orig;
> > +}
> > +
> > +
> > +/*
> > + * Per the MS VHDX Specification, for every VHDX file:
> > + *      - The header section is fixed size - 1 MB
> > + *      - The header section is always the first "object"
> > + *      - The first 64KB of the header is the File Identifier
> > + *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
> > + *      - The following 512 bytes constitute a UTF-16 string identifiying the
> > + *        software that created the file, and is optional and diagnostic only.
> > + *
> > + *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
> > + */
> > +static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
> > +{
> > +    if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) {
> > +        return 100;
> > +    }
> > +    return 0;
> > +}
> > +
> > +/* All VHDX structures on disk are little endian */
> > +static void vhdx_header_le_import(vhdx_header *h)
> > +{
> > +    assert(h != NULL);
> > +
> > +    le32_to_cpus(&h->signature);
> > +    le32_to_cpus(&h->checksum);
> > +    le64_to_cpus(&h->sequence_number);
> > +
> > +    leguid_to_cpus(&h->file_write_guid);
> > +    leguid_to_cpus(&h->data_write_guid);
> > +    leguid_to_cpus(&h->log_guid);
> > +
> > +    le16_to_cpus(&h->log_version);
> > +    le16_to_cpus(&h->version);
> > +    le32_to_cpus(&h->log_length);
> > +    le64_to_cpus(&h->log_offset);
> > +}
> > +
> > +
> > +/* opens the specified header block from the VHDX file header section */
> > +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    vhdx_header *header1;
> > +    vhdx_header *header2;
> > +    uint64_t h1_seq = 0;
> > +    uint64_t h2_seq = 0;
> > +    uint8_t *buffer;
> 
> Makes sense to use uint8_t* (or possible rather void*) here, but it
> means that struct vhdx_header_padded is unused and could be dropped from
> the header file.
>

OK, I'll drop it from the header file (or maybe convert the meaning of
it into a comment in the header, to preserve the info).

> > +    header1 = qemu_blockalign(bs, sizeof(vhdx_header));
> > +    header2 = qemu_blockalign(bs, sizeof(vhdx_header));
> > +
> > +    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
> > +
> > +    s->headers[0] = header1;
> > +    s->headers[1] = header2;
> 
> The spec requires that the signature field of the File Type Identifier
> must be validated when loading a VHDX file. We only have the check in
> vhdx_probe(), but not in vhdx_open().
> 

Yes, true - I'll add that in first.

> (This means that our struct vhdx_file_identifier is unused.)
> 
> > +    /* We have to read the whole VHDX_HEADER_SIZE instead of
> > +     * sizeof(vhdx_header), because the checksum is over the whole
> > +     * region */
> > +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    /* copy over just the relevant portion that we need */
> > +    memcpy(header1, buffer, sizeof(vhdx_header));
> > +    vhdx_header_le_import(header1);
> > +
> > +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> > +        header1->signature == VHDX_HDR_MAGIC) {
> > +        h1_seq = header1->sequence_number;
> > +    }
> > +
> > +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    /* copy over just the relevant portion that we need */
> > +    memcpy(header2, buffer, sizeof(vhdx_header));
> > +    vhdx_header_le_import(header2);
> > +
> > +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> > +        header2->signature == VHDX_HDR_MAGIC) {
> > +        h2_seq = header2->sequence_number;
> > +    }
> > +
> > +    if (h1_seq > h2_seq) {
> > +        s->curr_header = 0;
> > +    } else if (h2_seq > h1_seq) {
> > +        s->curr_header = 1;
> > +    } else {
> > +        printf("NO VALID HEADER\n");
> > +        ret = -EINVAL;
> > +        goto fail;
> > +    }
> 
> This may be nit-picking, but according to the spec, I think a header
> with a sequence number of 0 can be valid (if the other header is
> invalid). This code would get 0 for both and error out.
> 

I think you are right.  The other header would need to have either an
invalid signature or checksum, but if that was the case technically 0
could indicate the active header.

> What does a new VHDX file look like in practice? Sequence numbers 1
> and 2?
> 

I'll check that out - my Hyper-V test server is at home, and I am
working at a co-working place today.  I'll check it this evening -
it'll have to be on a freshly created image before even opening it up
with Hyper-V.

> > +
> > +    ret = 0;
> > +
> > +    goto exit;
> > +
> > +fail:
> > +    qemu_vfree(header1);
> > +    qemu_vfree(header2);
> > +    s->headers[0] = NULL;
> > +    s->headers[1] = NULL;
> > +exit:
> > +    qemu_vfree(buffer);
> > +    return ret;
> > +}
> > +
> > +
> > +static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    uint8_t *buffer;
> > +    int offset = 0;
> > +    vhdx_region_table_entry rt_entry;
> > +    int i;
> > +
> > +    /* We have to read the whole 64KB block, because the crc32 is over the
> > +     * whole block */
> > +    buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);
> > +
> > +    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
> > +                    VHDX_HEADER_BLOCK_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    memcpy(&s->rt, buffer, sizeof(s->rt));
> > +    le32_to_cpus(&s->rt.signature);
> > +    le32_to_cpus(&s->rt.checksum);
> > +    le32_to_cpus(&s->rt.entry_count);
> > +    le32_to_cpus(&s->rt.reserved);
> > +    offset += sizeof(s->rt);
> 
> Maybe it would safer with respect to forgotting endianness conversion if
> we didn't copy and then convert fields, but only copy converted values.
> Like:
> 
> vhdx_region_table_header *rt = buffer;
> s->rt = (vhdx_region_table_header) {
>     .signature  = le32_to_cpu(rt->signature),
>     .checksum   = le32_to_cpu(rt->checksum),
>     ...
> };
> 
> Matter of taste, I guess, so this is just a suggestion, not a request.
> You could do this is quite a few places, I think.
> 
> > +
> > +    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
> > +        s->rt.signature != VHDX_RT_MAGIC) {
> > +        ret = -EINVAL;
> > +        goto fail;
> > +    }
> > +
> > +    for (i = 0; i < s->rt.entry_count; i++) {
> > +        memcpy(&rt_entry, buffer+offset, sizeof(rt_entry));
> > +        offset += sizeof(rt_entry);
> > +
> > +        leguid_to_cpus(&rt_entry.guid);
> > +        le64_to_cpus(&rt_entry.file_offset);
> > +        le32_to_cpus(&rt_entry.length);
> > +        le32_to_cpus(&rt_entry.data_bits);
> > +
> > +        /* see if we recognize the entry */
> > +        if (guid_eq(rt_entry.guid, bat_guid)) {
> > +            s->bat_rt = rt_entry;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(rt_entry.guid, metadata_guid)) {
> > +            s->metadata_rt = rt_entry;
> > +            continue;
> > +        }
> 
> If the same regions occurs multiple times, the latest wins. Such images
> aren't valid, but what should we do with such cases? Is this good enough
> or should we detect it?

I think such images are more undefined rather than explicitly invalid.
I don't think the spec touches on the idea of multiple regions of the
same type.  That said, I don't what to make of an image file with
multiple BAT regions, for instance - I think error is the only sane
option.

Maybe keep a list of all entries, and if there are duplicates error
out with -EINVAL?  

> 
> > +        if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
> > +            /* cannot read vhdx file - required region table entry that
> > +             * we do not understand.  per spec, we must fail to open */
> > +            ret = -ENOTSUP;
> > +            goto fail;
> > +        }
> > +    }
> > +    ret = 0;
> > +
> > +fail:
> > +    qemu_vfree(buffer);
> > +    return ret;
> > +}
> > +
> > +
> > +
> > +/* Metadata initial parser
> > + *
> > + * This loads all the metadata entry fields.  This may cause additional
> > + * fields to be processed (e.g. parent locator, etc..).
> > + *
> > + * There are 5 Metadata items that are always required:
> > + *      - File Parameters (block size, has a parent)
> > + *      - Virtual Disk Size (size, in bytes, of the virtual drive)
> > + *      - Page 83 Data (scsi page 83 guid)
> > + *      - Logical Sector Size (logical sector size in bytes, either 512 or
> > + *                             4096.  We only support 512 currently)
> > + *      - Physical Sector Size (512 or 4096)
> > + *
> > + * Also, if the File Parameters indicate this is a differencing file,
> > + * we must also look for the Parent Locator metadata item.
> > + */
> > +static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    uint8_t *buffer;
> > +    int offset = 0;
> > +    int i = 0;
> > +    uint32_t block_size, sectors_per_block, logical_sector_size;
> > +    uint64_t chunk_ratio;
> > +    vhdx_metadata_table_entry md_entry;
> > +
> > +    buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
> > +
> > +    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
> > +                     VHDX_METADATA_TABLE_MAX_SIZE);
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +    memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
> > +    offset += sizeof(s->metadata_hdr);
> > +
> > +    le64_to_cpus(&s->metadata_hdr.signature);
> > +    le16_to_cpus(&s->metadata_hdr.reserved);
> > +    le16_to_cpus(&s->metadata_hdr.entry_count);
> > +
> > +    if (s->metadata_hdr.signature != VHDX_METADATA_MAGIC) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    s->metadata_entries.present = 0;
> > +
> > +    if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
> > +        (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
> > +        memcpy(&md_entry, buffer+offset, sizeof(md_entry));
> > +        offset += sizeof(md_entry);
> > +
> > +        leguid_to_cpus(&md_entry.item_id);
> > +        le32_to_cpus(&md_entry.offset);
> > +        le32_to_cpus(&md_entry.length);
> > +        le32_to_cpus(&md_entry.data_bits);
> > +        le32_to_cpus(&md_entry.reserved2);
> > +
> > +        if (guid_eq(md_entry.item_id, file_param_guid)) {
> > +            s->metadata_entries.file_parameters_entry = md_entry;
> > +            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(md_entry.item_id, virtual_size_guid)) {
> > +            s->metadata_entries.virtual_disk_size_entry = md_entry;
> > +            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(md_entry.item_id, page83_guid)) {
> > +            s->metadata_entries.page83_data_entry = md_entry;
> > +            s->metadata_entries.present |= META_PAGE_83_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(md_entry.item_id, logical_sector_guid)) {
> > +            s->metadata_entries.logical_sector_size_entry = md_entry;
> > +            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(md_entry.item_id, phys_sector_guid)) {
> > +            s->metadata_entries.phys_sector_size_entry = md_entry;
> > +            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_eq(md_entry.item_id, parent_locator_guid)) {
> > +            s->metadata_entries.parent_locator_entry = md_entry;
> > +            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
> > +            /* cannot read vhdx file - required region table entry that
> > +             * we do not understand.  per spec, we must fail to open */
> > +            ret = -ENOTSUP;
> > +            goto exit;
> > +        }
> > +    }
> > +
> > +    if (s->metadata_entries.present != META_ALL_PRESENT) {
> > +        ret = -ENOTSUP;
> > +        goto exit;
> > +    }
> > +
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.file_parameters_entry.offset
> > +                                         + s->metadata_rt.file_offset,
> > +                     &s->params,
> > +                     sizeof(s->params));
> > +
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +
> > +    le32_to_cpus(&s->params.block_size);
> > +    le32_to_cpus(&s->params.data_bits);
> > +
> > +
> > +    /* We now have the file parameters, so we can tell if this is a
> > +     * differencing file (i.e.. has_parent), is dynamic or fixed
> > +     * sized (leave_blocks_allocated), and the block size */
> > +
> > +    /* The parent locator required iff the file parameters has_parent set */
> > +    if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
> > +        if (s->metadata_entries.present & ~META_PARENT_LOCATOR_PRESENT) {
> 
> Not sure what you're trying to achieve here. We get -ENOTSUP if any
> metadata entry except the parent locator is present, and -EINVAL if
> there is none?
>

That is a mistake - that check should be sans tilde:
  if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) 


There are currently 2 failure cases (while parent locators are
unsupported):

    1.) There is a parent present, and a parent locator field.
            - This is proper per the spec.  Normally, we would parse
              the parent locator field.  But this is not supported, so
              we return -ENOTSUP.

    2.) There is a parent present, but not parent locator field.
            - This is invalid, because if the parent is present the
              parent locator field per spec is a required field.  So,
              return -EINVAL to indicate invalid field.


> Of course, this doesn't matter currently, because above we've verified
> that s->metadata_entries.present == META_ALL_PRESENT, so we'll always
> get the -ENOTSUP case.
> 

Once we support parent locator fields, that check should be masked
against META_ALL_PRESENT, so that we are only checking for the base
required fields.  It wouldn't hurt to got ahead and do it now.

> > +            /* TODO: parse  parent locator fields */
> > +            ret = -ENOTSUP; /* temp, until differencing files are supported */
> > +            goto exit;
> > +        } else {
> > +            /* if has_parent is set, but there is not parent locator present,
> > +             * then that is an invalid combination */
> > +            ret = -EINVAL;
> > +            goto exit;
> > +        }
> > +    }
> > +
> > +    /* determine virtual disk size, logical sector size,
> > +     * and phys sector size */
> > +
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.virtual_disk_size_entry.offset
> > +                                           + s->metadata_rt.file_offset,
> > +                     &s->virtual_disk_size,
> > +                     sizeof(uint64_t));
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.logical_sector_size_entry.offset
> > +                                             + s->metadata_rt.file_offset,
> > +                     &s->logical_sector_size,
> > +                     sizeof(uint32_t));
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.phys_sector_size_entry.offset
> > +                                          + s->metadata_rt.file_offset,
> > +                     &s->physical_sector_size,
> > +                     sizeof(uint32_t));
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +
> > +    le64_to_cpus(&s->virtual_disk_size);
> > +    le32_to_cpus(&s->logical_sector_size);
> > +    le32_to_cpus(&s->physical_sector_size);
> > +
> > +    if (s->logical_sector_size == 0 || s->params.block_size == 0) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    /* both block_size and sector_size are guaranteed powers of 2 */
> > +    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
> > +    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
> > +                     (uint64_t)s->logical_sector_size /
> > +                     (uint64_t)s->params.block_size;
> > +
> > +    /* These values are ones we will want to use for division / multiplication
> > +     * later on, and they are all guaranteed (per the spec) to be powers of 2,
> > +     * so we can take advantage of that for shift operations during
> > +     * reads/writes */
> > +    logical_sector_size = s->logical_sector_size;
> > +    if (logical_sector_size & (logical_sector_size - 1)) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    sectors_per_block = s->sectors_per_block;
> > +    if (sectors_per_block & (sectors_per_block - 1)) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +    chunk_ratio = s->chunk_ratio;
> > +    if (chunk_ratio & (chunk_ratio - 1)) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +    block_size = s->params.block_size;
> > +    if (block_size & (block_size - 1)) {
> > +        ret = -EINVAL;
> > +        goto exit;
> > +    }
> > +
> > +    while (logical_sector_size >>= 1) {
> > +        s->logical_sector_size_bits++;
> > +    }
> > +    while (sectors_per_block >>= 1) {
> > +        s->sectors_per_block_bits++;
> > +    }
> > +    while (chunk_ratio >>= 1) {
> > +        s->chunk_ratio_bits++;
> > +    }
> > +    while (block_size >>= 1) {
> > +        s->block_size_bits++;
> > +    }
> > +
> > +    if (s->logical_sector_size != BDRV_SECTOR_SIZE) {
> > +        printf("VHDX error - QEMU only supports 512 byte sector sizes\n");
> 
> This is not true. It depends on the device model, and in particular the
> qdev properties of it. The block layer doesn't have access to them, so
> checking that it matches the image file isn't trivial. It would have to
> be the device model that checks that the BlockDriverState is valid for
> the device it emulates.
> 
> I suggest dropping this check; otherwise make it (q)error_report at
> least.
> 

OK.

> > +        ret = -ENOTSUP;
> > +        goto exit;
> > +    }
> > +
> > +    ret = 0;
> > +
> > +exit:
> > +    qemu_vfree(buffer);
> > +    return ret;
> > +}
> 
> Kevin
Kevin Wolf - April 25, 2013, 4:52 p.m.
Am 25.04.2013 um 17:03 hat Jeff Cody geschrieben:
> On Thu, Apr 25, 2013 at 03:04:23PM +0200, Kevin Wolf wrote:
> > Am 23.04.2013 um 16:24 hat Jeff Cody geschrieben:
> > > +    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
> > > +        s->rt.signature != VHDX_RT_MAGIC) {
> > > +        ret = -EINVAL;
> > > +        goto fail;
> > > +    }
> > > +
> > > +    for (i = 0; i < s->rt.entry_count; i++) {
> > > +        memcpy(&rt_entry, buffer+offset, sizeof(rt_entry));
> > > +        offset += sizeof(rt_entry);
> > > +
> > > +        leguid_to_cpus(&rt_entry.guid);
> > > +        le64_to_cpus(&rt_entry.file_offset);
> > > +        le32_to_cpus(&rt_entry.length);
> > > +        le32_to_cpus(&rt_entry.data_bits);
> > > +
> > > +        /* see if we recognize the entry */
> > > +        if (guid_eq(rt_entry.guid, bat_guid)) {
> > > +            s->bat_rt = rt_entry;
> > > +            continue;
> > > +        }
> > > +
> > > +        if (guid_eq(rt_entry.guid, metadata_guid)) {
> > > +            s->metadata_rt = rt_entry;
> > > +            continue;
> > > +        }
> > 
> > If the same regions occurs multiple times, the latest wins. Such images
> > aren't valid, but what should we do with such cases? Is this good enough
> > or should we detect it?
> 
> I think such images are more undefined rather than explicitly invalid.
> I don't think the spec touches on the idea of multiple regions of the
> same type.

It says "The Guid field specifies a 128-bit identifier for the object
and must be unique within the table.", so it's explicitly forbidden to
have two regions of the same type.

> That said, I don't what to make of an image file with
> multiple BAT regions, for instance - I think error is the only sane
> option.
> 
> Maybe keep a list of all entries, and if there are duplicates error
> out with -EINVAL?

We could do that if we really care. Or just check for the ones that we
recognise that they aren't set yet when we assign them.

> > > +    /* We now have the file parameters, so we can tell if this is a
> > > +     * differencing file (i.e.. has_parent), is dynamic or fixed
> > > +     * sized (leave_blocks_allocated), and the block size */
> > > +
> > > +    /* The parent locator required iff the file parameters has_parent set */
> > > +    if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
> > > +        if (s->metadata_entries.present & ~META_PARENT_LOCATOR_PRESENT) {
> > 
> > Not sure what you're trying to achieve here. We get -ENOTSUP if any
> > metadata entry except the parent locator is present, and -EINVAL if
> > there is none?
> >
> 
> That is a mistake - that check should be sans tilde:
>   if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) 

Right, that makes more sense.

Kevin
Fam Zheng - April 28, 2013, 7:29 a.m.
> +static void cpu_to_leguids(ms_guid *guid)
> +{
> +    cpu_to_le32s(&guid->data1);
> +    cpu_to_le16s(&guid->data2);
> +    cpu_to_le16s(&guid->data3);
> +}

This one seems used in 5/5 only, so this patch fails compiling (defined
but not used with -Werror=unused-function). Maybe move to 5/5?
Fam Zheng - April 28, 2013, 9:58 a.m.
On Tue, 04/23 10:24, Jeff Cody wrote:
> +/* opens the specified header block from the VHDX file header section */
> +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    vhdx_header *header1;
> +    vhdx_header *header2;
> +    uint64_t h1_seq = 0;
> +    uint64_t h2_seq = 0;
> +    uint8_t *buffer;
> +
> +    header1 = qemu_blockalign(bs, sizeof(vhdx_header));
> +    header2 = qemu_blockalign(bs, sizeof(vhdx_header));
> +
> +    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
> +
> +    s->headers[0] = header1;
> +    s->headers[1] = header2;
> +
Logic for header1 and header2 are completely the same, IMO it might be
better to avoid code with a loop.
> +    /* We have to read the whole VHDX_HEADER_SIZE instead of
> +     * sizeof(vhdx_header), because the checksum is over the whole
> +     * region */
> +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    /* copy over just the relevant portion that we need */
> +    memcpy(header1, buffer, sizeof(vhdx_header));
> +    vhdx_header_le_import(header1);
> +
> +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> +        header1->signature == VHDX_HDR_MAGIC) {
> +        h1_seq = header1->sequence_number;
> +    }
Do we need to check the version here? As the specification page 15 says:

    The Version field specifies the version of the VHDX format used
    within the VHDX file. This field must be set to 1. If it is not, a
    parser must not attempt to parse the file using the details from
    this format specification.

> +
> +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    /* copy over just the relevant portion that we need */
> +    memcpy(header2, buffer, sizeof(vhdx_header));
> +    vhdx_header_le_import(header2);
> +
> +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> +        header2->signature == VHDX_HDR_MAGIC) {
> +        h2_seq = header2->sequence_number;
> +    }
Same as above.
> +
> +    if (h1_seq > h2_seq) {
> +        s->curr_header = 0;
> +    } else if (h2_seq > h1_seq) {
> +        s->curr_header = 1;
> +    } else {
> +        printf("NO VALID HEADER\n");
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
> +    ret = 0;
> +
> +    goto exit;
> +
> +fail:
> +    qemu_vfree(header1);
> +    qemu_vfree(header2);
> +    s->headers[0] = NULL;
> +    s->headers[1] = NULL;
> +exit:
> +    qemu_vfree(buffer);
> +    return ret;
> +}
Jeff Cody - April 29, 2013, 5:24 p.m.
On Sun, Apr 28, 2013 at 05:58:55PM +0800, Fam Zheng wrote:
> On Tue, 04/23 10:24, Jeff Cody wrote:
> > +/* opens the specified header block from the VHDX file header section */
> > +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    vhdx_header *header1;
> > +    vhdx_header *header2;
> > +    uint64_t h1_seq = 0;
> > +    uint64_t h2_seq = 0;
> > +    uint8_t *buffer;
> > +
> > +    header1 = qemu_blockalign(bs, sizeof(vhdx_header));
> > +    header2 = qemu_blockalign(bs, sizeof(vhdx_header));
> > +
> > +    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
> > +
> > +    s->headers[0] = header1;
> > +    s->headers[1] = header2;
> > +
> Logic for header1 and header2 are completely the same, IMO it might be
> better to avoid code with a loop.

> > +    /* We have to read the whole VHDX_HEADER_SIZE instead of
> > +     * sizeof(vhdx_header), because the checksum is over the whole
> > +     * region */
> > +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    /* copy over just the relevant portion that we need */
> > +    memcpy(header1, buffer, sizeof(vhdx_header));
> > +    vhdx_header_le_import(header1);
> > +
> > +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> > +        header1->signature == VHDX_HDR_MAGIC) {
> > +        h1_seq = header1->sequence_number;
> > +    }
> Do we need to check the version here? As the specification page 15 says:
> 
>     The Version field specifies the version of the VHDX format used
>     within the VHDX file. This field must be set to 1. If it is not, a
>     parser must not attempt to parse the file using the details from
>     this format specification.
> 

Yes, you are correct - I will add that in for v3.

> > +
> > +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    /* copy over just the relevant portion that we need */
> > +    memcpy(header2, buffer, sizeof(vhdx_header));
> > +    vhdx_header_le_import(header2);
> > +
> > +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
> > +        header2->signature == VHDX_HDR_MAGIC) {
> > +        h2_seq = header2->sequence_number;
> > +    }
> Same as above.
> > +
> > +    if (h1_seq > h2_seq) {
> > +        s->curr_header = 0;
> > +    } else if (h2_seq > h1_seq) {
> > +        s->curr_header = 1;
> > +    } else {
> > +        printf("NO VALID HEADER\n");
> > +        ret = -EINVAL;
> > +        goto fail;
> > +    }
> > +
> > +    ret = 0;
> > +
> > +    goto exit;
> > +
> > +fail:
> > +    qemu_vfree(header1);
> > +    qemu_vfree(header2);
> > +    s->headers[0] = NULL;
> > +    s->headers[1] = NULL;
> > +exit:
> > +    qemu_vfree(buffer);
> > +    return ret;
> > +}
> 
> -- 
> Fam
Jeff Cody - April 29, 2013, 5:25 p.m.
On Sun, Apr 28, 2013 at 03:29:14PM +0800, Fam Zheng wrote:
> > +static void cpu_to_leguids(ms_guid *guid)
> > +{
> > +    cpu_to_le32s(&guid->data1);
> > +    cpu_to_le16s(&guid->data2);
> > +    cpu_to_le16s(&guid->data3);
> > +}
> 
> This one seems used in 5/5 only, so this patch fails compiling (defined
> but not used with -Werror=unused-function). Maybe move to 5/5?
>

Thanks.  Yes, that should be in patch 5 (git rebase user error there),
which means this will go away completely for v3.

Patch

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 6c4b5bc..5f0358a 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -2,6 +2,7 @@  block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
+block-obj-y += vhdx.o
 block-obj-y += parallels.o blkdebug.o blkverify.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
diff --git a/block/vhdx.c b/block/vhdx.c
new file mode 100644
index 0000000..b0ea2ba
--- /dev/null
+++ b/block/vhdx.c
@@ -0,0 +1,769 @@ 
+/*
+ * Block driver for Hyper-V VHDX Images
+ *
+ * Copyright (c) 2013 Red Hat, Inc.,
+ *
+ * Authors:
+ *  Jeff Cody <jcody@redhat.com>
+ *
+ *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
+ *  by Microsoft:
+ *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "qemu/crc32c.h"
+#include "block/vhdx.h"
+
+
+/* Several metadata and region table data entries are identified by
+ * guids in  a MS-specific GUID format. */
+
+
+/* ------- Known Region Table GUIDs ---------------------- */
+static const ms_guid bat_guid =      { .data1 = 0x2dc27766,
+                                       .data2 = 0xf623,
+                                       .data3 = 0x4200,
+                                       .data4 = { 0x9d, 0x64, 0x11, 0x5e,
+                                                  0x9b, 0xfd, 0x4a, 0x08} };
+
+static const ms_guid metadata_guid = { .data1 = 0x8b7ca206,
+                                       .data2 = 0x4790,
+                                       .data3 = 0x4b9a,
+                                       .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
+                                                  0x05, 0x0f, 0x88, 0x6e} };
+
+
+
+/* ------- Known Metadata Entry GUIDs ---------------------- */
+static const ms_guid file_param_guid =   { .data1 = 0xcaa16737,
+                                           .data2 = 0xfa36,
+                                           .data3 = 0x4d43,
+                                           .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
+                                                      0xaa, 0x44, 0xe7, 0x6b} };
+
+static const ms_guid virtual_size_guid = { .data1 = 0x2FA54224,
+                                           .data2 = 0xcd1b,
+                                           .data3 = 0x4876,
+                                           .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
+                                                      0xd8, 0x3b, 0xf4, 0xb8} };
+
+static const ms_guid page83_guid =       { .data1 = 0xbeca12ab,
+                                           .data2 = 0xb2e6,
+                                           .data3 = 0x4523,
+                                           .data4 = { 0x93, 0xef, 0xc3, 0x09,
+                                                      0xe0, 0x00, 0xc7, 0x46} };
+
+
+static const ms_guid phys_sector_guid =  { .data1 = 0xcda348c7,
+                                           .data2 = 0x445d,
+                                           .data3 = 0x4471,
+                                           .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
+                                                      0x52, 0x51, 0xc5, 0x56} };
+
+static const ms_guid parent_locator_guid = { .data1 = 0xa8d35f2d,
+                                             .data2 = 0xb30b,
+                                             .data3 = 0x454d,
+                                             .data4 = { 0xab, 0xf7, 0xd3,
+                                                        0xd8, 0x48, 0x34,
+                                                        0xab, 0x0c} };
+
+static const ms_guid logical_sector_guid = { .data1 = 0x8141bf1d,
+                                             .data2 = 0xa96f,
+                                             .data3 = 0x4709,
+                                             .data4 = { 0xba, 0x47, 0xf2,
+                                                        0x33, 0xa8, 0xfa,
+                                                        0xab, 0x5f} };
+
+/* Each parent type must have a valid GUID; this is for parent images
+ * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
+ * need to make up our own QCOW2 GUID type */
+static const ms_guid parent_vhdx_guid = { .data1 = 0xb04aefb7,
+                                          .data2 = 0xd19e,
+                                          .data3 = 0x4a81,
+                                          .data4 = { 0xb7, 0x89, 0x25, 0xb8,
+                                                     0xe9, 0x44, 0x59, 0x13} };
+
+
+#define META_FILE_PARAMETER_PRESENT      0x01
+#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
+#define META_PAGE_83_PRESENT             0x04
+#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
+#define META_PHYS_SECTOR_SIZE_PRESENT    0x10
+#define META_PARENT_LOCATOR_PRESENT      0x20
+
+#define META_ALL_PRESENT    \
+    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
+     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
+     META_PHYS_SECTOR_SIZE_PRESENT)
+
+typedef struct vhdx_metadata_entries {
+    vhdx_metadata_table_entry file_parameters_entry;
+    vhdx_metadata_table_entry virtual_disk_size_entry;
+    vhdx_metadata_table_entry page83_data_entry;
+    vhdx_metadata_table_entry logical_sector_size_entry;
+    vhdx_metadata_table_entry phys_sector_size_entry;
+    vhdx_metadata_table_entry parent_locator_entry;
+    uint16_t present;
+} vhdx_metadata_entries;
+
+
+typedef struct BDRVVHDXState {
+    CoMutex lock;
+
+    int curr_header;
+    vhdx_header *headers[2];
+
+    vhdx_region_table_header rt;
+    vhdx_region_table_entry bat_rt;         /* region table for the BAT */
+    vhdx_region_table_entry metadata_rt;    /* region table for the metadata */
+
+    vhdx_metadata_table_header  metadata_hdr;
+    vhdx_metadata_entries metadata_entries;
+
+    vhdx_file_parameters params;
+    uint32_t block_size;
+    uint32_t block_size_bits;
+    uint32_t sectors_per_block;
+    uint32_t sectors_per_block_bits;
+
+    uint64_t virtual_disk_size;
+    uint32_t logical_sector_size;
+    uint32_t physical_sector_size;
+
+    uint64_t chunk_ratio;
+    uint32_t chunk_ratio_bits;
+    uint32_t logical_sector_size_bits;
+
+    uint32_t bat_entries;
+    vhdx_bat_entry *bat;
+    uint64_t bat_offset;
+
+    vhdx_parent_locator_header parent_header;
+    vhdx_parent_locator_entry *parent_entries;
+
+} BDRVVHDXState;
+
+uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
+                            int crc_offset)
+{
+    uint32_t crc_new;
+    uint32_t crc_orig;
+    assert(buf != NULL);
+
+    if (crc_offset > 0) {
+        memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
+        memset(buf+crc_offset, 0, sizeof(crc_orig));
+    }
+
+    crc_new = crc32c(crc, buf, size);
+    if (crc_offset > 0) {
+        memcpy(buf+crc_offset, &crc_orig, sizeof(crc_orig));
+    }
+
+    return crc_new;
+}
+
+/* Validates the checksum of the buffer, with an in-place CRC.
+ *
+ * Zero is substituted during crc calculation for the original crc field,
+ * and the crc field is restored afterwards.  But the buffer will be modifed
+ * during the calculation, so this may not be not suitable for multi-threaded
+ * use.
+ *
+ * crc_offset: byte offset in buf of the buffer crc
+ * buf: buffer pointer
+ * size: size of buffer (must be > crc_offset+4)
+ *
+ * returns true if checksum is valid, false otherwise
+ */
+bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
+{
+    uint32_t crc_orig;
+    uint32_t crc;
+
+    assert(buf != NULL);
+    assert(size > (crc_offset+4));
+
+    memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
+    crc_orig = le32_to_cpu(crc_orig);
+
+    crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);
+
+    return crc == crc_orig;
+}
+
+
+/*
+ * Per the MS VHDX Specification, for every VHDX file:
+ *      - The header section is fixed size - 1 MB
+ *      - The header section is always the first "object"
+ *      - The first 64KB of the header is the File Identifier
+ *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
+ *      - The following 512 bytes constitute a UTF-16 string identifiying the
+ *        software that created the file, and is optional and diagnostic only.
+ *
+ *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
+ */
+static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) {
+        return 100;
+    }
+    return 0;
+}
+
+/* All VHDX structures on disk are little endian */
+static void vhdx_header_le_import(vhdx_header *h)
+{
+    assert(h != NULL);
+
+    le32_to_cpus(&h->signature);
+    le32_to_cpus(&h->checksum);
+    le64_to_cpus(&h->sequence_number);
+
+    leguid_to_cpus(&h->file_write_guid);
+    leguid_to_cpus(&h->data_write_guid);
+    leguid_to_cpus(&h->log_guid);
+
+    le16_to_cpus(&h->log_version);
+    le16_to_cpus(&h->version);
+    le32_to_cpus(&h->log_length);
+    le64_to_cpus(&h->log_offset);
+}
+
+
+/* opens the specified header block from the VHDX file header section */
+static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    vhdx_header *header1;
+    vhdx_header *header2;
+    uint64_t h1_seq = 0;
+    uint64_t h2_seq = 0;
+    uint8_t *buffer;
+
+    header1 = qemu_blockalign(bs, sizeof(vhdx_header));
+    header2 = qemu_blockalign(bs, sizeof(vhdx_header));
+
+    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
+
+    s->headers[0] = header1;
+    s->headers[1] = header2;
+
+    /* We have to read the whole VHDX_HEADER_SIZE instead of
+     * sizeof(vhdx_header), because the checksum is over the whole
+     * region */
+    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    /* copy over just the relevant portion that we need */
+    memcpy(header1, buffer, sizeof(vhdx_header));
+    vhdx_header_le_import(header1);
+
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
+        header1->signature == VHDX_HDR_MAGIC) {
+        h1_seq = header1->sequence_number;
+    }
+
+    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    /* copy over just the relevant portion that we need */
+    memcpy(header2, buffer, sizeof(vhdx_header));
+    vhdx_header_le_import(header2);
+
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
+        header2->signature == VHDX_HDR_MAGIC) {
+        h2_seq = header2->sequence_number;
+    }
+
+    if (h1_seq > h2_seq) {
+        s->curr_header = 0;
+    } else if (h2_seq > h1_seq) {
+        s->curr_header = 1;
+    } else {
+        printf("NO VALID HEADER\n");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    ret = 0;
+
+    goto exit;
+
+fail:
+    qemu_vfree(header1);
+    qemu_vfree(header2);
+    s->headers[0] = NULL;
+    s->headers[1] = NULL;
+exit:
+    qemu_vfree(buffer);
+    return ret;
+}
+
+
+static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    uint8_t *buffer;
+    int offset = 0;
+    vhdx_region_table_entry rt_entry;
+    int i;
+
+    /* We have to read the whole 64KB block, because the crc32 is over the
+     * whole block */
+    buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);
+
+    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
+                    VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    memcpy(&s->rt, buffer, sizeof(s->rt));
+    le32_to_cpus(&s->rt.signature);
+    le32_to_cpus(&s->rt.checksum);
+    le32_to_cpus(&s->rt.entry_count);
+    le32_to_cpus(&s->rt.reserved);
+    offset += sizeof(s->rt);
+
+    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
+        s->rt.signature != VHDX_RT_MAGIC) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    for (i = 0; i < s->rt.entry_count; i++) {
+        memcpy(&rt_entry, buffer+offset, sizeof(rt_entry));
+        offset += sizeof(rt_entry);
+
+        leguid_to_cpus(&rt_entry.guid);
+        le64_to_cpus(&rt_entry.file_offset);
+        le32_to_cpus(&rt_entry.length);
+        le32_to_cpus(&rt_entry.data_bits);
+
+        /* see if we recognize the entry */
+        if (guid_eq(rt_entry.guid, bat_guid)) {
+            s->bat_rt = rt_entry;
+            continue;
+        }
+
+        if (guid_eq(rt_entry.guid, metadata_guid)) {
+            s->metadata_rt = rt_entry;
+            continue;
+        }
+
+        if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
+            /* cannot read vhdx file - required region table entry that
+             * we do not understand.  per spec, we must fail to open */
+            ret = -ENOTSUP;
+            goto fail;
+        }
+    }
+    ret = 0;
+
+fail:
+    qemu_vfree(buffer);
+    return ret;
+}
+
+
+
+/* Metadata initial parser
+ *
+ * This loads all the metadata entry fields.  This may cause additional
+ * fields to be processed (e.g. parent locator, etc..).
+ *
+ * There are 5 Metadata items that are always required:
+ *      - File Parameters (block size, has a parent)
+ *      - Virtual Disk Size (size, in bytes, of the virtual drive)
+ *      - Page 83 Data (scsi page 83 guid)
+ *      - Logical Sector Size (logical sector size in bytes, either 512 or
+ *                             4096.  We only support 512 currently)
+ *      - Physical Sector Size (512 or 4096)
+ *
+ * Also, if the File Parameters indicate this is a differencing file,
+ * we must also look for the Parent Locator metadata item.
+ */
+static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    uint8_t *buffer;
+    int offset = 0;
+    int i = 0;
+    uint32_t block_size, sectors_per_block, logical_sector_size;
+    uint64_t chunk_ratio;
+    vhdx_metadata_table_entry md_entry;
+
+    buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
+
+    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
+                     VHDX_METADATA_TABLE_MAX_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+    memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
+    offset += sizeof(s->metadata_hdr);
+
+    le64_to_cpus(&s->metadata_hdr.signature);
+    le16_to_cpus(&s->metadata_hdr.reserved);
+    le16_to_cpus(&s->metadata_hdr.entry_count);
+
+    if (s->metadata_hdr.signature != VHDX_METADATA_MAGIC) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    s->metadata_entries.present = 0;
+
+    if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
+        (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
+        memcpy(&md_entry, buffer+offset, sizeof(md_entry));
+        offset += sizeof(md_entry);
+
+        leguid_to_cpus(&md_entry.item_id);
+        le32_to_cpus(&md_entry.offset);
+        le32_to_cpus(&md_entry.length);
+        le32_to_cpus(&md_entry.data_bits);
+        le32_to_cpus(&md_entry.reserved2);
+
+        if (guid_eq(md_entry.item_id, file_param_guid)) {
+            s->metadata_entries.file_parameters_entry = md_entry;
+            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, virtual_size_guid)) {
+            s->metadata_entries.virtual_disk_size_entry = md_entry;
+            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, page83_guid)) {
+            s->metadata_entries.page83_data_entry = md_entry;
+            s->metadata_entries.present |= META_PAGE_83_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, logical_sector_guid)) {
+            s->metadata_entries.logical_sector_size_entry = md_entry;
+            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, phys_sector_guid)) {
+            s->metadata_entries.phys_sector_size_entry = md_entry;
+            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, parent_locator_guid)) {
+            s->metadata_entries.parent_locator_entry = md_entry;
+            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
+            continue;
+        }
+
+        if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
+            /* cannot read vhdx file - required region table entry that
+             * we do not understand.  per spec, we must fail to open */
+            ret = -ENOTSUP;
+            goto exit;
+        }
+    }
+
+    if (s->metadata_entries.present != META_ALL_PRESENT) {
+        ret = -ENOTSUP;
+        goto exit;
+    }
+
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.file_parameters_entry.offset
+                                         + s->metadata_rt.file_offset,
+                     &s->params,
+                     sizeof(s->params));
+
+    if (ret < 0) {
+        goto exit;
+    }
+
+    le32_to_cpus(&s->params.block_size);
+    le32_to_cpus(&s->params.data_bits);
+
+
+    /* We now have the file parameters, so we can tell if this is a
+     * differencing file (i.e.. has_parent), is dynamic or fixed
+     * sized (leave_blocks_allocated), and the block size */
+
+    /* The parent locator required iff the file parameters has_parent set */
+    if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
+        if (s->metadata_entries.present & ~META_PARENT_LOCATOR_PRESENT) {
+            /* TODO: parse  parent locator fields */
+            ret = -ENOTSUP; /* temp, until differencing files are supported */
+            goto exit;
+        } else {
+            /* if has_parent is set, but there is not parent locator present,
+             * then that is an invalid combination */
+            ret = -EINVAL;
+            goto exit;
+        }
+    }
+
+    /* determine virtual disk size, logical sector size,
+     * and phys sector size */
+
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.virtual_disk_size_entry.offset
+                                           + s->metadata_rt.file_offset,
+                     &s->virtual_disk_size,
+                     sizeof(uint64_t));
+    if (ret < 0) {
+        goto exit;
+    }
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.logical_sector_size_entry.offset
+                                             + s->metadata_rt.file_offset,
+                     &s->logical_sector_size,
+                     sizeof(uint32_t));
+    if (ret < 0) {
+        goto exit;
+    }
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.phys_sector_size_entry.offset
+                                          + s->metadata_rt.file_offset,
+                     &s->physical_sector_size,
+                     sizeof(uint32_t));
+    if (ret < 0) {
+        goto exit;
+    }
+
+    le64_to_cpus(&s->virtual_disk_size);
+    le32_to_cpus(&s->logical_sector_size);
+    le32_to_cpus(&s->physical_sector_size);
+
+    if (s->logical_sector_size == 0 || s->params.block_size == 0) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    /* both block_size and sector_size are guaranteed powers of 2 */
+    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
+    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
+                     (uint64_t)s->logical_sector_size /
+                     (uint64_t)s->params.block_size;
+
+    /* These values are ones we will want to use for division / multiplication
+     * later on, and they are all guaranteed (per the spec) to be powers of 2,
+     * so we can take advantage of that for shift operations during
+     * reads/writes */
+    logical_sector_size = s->logical_sector_size;
+    if (logical_sector_size & (logical_sector_size - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    sectors_per_block = s->sectors_per_block;
+    if (sectors_per_block & (sectors_per_block - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+    chunk_ratio = s->chunk_ratio;
+    if (chunk_ratio & (chunk_ratio - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+    block_size = s->params.block_size;
+    if (block_size & (block_size - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    while (logical_sector_size >>= 1) {
+        s->logical_sector_size_bits++;
+    }
+    while (sectors_per_block >>= 1) {
+        s->sectors_per_block_bits++;
+    }
+    while (chunk_ratio >>= 1) {
+        s->chunk_ratio_bits++;
+    }
+    while (block_size >>= 1) {
+        s->block_size_bits++;
+    }
+
+    if (s->logical_sector_size != BDRV_SECTOR_SIZE) {
+        printf("VHDX error - QEMU only supports 512 byte sector sizes\n");
+        ret = -ENOTSUP;
+        goto exit;
+    }
+
+    ret = 0;
+
+exit:
+    qemu_vfree(buffer);
+    return ret;
+}
+
+/* Parse the replay log.  Per the VHDX spec, if the log is present
+ * it must be replayed prior to opening the file, even read-only.
+ *
+ * If read-only, we must replay the log in RAM (or refuse to open
+ * a dirty VHDX file read-only */
+static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    int i;
+    vhdx_header *hdr;
+
+    hdr = s->headers[s->curr_header];
+
+    /* either either the log guid, or log length is zero,
+     * then a replay log is present */
+    for (i = 0; i < sizeof(hdr->log_guid.data4); i++) {
+        ret |= hdr->log_guid.data4[i];
+    }
+    if (hdr->log_guid.data1 == 0 &&
+        hdr->log_guid.data2 == 0 &&
+        hdr->log_guid.data3 == 0 &&
+        ret == 0) {
+        goto exit;
+    }
+
+    /* per spec, only log version of 0 is supported */
+    if (hdr->log_version != 0) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    if (hdr->log_length == 0) {
+        goto exit;
+    }
+
+    /* We currently do not support images with logs to replay */
+    ret = -ENOTSUP;
+
+exit:
+    return ret;
+}
+
+
+static int vhdx_open(BlockDriverState *bs, QDict *options, int flags)
+{
+    BDRVVHDXState *s = bs->opaque;
+    int ret = 0;
+    int i;
+
+    s->bat = NULL;
+
+    qemu_co_mutex_init(&s->lock);
+
+    ret = vhdx_parse_header(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_parse_log(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_open_region_tables(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_parse_metadata(bs, s);
+    if (ret) {
+        goto fail;
+    }
+    s->block_size = s->params.block_size;
+
+    /* the VHDX spec dictates that virtual_disk_size is always a multiple of
+     * logical_sector_size */
+    bs->total_sectors = s->virtual_disk_size / s->logical_sector_size;
+
+    s->bat_offset = s->bat_rt.file_offset;
+    s->bat_entries = s->bat_rt.length / sizeof(vhdx_bat_entry);
+    s->bat = qemu_blockalign(bs, s->bat_rt.length);
+
+    ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
+
+    for (i = 0; i < s->bat_entries; i++) {
+        le64_to_cpus(&s->bat[i]);
+    }
+
+    if (flags & BDRV_O_RDWR) {
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    /* TODO: differencing files, read, write */
+
+    return 0;
+fail:
+    qemu_vfree(s->bat);
+    return ret;
+}
+
+static int vhdx_reopen_prepare(BDRVReopenState *state,
+                               BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
+
+static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *qiov)
+{
+    return -ENOTSUP;
+}
+
+
+
+static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *qiov)
+{
+    return -ENOTSUP;
+}
+
+
+static void vhdx_close(BlockDriverState *bs)
+{
+    BDRVVHDXState *s = bs->opaque;
+
+    qemu_vfree(s->headers[0]);
+    qemu_vfree(s->headers[1]);
+    qemu_vfree(s->bat);
+    qemu_vfree(s->parent_entries);
+}
+
+static BlockDriver bdrv_vhdx = {
+    .format_name    = "vhdx",
+    .instance_size  = sizeof(BDRVVHDXState),
+
+    .bdrv_probe             = vhdx_probe,
+    .bdrv_open              = vhdx_open,
+    .bdrv_close             = vhdx_close,
+    .bdrv_reopen_prepare    = vhdx_reopen_prepare,
+    .bdrv_co_readv          = vhdx_co_readv,
+    .bdrv_co_writev         = vhdx_co_writev,
+};
+
+static void bdrv_vhdx_init(void)
+{
+    bdrv_register(&bdrv_vhdx);
+}
+
+block_init(bdrv_vhdx_init);
diff --git a/block/vhdx.h b/block/vhdx.h
index f5cf1ed..d71efac 100644
--- a/block/vhdx.h
+++ b/block/vhdx.h
@@ -324,4 +324,29 @@  typedef struct QEMU_PACKED vhdx_parent_locator_entry {
 
 /* ----- END VHDX SPECIFICATION STRUCTURES ---- */
 
+
+uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
+                            int crc_offset);
+
+bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset);
+
+
+static void leguid_to_cpus(ms_guid *guid)
+{
+    le32_to_cpus(&guid->data1);
+    le16_to_cpus(&guid->data2);
+    le16_to_cpus(&guid->data3);
+}
+
+static void cpu_to_leguids(ms_guid *guid)
+{
+    cpu_to_le32s(&guid->data1);
+    cpu_to_le16s(&guid->data2);
+    cpu_to_le16s(&guid->data3);
+}
+
+
+
+
+
 #endif