Patchwork [RFC,3/4] block: VHDX block driver support

login
register
mail settings
Submitter Jeff Cody
Date Feb. 18, 2013, 11:03 p.m.
Message ID <df174c440176e1498c82e61723ac1616f55e0d9f.1361228069.git.jcody@redhat.com>
Download mbox | patch
Permalink /patch/221550/
State New
Headers show

Comments

Jeff Cody - Feb. 18, 2013, 11:03 p.m.
This is preliminary and beta support for the VHDX image format,
as specified in:
   "VHDX Format Specification v0.95", published 4/12/2012
    https://www.microsoft.com/en-us/download/details.aspx?id=29681

This RFC patch has the following limitations
    - read-only (no write support yet)
    - does not support differencing files
    - does not make use of iovec or coroutines
    - may likely be broken in novel and interesting ways

I've doen read testing from a dynamic, 127GB VHDX image created
using Hyper-V.  The image was partitioned with a F18 install,
and the partitions could be mounted and files read.

As this is still under active development, and an RFC, you can
assume that any debug print statements will be removed prior
to formal patch submission.

Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/vhdx.c | 814 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 814 insertions(+)
 create mode 100644 block/vhdx.c
Stefan Hajnoczi - Feb. 19, 2013, 9:20 a.m.
On Mon, Feb 18, 2013 at 06:03:31PM -0500, Jeff Cody wrote:
> This is preliminary and beta support for the VHDX image format,
> as specified in:
>    "VHDX Format Specification v0.95", published 4/12/2012
>     https://www.microsoft.com/en-us/download/details.aspx?id=29681
> 
> This RFC patch has the following limitations
>     - read-only (no write support yet)
>     - does not support differencing files
>     - does not make use of iovec or coroutines
>     - may likely be broken in novel and interesting ways
> 
> I've doen read testing from a dynamic, 127GB VHDX image created
> using Hyper-V.  The image was partitioned with a F18 install,
> and the partitions could be mounted and files read.
> 
> As this is still under active development, and an RFC, you can
> assume that any debug print statements will be removed prior
> to formal patch submission.
> 
> Signed-off-by: Jeff Cody <jcody@redhat.com>
> ---
>  block/vhdx.c | 814 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 814 insertions(+)
>  create mode 100644 block/vhdx.c
> 
> diff --git a/block/vhdx.c b/block/vhdx.c
> new file mode 100644
> index 0000000..96f8fc7
> --- /dev/null
> +++ b/block/vhdx.c
> @@ -0,0 +1,814 @@
> +/*
> + * Block driver for Hyper-V VHDX Images
> + *
> + * Copyright (c) 2013 Red Hat, Inc.,
> + *
> + * Authors:
> + *  Jeff Cody <jcody@redhat.com>
> + *
> + *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
> + *  by Microsoft:
> + *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License as published by the Free
> + * Software Foundation; either version 2 of the License, or (at your option)
> + * any later version.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "block/block_int.h"
> +#include "qemu/module.h"
> +#include "migration/migration.h"
> +#if defined(CONFIG_UUID)
> +#include <uuid/uuid.h>
> +#endif

uuid is not used in this patch?

> +#include "qemu/crc32c.h"
> +#include "block/vhdx.h"
> +
> +#define vhdx_nop(x) do { (void)(x); } while (0)
> +
> +/* Help macros to copy data from file buffers to header
> + * structures, with proper endianness.  These help avoid
> + * using packed structs */

Why not use packed structs?  Then you don't need memcpy/endianness
macros.  Abusing *_to_cpu() to go both "to" and "from" CPU endianness
is ugly.

> +/* Do not use directly, see macros below */
> +#define _hdr_copy(item, buf, size, offset, to_cpu) do { \
> +    memcpy((item), (buf)+(offset), (size));        \
> +    to_cpu((item));                                \
> +    (offset) += (size); } while (0)
> +
> +/* for all of these, buf should be a uint8_t buffer */
> +
> +/* copy 16-bit header field */
> +#define hdr_copy16(item, buf, offset) do { \
> +    _hdr_copy((item), (buf), 2, (offset), (le16_to_cpus)); } while (0)
> +
> +/* copy 32-bit header field */
> +#define hdr_copy32(item, buf, offset) do { \
> +    _hdr_copy((item), (buf), 4, (offset), (le32_to_cpus)); } while (0)
> +
> +/* copy 64-bit header field */
> +#define hdr_copy64(item, buf, offset) do { \
> +    _hdr_copy((item), (buf), 8, (offset), (le64_to_cpus)); } while (0)
> +
> +/* copy variable-length header field, no endian swapping */
> +#define hdr_copy(item, buf, size, offset) do { \
> +    _hdr_copy((item), (buf), (size), (offset), vhdx_nop); } while (0)
> +
> +/* copies a defined msguid field, with correct endianness
> + * a msguid entry has 3 data types with endianness sensitivity,
> + * followed by a byte array */
> +#define hdr_copy_guid(item, buf, offset) do {        \
> +        hdr_copy32(&(item).data1, (buf), (offset));  \
> +        hdr_copy16(&(item).data2, (buf), (offset));  \
> +        hdr_copy16(&(item).data3, (buf), (offset));  \
> +        hdr_copy(&(item).data4, (buf), sizeof((item).data4), (offset)); \
> +        } while (0)
> +
> +
> +/* Several metadata and region table data entries are identified by
> + * guids in  a MS-specific GUID format. */
> +
> +
> +/* ------- Known Region Table GUIDs ---------------------- */
> +static const ms_guid bat_guid =      { .data1 = 0x2dc27766,
> +                                       .data2 = 0xf623,
> +                                       .data3 = 0x4200,
> +                                       .data4 = { 0x9d, 0x64, 0x11, 0x5e,
> +                                                  0x9b, 0xfd, 0x4a, 0x08} };
> +
> +static const ms_guid metadata_guid = { .data1 = 0x8b7ca206,
> +                                       .data2 = 0x4790,
> +                                       .data3 = 0x4b9a,
> +                                       .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
> +                                                  0x05, 0x0f, 0x88, 0x6e} };
> +
> +
> +
> +/* ------- Known Metadata Entry GUIDs ---------------------- */
> +static const ms_guid file_param_guid =   { .data1 = 0xcaa16737,
> +                                           .data2 = 0xfa36,
> +                                           .data3 = 0x4d43,
> +                                           .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
> +                                                      0xaa, 0x44, 0xe7, 0x6b} };
> +
> +static const ms_guid virtual_size_guid = { .data1 = 0x2FA54224,
> +                                           .data2 = 0xcd1b,
> +                                           .data3 = 0x4876,
> +                                           .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
> +                                                      0xd8, 0x3b, 0xf4, 0xb8} };
> +
> +static const ms_guid page83_guid =       { .data1 = 0xbeca12ab,
> +                                           .data2 = 0xb2e6,
> +                                           .data3 = 0x4523,
> +                                           .data4 = { 0x93, 0xef, 0xc3, 0x09,
> +                                                      0xe0, 0x00, 0xc7, 0x46} };
> +
> +static const ms_guid logical_sector_guid = {.data1 = 0x8141bf1d,
> +                                            .data2 = 0xa96f,
> +                                            .data3 = 0x4709,
> +                                           .data4 = { 0xba, 0x47, 0xf2, 0x33,
> +                                                      0xa8, 0xfa, 0xab, 0x5f} };
> +
> +static const ms_guid phys_sector_guid =  { .data1 = 0xcda348c7,
> +                                           .data2 = 0x445d,
> +                                           .data3 = 0x4471,
> +                                           .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
> +                                                      0x52, 0x51, 0xc5, 0x56} };
> +
> +static const ms_guid parent_locator_guid = {.data1 = 0xa8d35f2d,
> +                                            .data2 = 0xb30b,
> +                                            .data3 = 0x454d,
> +                                           .data4 = { 0xab, 0xf7, 0xd3, 0xd8,
> +                                                      0x48, 0x34, 0xab, 0x0c} };
> +
> +
> +
> +#define META_FILE_PARAMETER_PRESENT      0x01
> +#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
> +#define META_PAGE_83_PRESENT             0x04
> +#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
> +#define META_PHYS_SECTOR_SIZE_PRESENT    0x10
> +#define META_PARENT_LOCATOR_PRESENT      0x20
> +
> +#define META_ALL_PRESENT    \
> +    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
> +     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
> +     META_PHYS_SECTOR_SIZE_PRESENT)
> +
> +typedef struct vhdx_metadata_entries {
> +    vhdx_metadata_table_entry file_parameters_entry;
> +    vhdx_metadata_table_entry virtual_disk_size_entry;
> +    vhdx_metadata_table_entry page83_data_entry;
> +    vhdx_metadata_table_entry logical_sector_size_entry;
> +    vhdx_metadata_table_entry phys_sector_size_entry;
> +    vhdx_metadata_table_entry parent_locator_entry;
> +    uint16_t present;
> +} vhdx_metadata_entries;
> +
> +
> +typedef struct BDRVVHDXState {
> +    CoMutex lock;
> +
> +    int curr_header;
> +    vhdx_header * headers[2];
> +
> +    vhdx_region_table_header rt;
> +    vhdx_region_table_entry bat_rt;         /* region table for the BAT */
> +    vhdx_region_table_entry metadata_rt;    /* region table for the metadata */
> +    vhdx_region_table_entry *unknown_rt;
> +    unsigned int unknown_rt_size;
> +
> +    vhdx_metadata_table_header  metadata_hdr;
> +    vhdx_metadata_entries metadata_entries;
> +
> +    vhdx_file_parameters params;
> +    uint32_t block_size_bits;
> +    uint32_t sectors_per_block;
> +    uint32_t sectors_per_block_bits;
> +
> +    uint64_t virtual_disk_size;
> +    uint32_t logical_sector_size;
> +    uint32_t physical_sector_size;
> +
> +    uint64_t chunk_ratio;
> +    uint32_t chunk_ratio_bits;
> +    uint32_t logical_sector_size_bits;
> +
> +    uint32_t bat_entries;
> +    vhdx_bat_entry *bat;
> +
> +    /* TODO */
> +
> +} BDRVVHDXState;
> +
> +/* CRC-32C, Castagnoli polynomial, code 0x11EDC6F41 */
> +static uint32_t vhdx_checksum(uint8_t *buf, size_t size)
> +{
> +    uint32_t chksum;
> +    chksum =  crc32c(0xffffffff, buf, size);
> +
> +    return chksum;
> +}
> +
> +/* validates the checksum of a region of table entries, substituting zero
> + * in for the in-place checksum field of the region.
> + * buf: buffer to compute crc32c over,
> + * size: size of the buffer to checksum
> + * crc_offset: offset into buf that contains the existing 4-byte checksum
> + */
> +static int vhdx_validate_checksum(uint8_t *buf, size_t size, int crc_offset)
> +{
> +    uint32_t crc_orig;
> +    uint32_t crc;
> +
> +    assert(buf != NULL);
> +    assert(size > (crc_offset+4));
> +
> +    memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
> +    memset(buf+crc_offset, 0, sizeof(crc_orig));
> +
> +    crc = vhdx_checksum(buf, size);
> +
> +    memcpy(buf+crc_offset, &crc_orig, sizeof(crc_orig));
> +
> +    crc_orig = le32_to_cpu(crc_orig);
> +    return crc == crc_orig ? 0 : 1;

This function should return bool.  Then you can also drop the tristate
operator (although callers need to invert their logic so true ==
success and false == failure).

> +}
> +
> +/*
> + * Per the MS VHDX Specification, for every VHDX file:
> + *      - The header section is fixed size - 1 MB
> + *      - The header section is always the first "object"
> + *      - The first 64KB of the header is the File Identifier
> + *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
> + *      - The following 512 bytes constitute a UTF-16 string identifiying the
> + *        software that created the file, and is optional and diagnostic only.
> + *
> + *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
> + */
> +static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
> +{
> +    if (buf_size >= 8 && !strncmp((char *)buf, "vhdxfile", 8)) {

Dropping const here to fit within 80-char line limit?  Declare a local
variable instead to keep const.

> +        return 100;
> +    }
> +    return 0;
> +}
> +
> +
> +static void vhdx_fill_header(vhdx_header *h, uint8_t *buffer)
> +{
> +    int offset = 0;
> +    assert(h != NULL);
> +    assert(buffer != NULL);
> +
> +    /* use memcpy to avoid unaligned data read */
> +    hdr_copy32(&h->signature,       buffer, offset);
> +    hdr_copy32(&h->checksum,        buffer, offset);
> +    hdr_copy64(&h->sequence_number, buffer, offset);
> +
> +    hdr_copy_guid(h->file_write_guid, buffer, offset);
> +    hdr_copy_guid(h->data_write_guid, buffer, offset);
> +    hdr_copy_guid(h->log_guid, buffer, offset);
> +
> +    hdr_copy16(&h->log_version,     buffer,  offset);
> +    hdr_copy16(&h->version,         buffer,  offset);
> +    hdr_copy32(&h->log_length,      buffer,  offset);
> +    hdr_copy64(&h->log_offset,      buffer,  offset);
> +}
> +
> +
> +/* opens the specified header block from the VHDX file header section */
> +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    vhdx_header *header1;
> +    vhdx_header *header2;
> +    uint64_t h1_seq = 0;
> +    uint64_t h2_seq = 0;
> +    uint8_t *buffer;
> +
> +    header1 = g_malloc(sizeof(vhdx_header));
> +    header2 = g_malloc(sizeof(vhdx_header));
> +
> +    buffer = g_malloc(VHDX_HEADER_SIZE);
> +
> +    s->headers[0] = header1;
> +    s->headers[1] = header2;
> +
> +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    vhdx_fill_header(header1, buffer);
> +
> +    if (vhdx_validate_checksum(buffer, VHDX_HEADER_SIZE, 4) == 0 &&
> +        header1->signature == VHDX_HDR_MAGIC) {
> +        printf("header1 is valid!\n");
> +        h1_seq = header1->sequence_number;
> +    }
> +
> +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    vhdx_fill_header(header2, buffer);
> +
> +    if (vhdx_validate_checksum(buffer, VHDX_HEADER_SIZE, 4) == 0 &&
> +        header2->signature == VHDX_HDR_MAGIC) {
> +        printf("header2 is valid!\n");
> +        h2_seq = header2->sequence_number;
> +    }
> +
> +    if (h1_seq > h2_seq) {
> +        s->curr_header = 0;
> +    } else if (h2_seq > h1_seq) {
> +        s->curr_header = 1;
> +    } else {
> +        printf("NO VALID HEADER\n");
> +        ret = -1;

Missing goto fail?
Jeff Cody - Feb. 19, 2013, 1:26 p.m.
On Tue, Feb 19, 2013 at 10:20:40AM +0100, Stefan Hajnoczi wrote:
> On Mon, Feb 18, 2013 at 06:03:31PM -0500, Jeff Cody wrote:
> > This is preliminary and beta support for the VHDX image format,
> > as specified in:
> >    "VHDX Format Specification v0.95", published 4/12/2012
> >     https://www.microsoft.com/en-us/download/details.aspx?id=29681
> > 
> > This RFC patch has the following limitations
> >     - read-only (no write support yet)
> >     - does not support differencing files
> >     - does not make use of iovec or coroutines
> >     - may likely be broken in novel and interesting ways
> > 
> > I've doen read testing from a dynamic, 127GB VHDX image created
> > using Hyper-V.  The image was partitioned with a F18 install,
> > and the partitions could be mounted and files read.
> > 
> > As this is still under active development, and an RFC, you can
> > assume that any debug print statements will be removed prior
> > to formal patch submission.
> > 
> > Signed-off-by: Jeff Cody <jcody@redhat.com>
> > ---
> >  block/vhdx.c | 814 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 814 insertions(+)
> >  create mode 100644 block/vhdx.c
> > 
> > diff --git a/block/vhdx.c b/block/vhdx.c
> > new file mode 100644
> > index 0000000..96f8fc7
> > --- /dev/null
> > +++ b/block/vhdx.c
> > @@ -0,0 +1,814 @@
> > +/*
> > + * Block driver for Hyper-V VHDX Images
> > + *
> > + * Copyright (c) 2013 Red Hat, Inc.,
> > + *
> > + * Authors:
> > + *  Jeff Cody <jcody@redhat.com>
> > + *
> > + *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
> > + *  by Microsoft:
> > + *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
> > + *
> > + * This program is free software; you can redistribute it and/or modify it
> > + * under the terms of the GNU General Public License as published by the Free
> > + * Software Foundation; either version 2 of the License, or (at your option)
> > + * any later version.
> > + *
> > + */
> > +
> > +#include "qemu-common.h"
> > +#include "block/block_int.h"
> > +#include "qemu/module.h"
> > +#include "migration/migration.h"
> > +#if defined(CONFIG_UUID)
> > +#include <uuid/uuid.h>
> > +#endif
> 
> uuid is not used in this patch?
>

It is not indeed, these are all MS-specific UIDs.  Deleted!


> > +#include "qemu/crc32c.h"
> > +#include "block/vhdx.h"
> > +
> > +#define vhdx_nop(x) do { (void)(x); } while (0)
> > +
> > +/* Help macros to copy data from file buffers to header
> > + * structures, with proper endianness.  These help avoid
> > + * using packed structs */
> 
> Why not use packed structs?  Then you don't need memcpy/endianness
> macros.  Abusing *_to_cpu() to go both "to" and "from" CPU endianness
> is ugly.

Packed structs are (IMO) worse, and I think should be avoided wherever
possible.  Depending on the platform and structure, it can lead to
unaligned data access, which is either just slower (x86) or worse
(e.g. some arm variants).  Maybe these are all defined well in the
spec, so as to be mostly alignment proof from most architectures, but
I figured it just as well to avoid their usage since we have to touch
each field for alignment (well, I guess only on BE architectures in
the end).

These macros are just copying the headers from disk, so they are not
going "from" CPU endianness, just "to".

> 
> > +/* Do not use directly, see macros below */
> > +#define _hdr_copy(item, buf, size, offset, to_cpu) do { \
> > +    memcpy((item), (buf)+(offset), (size));        \
> > +    to_cpu((item));                                \
> > +    (offset) += (size); } while (0)
> > +
> > +/* for all of these, buf should be a uint8_t buffer */
> > +
> > +/* copy 16-bit header field */
> > +#define hdr_copy16(item, buf, offset) do { \
> > +    _hdr_copy((item), (buf), 2, (offset), (le16_to_cpus)); } while (0)
> > +
> > +/* copy 32-bit header field */
> > +#define hdr_copy32(item, buf, offset) do { \
> > +    _hdr_copy((item), (buf), 4, (offset), (le32_to_cpus)); } while (0)
> > +
> > +/* copy 64-bit header field */
> > +#define hdr_copy64(item, buf, offset) do { \
> > +    _hdr_copy((item), (buf), 8, (offset), (le64_to_cpus)); } while (0)
> > +
> > +/* copy variable-length header field, no endian swapping */
> > +#define hdr_copy(item, buf, size, offset) do { \
> > +    _hdr_copy((item), (buf), (size), (offset), vhdx_nop); } while (0)
> > +
> > +/* copies a defined msguid field, with correct endianness
> > + * a msguid entry has 3 data types with endianness sensitivity,
> > + * followed by a byte array */
> > +#define hdr_copy_guid(item, buf, offset) do {        \
> > +        hdr_copy32(&(item).data1, (buf), (offset));  \
> > +        hdr_copy16(&(item).data2, (buf), (offset));  \
> > +        hdr_copy16(&(item).data3, (buf), (offset));  \
> > +        hdr_copy(&(item).data4, (buf), sizeof((item).data4), (offset)); \
> > +        } while (0)
> > +
> > +
> > +/* Several metadata and region table data entries are identified by
> > + * guids in  a MS-specific GUID format. */
> > +
> > +
> > +/* ------- Known Region Table GUIDs ---------------------- */
> > +static const ms_guid bat_guid =      { .data1 = 0x2dc27766,
> > +                                       .data2 = 0xf623,
> > +                                       .data3 = 0x4200,
> > +                                       .data4 = { 0x9d, 0x64, 0x11, 0x5e,
> > +                                                  0x9b, 0xfd, 0x4a, 0x08} };
> > +
> > +static const ms_guid metadata_guid = { .data1 = 0x8b7ca206,
> > +                                       .data2 = 0x4790,
> > +                                       .data3 = 0x4b9a,
> > +                                       .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
> > +                                                  0x05, 0x0f, 0x88, 0x6e} };
> > +
> > +
> > +
> > +/* ------- Known Metadata Entry GUIDs ---------------------- */
> > +static const ms_guid file_param_guid =   { .data1 = 0xcaa16737,
> > +                                           .data2 = 0xfa36,
> > +                                           .data3 = 0x4d43,
> > +                                           .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
> > +                                                      0xaa, 0x44, 0xe7, 0x6b} };
> > +
> > +static const ms_guid virtual_size_guid = { .data1 = 0x2FA54224,
> > +                                           .data2 = 0xcd1b,
> > +                                           .data3 = 0x4876,
> > +                                           .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
> > +                                                      0xd8, 0x3b, 0xf4, 0xb8} };
> > +
> > +static const ms_guid page83_guid =       { .data1 = 0xbeca12ab,
> > +                                           .data2 = 0xb2e6,
> > +                                           .data3 = 0x4523,
> > +                                           .data4 = { 0x93, 0xef, 0xc3, 0x09,
> > +                                                      0xe0, 0x00, 0xc7, 0x46} };
> > +
> > +static const ms_guid logical_sector_guid = {.data1 = 0x8141bf1d,
> > +                                            .data2 = 0xa96f,
> > +                                            .data3 = 0x4709,
> > +                                           .data4 = { 0xba, 0x47, 0xf2, 0x33,
> > +                                                      0xa8, 0xfa, 0xab, 0x5f} };
> > +
> > +static const ms_guid phys_sector_guid =  { .data1 = 0xcda348c7,
> > +                                           .data2 = 0x445d,
> > +                                           .data3 = 0x4471,
> > +                                           .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
> > +                                                      0x52, 0x51, 0xc5, 0x56} };
> > +
> > +static const ms_guid parent_locator_guid = {.data1 = 0xa8d35f2d,
> > +                                            .data2 = 0xb30b,
> > +                                            .data3 = 0x454d,
> > +                                           .data4 = { 0xab, 0xf7, 0xd3, 0xd8,
> > +                                                      0x48, 0x34, 0xab, 0x0c} };
> > +
> > +
> > +
> > +#define META_FILE_PARAMETER_PRESENT      0x01
> > +#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
> > +#define META_PAGE_83_PRESENT             0x04
> > +#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
> > +#define META_PHYS_SECTOR_SIZE_PRESENT    0x10
> > +#define META_PARENT_LOCATOR_PRESENT      0x20
> > +
> > +#define META_ALL_PRESENT    \
> > +    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
> > +     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
> > +     META_PHYS_SECTOR_SIZE_PRESENT)
> > +
> > +typedef struct vhdx_metadata_entries {
> > +    vhdx_metadata_table_entry file_parameters_entry;
> > +    vhdx_metadata_table_entry virtual_disk_size_entry;
> > +    vhdx_metadata_table_entry page83_data_entry;
> > +    vhdx_metadata_table_entry logical_sector_size_entry;
> > +    vhdx_metadata_table_entry phys_sector_size_entry;
> > +    vhdx_metadata_table_entry parent_locator_entry;
> > +    uint16_t present;
> > +} vhdx_metadata_entries;
> > +
> > +
> > +typedef struct BDRVVHDXState {
> > +    CoMutex lock;
> > +
> > +    int curr_header;
> > +    vhdx_header * headers[2];
> > +
> > +    vhdx_region_table_header rt;
> > +    vhdx_region_table_entry bat_rt;         /* region table for the BAT */
> > +    vhdx_region_table_entry metadata_rt;    /* region table for the metadata */
> > +    vhdx_region_table_entry *unknown_rt;
> > +    unsigned int unknown_rt_size;
> > +
> > +    vhdx_metadata_table_header  metadata_hdr;
> > +    vhdx_metadata_entries metadata_entries;
> > +
> > +    vhdx_file_parameters params;
> > +    uint32_t block_size_bits;
> > +    uint32_t sectors_per_block;
> > +    uint32_t sectors_per_block_bits;
> > +
> > +    uint64_t virtual_disk_size;
> > +    uint32_t logical_sector_size;
> > +    uint32_t physical_sector_size;
> > +
> > +    uint64_t chunk_ratio;
> > +    uint32_t chunk_ratio_bits;
> > +    uint32_t logical_sector_size_bits;
> > +
> > +    uint32_t bat_entries;
> > +    vhdx_bat_entry *bat;
> > +
> > +    /* TODO */
> > +
> > +} BDRVVHDXState;
> > +
> > +/* CRC-32C, Castagnoli polynomial, code 0x11EDC6F41 */
> > +static uint32_t vhdx_checksum(uint8_t *buf, size_t size)
> > +{
> > +    uint32_t chksum;
> > +    chksum =  crc32c(0xffffffff, buf, size);
> > +
> > +    return chksum;
> > +}
> > +
> > +/* validates the checksum of a region of table entries, substituting zero
> > + * in for the in-place checksum field of the region.
> > + * buf: buffer to compute crc32c over,
> > + * size: size of the buffer to checksum
> > + * crc_offset: offset into buf that contains the existing 4-byte checksum
> > + */
> > +static int vhdx_validate_checksum(uint8_t *buf, size_t size, int crc_offset)
> > +{
> > +    uint32_t crc_orig;
> > +    uint32_t crc;
> > +
> > +    assert(buf != NULL);
> > +    assert(size > (crc_offset+4));
> > +
> > +    memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
> > +    memset(buf+crc_offset, 0, sizeof(crc_orig));
> > +
> > +    crc = vhdx_checksum(buf, size);
> > +
> > +    memcpy(buf+crc_offset, &crc_orig, sizeof(crc_orig));
> > +
> > +    crc_orig = le32_to_cpu(crc_orig);
> > +    return crc == crc_orig ? 0 : 1;
> 
> This function should return bool.  Then you can also drop the tristate
> operator (although callers need to invert their logic so true ==
> success and false == failure).

OK.  I could just invert the logic here (i.e. return crc != crc_orig),
if we want to keep success as 0/false and failure as true/!0.
> 
> > +}
> > +
> > +/*
> > + * Per the MS VHDX Specification, for every VHDX file:
> > + *      - The header section is fixed size - 1 MB
> > + *      - The header section is always the first "object"
> > + *      - The first 64KB of the header is the File Identifier
> > + *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
> > + *      - The following 512 bytes constitute a UTF-16 string identifiying the
> > + *        software that created the file, and is optional and diagnostic only.
> > + *
> > + *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
> > + */
> > +static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
> > +{
> > +    if (buf_size >= 8 && !strncmp((char *)buf, "vhdxfile", 8)) {
> 
> Dropping const here to fit within 80-char line limit?  Declare a local
> variable instead to keep const.

OK.

> 
> > +        return 100;
> > +    }
> > +    return 0;
> > +}
> > +
> > +
> > +static void vhdx_fill_header(vhdx_header *h, uint8_t *buffer)
> > +{
> > +    int offset = 0;
> > +    assert(h != NULL);
> > +    assert(buffer != NULL);
> > +
> > +    /* use memcpy to avoid unaligned data read */
> > +    hdr_copy32(&h->signature,       buffer, offset);
> > +    hdr_copy32(&h->checksum,        buffer, offset);
> > +    hdr_copy64(&h->sequence_number, buffer, offset);
> > +
> > +    hdr_copy_guid(h->file_write_guid, buffer, offset);
> > +    hdr_copy_guid(h->data_write_guid, buffer, offset);
> > +    hdr_copy_guid(h->log_guid, buffer, offset);
> > +
> > +    hdr_copy16(&h->log_version,     buffer,  offset);
> > +    hdr_copy16(&h->version,         buffer,  offset);
> > +    hdr_copy32(&h->log_length,      buffer,  offset);
> > +    hdr_copy64(&h->log_offset,      buffer,  offset);
> > +}
> > +
> > +
> > +/* opens the specified header block from the VHDX file header section */
> > +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    vhdx_header *header1;
> > +    vhdx_header *header2;
> > +    uint64_t h1_seq = 0;
> > +    uint64_t h2_seq = 0;
> > +    uint8_t *buffer;
> > +
> > +    header1 = g_malloc(sizeof(vhdx_header));
> > +    header2 = g_malloc(sizeof(vhdx_header));
> > +
> > +    buffer = g_malloc(VHDX_HEADER_SIZE);
> > +
> > +    s->headers[0] = header1;
> > +    s->headers[1] = header2;
> > +
> > +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    vhdx_fill_header(header1, buffer);
> > +
> > +    if (vhdx_validate_checksum(buffer, VHDX_HEADER_SIZE, 4) == 0 &&
> > +        header1->signature == VHDX_HDR_MAGIC) {
> > +        printf("header1 is valid!\n");
> > +        h1_seq = header1->sequence_number;
> > +    }
> > +
> > +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    vhdx_fill_header(header2, buffer);
> > +
> > +    if (vhdx_validate_checksum(buffer, VHDX_HEADER_SIZE, 4) == 0 &&
> > +        header2->signature == VHDX_HDR_MAGIC) {
> > +        printf("header2 is valid!\n");
> > +        h2_seq = header2->sequence_number;
> > +    }
> > +
> > +    if (h1_seq > h2_seq) {
> > +        s->curr_header = 0;
> > +    } else if (h2_seq > h1_seq) {
> > +        s->curr_header = 1;
> > +    } else {
> > +        printf("NO VALID HEADER\n");
> > +        ret = -1;
> 
> Missing goto fail?

Oops... thanks.
Kevin Wolf - Feb. 19, 2013, 2:30 p.m.
On Tue, Feb 19, 2013 at 08:26:35AM -0500, Jeff Cody wrote:
> On Tue, Feb 19, 2013 at 10:20:40AM +0100, Stefan Hajnoczi wrote:
> > On Mon, Feb 18, 2013 at 06:03:31PM -0500, Jeff Cody wrote:
> > > +#include "qemu/crc32c.h"
> > > +#include "block/vhdx.h"
> > > +
> > > +#define vhdx_nop(x) do { (void)(x); } while (0)
> > > +
> > > +/* Help macros to copy data from file buffers to header
> > > + * structures, with proper endianness.  These help avoid
> > > + * using packed structs */
> > 
> > Why not use packed structs?  Then you don't need memcpy/endianness
> > macros.  Abusing *_to_cpu() to go both "to" and "from" CPU endianness
> > is ugly.
> 
> Packed structs are (IMO) worse, and I think should be avoided wherever
> possible.  Depending on the platform and structure, it can lead to
> unaligned data access, which is either just slower (x86) or worse
> (e.g. some arm variants).  Maybe these are all defined well in the
> spec, so as to be mostly alignment proof from most architectures, but
> I figured it just as well to avoid their usage since we have to touch
> each field for alignment (well, I guess only on BE architectures in
> the end).

The whole point of packed structs is to enable the unaligned data
access, so yes, you're right with that. But in my opinion declaring the
struct packed and letting the compiler deal with unaligned data is
clearly superior to doing it manually. I mean, this is the kind of
things that compilers are for.

Kevin
Kevin Wolf - Feb. 19, 2013, 2:57 p.m.
On Mon, Feb 18, 2013 at 06:03:31PM -0500, Jeff Cody wrote:
> This is preliminary and beta support for the VHDX image format,
> as specified in:
>    "VHDX Format Specification v0.95", published 4/12/2012
>     https://www.microsoft.com/en-us/download/details.aspx?id=29681
> 
> This RFC patch has the following limitations
>     - read-only (no write support yet)
>     - does not support differencing files
>     - does not make use of iovec or coroutines
>     - may likely be broken in novel and interesting ways
> 
> I've doen read testing from a dynamic, 127GB VHDX image created
> using Hyper-V.  The image was partitioned with a F18 install,
> and the partitions could be mounted and files read.
> 
> As this is still under active development, and an RFC, you can
> assume that any debug print statements will be removed prior
> to formal patch submission.
> 
> Signed-off-by: Jeff Cody <jcody@redhat.com>

One general remark: I think you'd better not copy the structure of the
vpc block driver. It's not our worst driver, but far away from being the
best. I'm afraid that we don't have the perfect driver, but qcow2 is
probably closest to what this should look like in the end.

> ---
>  block/vhdx.c | 814 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 814 insertions(+)
>  create mode 100644 block/vhdx.c
> 
> diff --git a/block/vhdx.c b/block/vhdx.c
> new file mode 100644
> index 0000000..96f8fc7
> --- /dev/null
> +++ b/block/vhdx.c
> @@ -0,0 +1,814 @@
> +/*
> + * Block driver for Hyper-V VHDX Images
> + *
> + * Copyright (c) 2013 Red Hat, Inc.,
> + *
> + * Authors:
> + *  Jeff Cody <jcody@redhat.com>
> + *
> + *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
> + *  by Microsoft:
> + *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License as published by the Free
> + * Software Foundation; either version 2 of the License, or (at your option)
> + * any later version.
> + *
> + */

Would you consider a LGPL or MIT license? So far the block drivers are
mostly (except vdi and sheepdog) LGPL compatible, and for a future
libqblock that could become important.

> +/* opens the specified header block from the VHDX file header section */
> +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    vhdx_header *header1;
> +    vhdx_header *header2;
> +    uint64_t h1_seq = 0;
> +    uint64_t h2_seq = 0;
> +    uint8_t *buffer;
> +
> +    header1 = g_malloc(sizeof(vhdx_header));
> +    header2 = g_malloc(sizeof(vhdx_header));
> +
> +    buffer = g_malloc(VHDX_HEADER_SIZE);
> +
> +    s->headers[0] = header1;
> +    s->headers[1] = header2;
> +
> +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    vhdx_fill_header(header1, buffer);
> +
> +    if (vhdx_validate_checksum(buffer, VHDX_HEADER_SIZE, 4) == 0 &&
> +        header1->signature == VHDX_HDR_MAGIC) {
> +        printf("header1 is valid!\n");
> +        h1_seq = header1->sequence_number;
> +    }
> +
> +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    vhdx_fill_header(header2, buffer);
> +
> +    if (vhdx_validate_checksum(buffer, VHDX_HEADER_SIZE, 4) == 0 &&
> +        header2->signature == VHDX_HDR_MAGIC) {
> +        printf("header2 is valid!\n");
> +        h2_seq = header2->sequence_number;
> +    }
> +
> +    if (h1_seq > h2_seq) {
> +        s->curr_header = 0;
> +    } else if (h2_seq > h1_seq) {
> +        s->curr_header = 1;
> +    } else {
> +        printf("NO VALID HEADER\n");
> +        ret = -1;

I know this is only a debug message, but it could become a real
qerror_report().

Also -1 is not a valid -errno return code.

> +    }
> +
> +    ret = 0;
> +
> +    printf("current header is %d\n", s->curr_header);
> +    goto exit;
> +
> +fail:
> +    g_free(header1);
> +    g_free(header2);
> +    s->headers[0] = NULL;
> +    s->headers[1] = NULL;
> +exit:
> +    g_free(buffer);
> +    return ret;
> +}
> +
> +static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    uint8_t *buffer;
> +    int offset = 0;
> +    vhdx_region_table_entry rt_entry;
> +    int i;
> +
> +    /* We have to read the whole 64KB block, because the crc32 is over the
> +     * whole block */
> +    buffer = g_malloc(VHDX_HEADER_BLOCK_SIZE);
> +
> +    printf("reading region tables...\n");
> +    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
> +                    VHDX_HEADER_BLOCK_SIZE);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    hdr_copy32(&s->rt.signature,   buffer, offset);
> +    hdr_copy32(&s->rt.checksum,    buffer, offset);
> +    hdr_copy32(&s->rt.entry_count, buffer, offset);
> +    hdr_copy32(&s->rt.reserved,    buffer, offset);
> +
> +    if (vhdx_validate_checksum(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
> +        s->rt.signature != VHDX_RT_MAGIC) {
> +        ret = -1;

Again, -errno. (More instances follow, won't comment on each)

> +        printf("region table checksum and/or magic failure\n");
> +        goto fail;
> +    }
> +
> +    printf("Found %" PRId32 " region table entries\n", s->rt.entry_count);
> +
> +    for (i = 0; i < s->rt.entry_count; i++) {
> +        hdr_copy_guid(rt_entry.guid, buffer, offset);
> +
> +        hdr_copy64(&rt_entry.file_offset,   buffer, offset);
> +        hdr_copy32(&rt_entry.length,        buffer, offset);
> +        hdr_copy32(&rt_entry.bitfield.data, buffer, offset);
> +
> +        /* see if we recognize the entry */
> +        if (guid_cmp(rt_entry.guid, bat_guid)) {
> +            s->bat_rt = rt_entry;
> +            continue;
> +        }
> +
> +        if (guid_cmp(rt_entry.guid, metadata_guid)) {
> +            s->metadata_rt = rt_entry;
> +            continue;
> +        }
> +
> +        if (rt_entry.bitfield.bits.required) {
> +            /* cannot read vhdx file - required region table entry that
> +             * we do not understand.  per spec, we must fail to open */
> +            printf("Found unknown region table entry that is REQUIRED!\n");
> +            ret = -1;
> +            goto fail;
> +        }
> +    }
> +    ret = 0;
> +
> +fail:
> +    g_free(buffer);
> +    return ret;
> +}
> +
> +
> +static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
> +{
> +    int ret = 0;
> +    uint8_t *buffer;
> +    int offset = 0;
> +    int i = 0;
> +    uint32_t block_size, sectors_per_block, logical_sector_size;
> +    uint64_t chunk_ratio;
> +    vhdx_metadata_table_entry md_entry;
> +
> +    buffer = g_malloc(VHDX_METADATA_TABLE_MAX_SIZE);
> +
> +    printf("reading metadata at offset 0x%" PRIx64 "\n",
> +            s->metadata_rt.file_offset);
> +    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
> +                     VHDX_METADATA_TABLE_MAX_SIZE);
> +    if (ret < 0) {
> +        goto fail_no_free;
> +    }
> +    hdr_copy64(&s->metadata_hdr.signature,   buffer, offset);
> +    hdr_copy16(&s->metadata_hdr.reserved,    buffer, offset);
> +    hdr_copy16(&s->metadata_hdr.entry_count, buffer, offset);
> +    hdr_copy(&s->metadata_hdr.reserved2, buffer,
> +             sizeof(s->metadata_hdr.reserved2), offset);
> +
> +    if (s->metadata_hdr.signature != VHDX_METADATA_MAGIC) {
> +        ret = -1;
> +        printf("metadata header signature did not match: 0x%" PRIx64 "\n",
> +                s->metadata_hdr.signature);
> +        goto fail_no_free;
> +    }
> +
> +    printf("metadata section has %" PRId16 " entries\n",
> +            s->metadata_hdr.entry_count);
> +
> +    s->metadata_entries.present = 0;
> +
> +    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
> +        hdr_copy_guid(md_entry.item_id,     buffer, offset);
> +        hdr_copy32(&md_entry.offset,        buffer, offset);
> +        hdr_copy32(&md_entry.length,        buffer, offset);
> +        hdr_copy32(&md_entry.bitfield.data, buffer, offset);
> +        hdr_copy32(&md_entry.reserved2,     buffer, offset);
> +
> +        if (guid_cmp(md_entry.item_id, file_param_guid)) {
> +            s->metadata_entries.file_parameters_entry = md_entry;
> +            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_cmp(md_entry.item_id, virtual_size_guid)) {
> +            s->metadata_entries.virtual_disk_size_entry = md_entry;
> +            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_cmp(md_entry.item_id, page83_guid)) {
> +            s->metadata_entries.page83_data_entry = md_entry;
> +            s->metadata_entries.present |= META_PAGE_83_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_cmp(md_entry.item_id, logical_sector_guid)) {
> +            s->metadata_entries.logical_sector_size_entry = md_entry;
> +            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_cmp(md_entry.item_id, phys_sector_guid)) {
> +            s->metadata_entries.phys_sector_size_entry = md_entry;
> +            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
> +            continue;
> +        }
> +
> +        if (guid_cmp(md_entry.item_id, parent_locator_guid)) {
> +            s->metadata_entries.parent_locator_entry = md_entry;
> +            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
> +            continue;
> +        }
> +
> +        if (md_entry.bitfield.bits.is_required) {
> +            /* cannot read vhdx file - required region table entry that
> +             * we do not understand.  per spec, we must fail to open */
> +            printf("Found unknown metadata table entry that is REQUIRED!\n");
> +            ret = -1;
> +            goto exit;
> +        }
> +    }
> +
> +    if (s->metadata_entries.present != META_ALL_PRESENT) {
> +        printf("Did not find all required metadata entry fields\n");
> +        ret = -1;
> +        goto exit;
> +    }
> +
> +    offset = 0;
> +    buffer = g_realloc(buffer,
> +                       s->metadata_entries.file_parameters_entry.length);
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.file_parameters_entry.offset
> +                                         + s->metadata_rt.file_offset,
> +                     buffer,
> +                     s->metadata_entries.file_parameters_entry.length);
> +
> +    hdr_copy32(&s->params.block_size,    buffer, offset);
> +    hdr_copy32(&s->params.bitfield.data, buffer, offset);
> +
> +
> +    /* We now have the file parameters, so we can tell if this is a
> +     * differencing file (i.e.. has_parent), is dynamic or fixed
> +     * sized (leave_blocks_allocated), and the block size */
> +
> +    /* The parent locator required iff the file parameters has_parent set */
> +    if (s->params.bitfield.bits.has_parent) {
> +        if (s->metadata_entries.present & ~META_PARENT_LOCATOR_PRESENT) {
> +            offset = 0;
> +            buffer = g_realloc(buffer,
> +                               s->metadata_entries.parent_locator_entry.length);
> +            ret = bdrv_pread(bs->file,
> +                             s->metadata_entries.parent_locator_entry.offset,
> +                             buffer,
> +                             s->metadata_entries.parent_locator_entry.length);
> +
> +            ret = -1; /* temp, until differencing files are supported */
> +            /* TODO: parse  parent locator fields */
> +
> +        } else {
> +            printf("Did not find all required metadata entry fields\n");
> +            ret = -1;
> +            goto exit;
> +        }
> +    }
> +    /* determine virtual disk size, logical sector size,
> +     * and phys sector size */
> +
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.virtual_disk_size_entry.offset
> +                                           + s->metadata_rt.file_offset,
> +                     &s->virtual_disk_size,
> +                     sizeof(uint64_t));

Error handling? The next two calls as well.

> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.logical_sector_size_entry.offset
> +                                             + s->metadata_rt.file_offset,
> +                     &s->logical_sector_size,
> +                     sizeof(uint32_t));
> +    ret = bdrv_pread(bs->file,
> +                     s->metadata_entries.phys_sector_size_entry.offset
> +                                          + s->metadata_rt.file_offset,
> +                     &s->physical_sector_size,
> +                     sizeof(uint32_t));
> +
> +    le64_to_cpus(&s->virtual_disk_size);
> +    le32_to_cpus(&s->logical_sector_size);
> +    le32_to_cpus(&s->physical_sector_size);
> +
> +    /* both block_size and sector_size are guaranteed powers of 2 */
> +    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
> +    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
> +                     (uint64_t)s->logical_sector_size /
> +                     (uint64_t)s->params.block_size;
> +
> +
> +
> +    /* These values are ones we will want to division / multiplication
> +     * with later on, and they are all guaranteed (per the spec) to be
> +     * powers of 2, so we can take advantage of that */
> +    logical_sector_size = s->logical_sector_size;
> +    while (logical_sector_size >>= 1) {
> +        s->logical_sector_size_bits++;
> +    }
> +    sectors_per_block = s->sectors_per_block;
> +    while (sectors_per_block >>= 1) {
> +        s->sectors_per_block_bits++;
> +    }
> +    chunk_ratio = s->chunk_ratio;
> +    while (chunk_ratio >>= 1) {
> +        s->chunk_ratio_bits++;
> +    }
> +    block_size = s->params.block_size;
> +    while (block_size >>= 1) {
> +        s->block_size_bits++;
> +    }
> +
> +    printf("chunk ratio is %" PRId64 "\n", s->chunk_ratio);
> +    printf("block size is %" PRId32 " MB\n", s->params.block_size/(1024*1024));
> +    printf("virtual disk size is %" PRId64 " MB\n",
> +            s->virtual_disk_size/(1024*1024));
> +    printf("logical sector size is %" PRId32 " bytes\n",
> +            s->logical_sector_size);
> +    printf("sectors per block is %" PRId32 "\n", s->sectors_per_block);
> +
> +    if (s->logical_sector_size != BDRV_SECTOR_SIZE) {
> +        printf("VHDX error - QEMU only supports 512 byte sector sizes\n");
> +        ret = -1;
> +        goto exit;
> +    }
> +
> +    ret = 0;
> +
> +exit:
> +    g_free(buffer);
> +fail_no_free:
> +    return ret;
> +}
> +
> +static int vhdx_open(BlockDriverState *bs, int flags)
> +{
> +    BDRVVHDXState *s = bs->opaque;
> +    int ret = 0;
> +    int i;
> +
> +    s->bat = NULL;
> +
> +    ret = vhdx_parse_header(bs, s);
> +    if (ret) {
> +        goto fail;
> +    }
> +
> +    ret = vhdx_open_region_tables(bs, s);
> +    if (ret) {
> +        goto fail;
> +    }
> +
> +    ret = vhdx_parse_metadata(bs, s);
> +    if (ret) {
> +        goto fail;
> +    }
> +
> +    /* the VHDX spec dictates that virtual_disk_size is always a multiple of
> +     * logical_sector_size */
> +    bs->total_sectors = s->virtual_disk_size / s->logical_sector_size;
> +
> +    s->bat_entries = s->bat_rt.length / VHDX_BAT_ENTRY_SIZE;
> +    s->bat = g_malloc(s->bat_rt.length);
> +
> +    ret = bdrv_pread(bs->file, s->bat_rt.file_offset, s->bat, s->bat_rt.length);
> +
> +    /* as it is a uint64_t bitfield, we need to have the right endianness */
> +    for (i = 0; i < s->bat_entries; i++) {
> +        le64_to_cpus(&s->bat[i].bitfield.data);
> +    }
> +
> +    /* TODO: differencing files, r/w */
> +    /* the below is obviously temporary */
> +    if (flags & BDRV_O_RDWR) {
> +        ret = -1;
> +        goto fail;
> +    }
> +fail:
> +    ret = 0;

Don't you think the ret = 0; would be better before the fail: label? :-)

In error cases s->bat may be leaked.

> +    return ret;
> +}
> +
> +static int vhdx_reopen_prepare(BDRVReopenState *state,
> +                               BlockReopenQueue *queue, Error **errp)
> +{
> +    return 0;
> +}
> +
> +
> +
> +static int vhdx_read(BlockDriverState *bs, int64_t sector_num,
> +                    uint8_t *buf, int nb_sectors)

For new block drivers there's really no excuse for not implementing
bdrv_co_readv instead.

> +{
> +    BDRVVHDXState *s = bs->opaque;
> +    int ret = 0;
> +    uint32_t sectors_avail;
> +    uint32_t block_offset;
> +    uint64_t offset;
> +    uint32_t bat_idx;
> +    uint32_t bytes_to_read;
> +
> +    while (nb_sectors > 0) {
> +        /* We are a differencing file, so we need to inspect the sector bitmap

This function has a few lines > 80 characters.

> +         * to see if we have the data or not */
> +        if (s->params.bitfield.bits.has_parent) {
> +            /* not supported yet */
> +            ret = -1;
> +            goto exit;
> +        } else {
> +            bat_idx = sector_num >> s->sectors_per_block_bits;
> +            /* effectively a modulo - this gives us the offset into the block
> +             * (in sector sizes) for our sector number */
> +            block_offset = sector_num - (bat_idx << s->sectors_per_block_bits);
> +            /* the chunk ratio gives us the interleaving of the sector
> +             * bitmaps, so we need to advance our page block index by the
> +             * sector bitmaps entry number */
> +            bat_idx += bat_idx >> s->chunk_ratio_bits;
> +
> +            /* the number of sectors we can read in this cycle */
> +            sectors_avail = s->sectors_per_block - block_offset;
> +
> +            if (sectors_avail  > nb_sectors) {
> +                sectors_avail = nb_sectors;
> +            }
> +
> +            bytes_to_read = sectors_avail << s->logical_sector_size_bits;
> +
> +            /*
> +            printf("bat_idx: %d\n", bat_idx);
> +            printf("sectors_avail: %d\n", bat_idx);
> +            printf("bytes_to_read: %d\n", bat_idx);
> +            */
> +            /* check the payload block state */
> +            switch (s->bat[bat_idx].bitfield.bits.state) {
> +            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
> +            case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
> +            case PAYLOAD_BLOCK_ZERO:
> +                /* return zero */
> +                memset(buf, 0, bytes_to_read);
> +                break;
> +            case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
> +            case PAYLOAD_BLOCK_FULL_PRESENT:
> +                offset = (block_offset << s->logical_sector_size_bits) +
> +                         (s->bat[bat_idx].bitfield.bits.file_offset_mb
> +                          * (1024 * 1024));
> +                ret = bdrv_pread(bs->file, offset, buf, bytes_to_read);
> +                if (ret != bytes_to_read) {
> +                    ret = -1;
> +                    goto exit;
> +                }
> +                break;
> +            case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
> +                /* we don't yet support difference files, fall through
> +                 * to error */
> +            default:
> +                ret = -1;
> +                goto exit;
> +                break;
> +            }
> +            nb_sectors -= sectors_avail;
> +            sector_num += sectors_avail;
> +            buf += bytes_to_read;
> +        }

This is a rather long else branch, and the then branch will probably get
even longer when you implement. Probably they should become separate
functions, but you'll see how it turns out when you implement it.

> +    }
> +    ret = 0;
> +exit:
> +    return ret;
> +}
> +
> +static int vhdx_write(BlockDriverState *bs, int64_t sector_num,
> +    const uint8_t *buf, int nb_sectors)
> +{
> +   /* BDRVVHDXState *s = bs->opaque; */
> +
> +    /* TODO */
> +
> +    return 0;
> +}

Right, commenting on a stub that won't survive in the real series feels
rather silly, but I wouldn't let a stub return success in any case...

> +static coroutine_fn int vhdx_co_write(BlockDriverState *bs, int64_t sector_num,
> +                                     const uint8_t *buf, int nb_sectors)
> +{
> +    int ret;
> +    BDRVVHDXState *s = bs->opaque;
> +    qemu_co_mutex_lock(&s->lock);
> +    ret = vhdx_write(bs, sector_num, buf, nb_sectors);
> +    qemu_co_mutex_unlock(&s->lock);
> +
> +    /* TODO */
> +
> +    return ret;
> +}

I see that you already removed the same pattern from reads, so havin
separate vhdx_co_write/vhdx_write doesn't make sense. Also holding the
mutex all the time doesn't appear to be a good idea. All of this is
copied from drivers that were never properly converted.

> +static int vhdx_create(const char *filename, QEMUOptionParameter *options)
> +{
> +
> +    /* TODO */
> +
> +   return 0;
> +}

You don't have to have a .bdrv_create function in the BlockDriver if you
don't implement it. The block layer usually has a sane default
implementation for everything that's missing (saner than doing nothing
an returning success in any case).

Kevin
Jeff Cody - Feb. 19, 2013, 3:04 p.m.
On Tue, Feb 19, 2013 at 03:30:10PM +0100, Kevin Wolf wrote:
> On Tue, Feb 19, 2013 at 08:26:35AM -0500, Jeff Cody wrote:
> > On Tue, Feb 19, 2013 at 10:20:40AM +0100, Stefan Hajnoczi wrote:
> > > On Mon, Feb 18, 2013 at 06:03:31PM -0500, Jeff Cody wrote:
> > > > +#include "qemu/crc32c.h"
> > > > +#include "block/vhdx.h"
> > > > +
> > > > +#define vhdx_nop(x) do { (void)(x); } while (0)
> > > > +
> > > > +/* Help macros to copy data from file buffers to header
> > > > + * structures, with proper endianness.  These help avoid
> > > > + * using packed structs */
> > > 
> > > Why not use packed structs?  Then you don't need memcpy/endianness
> > > macros.  Abusing *_to_cpu() to go both "to" and "from" CPU endianness
> > > is ugly.
> > 
> > Packed structs are (IMO) worse, and I think should be avoided wherever
> > possible.  Depending on the platform and structure, it can lead to
> > unaligned data access, which is either just slower (x86) or worse
> > (e.g. some arm variants).  Maybe these are all defined well in the
> > spec, so as to be mostly alignment proof from most architectures, but
> > I figured it just as well to avoid their usage since we have to touch
> > each field for alignment (well, I guess only on BE architectures in
> > the end).
> 
> The whole point of packed structs is to enable the unaligned data
> access, so yes, you're right with that. But in my opinion declaring the
> struct packed and letting the compiler deal with unaligned data is
> clearly superior to doing it manually. I mean, this is the kind of
> things that compilers are for.

I have no doubt the compiler can deal with the unalignment.  But we
can do things the compiler essentially can't, when it comes to data
layout.  Maybe I am engaging in too much micro-optimization here, but
the way the compiler has to deal with it is by additional
instructions, since it can't do anything about the data structure
itself.  

By doing it manually (which is nothing magic, it is just
copying struct elements over) we can deal with it on a data structure
level, which should then always yield aligned access without extra
instructions to deal with unalignment.  Now, whether this results in
any real-world performance difference is of course open for debate.


That said, since both of the block maintainers prefer packed structs,
I will change over to those (sigh - and I was so happy with those
macros! :) ).

Thanks,
Jeff
Jeff Cody - Feb. 19, 2013, 3:20 p.m.
On Tue, Feb 19, 2013 at 03:57:53PM +0100, Kevin Wolf wrote:
> On Mon, Feb 18, 2013 at 06:03:31PM -0500, Jeff Cody wrote:
> > This is preliminary and beta support for the VHDX image format,
> > as specified in:
> >    "VHDX Format Specification v0.95", published 4/12/2012
> >     https://www.microsoft.com/en-us/download/details.aspx?id=29681
> > 
> > This RFC patch has the following limitations
> >     - read-only (no write support yet)
> >     - does not support differencing files
> >     - does not make use of iovec or coroutines
> >     - may likely be broken in novel and interesting ways
> > 
> > I've doen read testing from a dynamic, 127GB VHDX image created
> > using Hyper-V.  The image was partitioned with a F18 install,
> > and the partitions could be mounted and files read.
> > 
> > As this is still under active development, and an RFC, you can
> > assume that any debug print statements will be removed prior
> > to formal patch submission.
> > 
> > Signed-off-by: Jeff Cody <jcody@redhat.com>
> 
> One general remark: I think you'd better not copy the structure of the
> vpc block driver. It's not our worst driver, but far away from being the
> best. I'm afraid that we don't have the perfect driver, but qcow2 is
> probably closest to what this should look like in the end.
> 

+1

> > ---
> >  block/vhdx.c | 814 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 814 insertions(+)
> >  create mode 100644 block/vhdx.c
> > 
> > diff --git a/block/vhdx.c b/block/vhdx.c
> > new file mode 100644
> > index 0000000..96f8fc7
> > --- /dev/null
> > +++ b/block/vhdx.c
> > @@ -0,0 +1,814 @@
> > +/*
> > + * Block driver for Hyper-V VHDX Images
> > + *
> > + * Copyright (c) 2013 Red Hat, Inc.,
> > + *
> > + * Authors:
> > + *  Jeff Cody <jcody@redhat.com>
> > + *
> > + *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
> > + *  by Microsoft:
> > + *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
> > + *
> > + * This program is free software; you can redistribute it and/or modify it
> > + * under the terms of the GNU General Public License as published by the Free
> > + * Software Foundation; either version 2 of the License, or (at your option)
> > + * any later version.
> > + *
> > + */
> 
> Would you consider a LGPL or MIT license? So far the block drivers are
> mostly (except vdi and sheepdog) LGPL compatible, and for a future
> libqblock that could become important.
>

Sure, no problem.  I'll use LGPL.

> > +/* opens the specified header block from the VHDX file header section */
> > +static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    vhdx_header *header1;
> > +    vhdx_header *header2;
> > +    uint64_t h1_seq = 0;
> > +    uint64_t h2_seq = 0;
> > +    uint8_t *buffer;
> > +
> > +    header1 = g_malloc(sizeof(vhdx_header));
> > +    header2 = g_malloc(sizeof(vhdx_header));
> > +
> > +    buffer = g_malloc(VHDX_HEADER_SIZE);
> > +
> > +    s->headers[0] = header1;
> > +    s->headers[1] = header2;
> > +
> > +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    vhdx_fill_header(header1, buffer);
> > +
> > +    if (vhdx_validate_checksum(buffer, VHDX_HEADER_SIZE, 4) == 0 &&
> > +        header1->signature == VHDX_HDR_MAGIC) {
> > +        printf("header1 is valid!\n");
> > +        h1_seq = header1->sequence_number;
> > +    }
> > +
> > +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +    vhdx_fill_header(header2, buffer);
> > +
> > +    if (vhdx_validate_checksum(buffer, VHDX_HEADER_SIZE, 4) == 0 &&
> > +        header2->signature == VHDX_HDR_MAGIC) {
> > +        printf("header2 is valid!\n");
> > +        h2_seq = header2->sequence_number;
> > +    }
> > +
> > +    if (h1_seq > h2_seq) {
> > +        s->curr_header = 0;
> > +    } else if (h2_seq > h1_seq) {
> > +        s->curr_header = 1;
> > +    } else {
> > +        printf("NO VALID HEADER\n");
> > +        ret = -1;
> 
> I know this is only a debug message, but it could become a real
> qerror_report().
> 
> Also -1 is not a valid -errno return code.
> 
> > +    }
> > +
> > +    ret = 0;
> > +
> > +    printf("current header is %d\n", s->curr_header);
> > +    goto exit;
> > +
> > +fail:
> > +    g_free(header1);
> > +    g_free(header2);
> > +    s->headers[0] = NULL;
> > +    s->headers[1] = NULL;
> > +exit:
> > +    g_free(buffer);
> > +    return ret;
> > +}
> > +
> > +static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    uint8_t *buffer;
> > +    int offset = 0;
> > +    vhdx_region_table_entry rt_entry;
> > +    int i;
> > +
> > +    /* We have to read the whole 64KB block, because the crc32 is over the
> > +     * whole block */
> > +    buffer = g_malloc(VHDX_HEADER_BLOCK_SIZE);
> > +
> > +    printf("reading region tables...\n");
> > +    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
> > +                    VHDX_HEADER_BLOCK_SIZE);
> > +    if (ret < 0) {
> > +        goto fail;
> > +    }
> > +
> > +    hdr_copy32(&s->rt.signature,   buffer, offset);
> > +    hdr_copy32(&s->rt.checksum,    buffer, offset);
> > +    hdr_copy32(&s->rt.entry_count, buffer, offset);
> > +    hdr_copy32(&s->rt.reserved,    buffer, offset);
> > +
> > +    if (vhdx_validate_checksum(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
> > +        s->rt.signature != VHDX_RT_MAGIC) {
> > +        ret = -1;
> 
> Again, -errno. (More instances follow, won't comment on each)
> 
> > +        printf("region table checksum and/or magic failure\n");
> > +        goto fail;
> > +    }
> > +
> > +    printf("Found %" PRId32 " region table entries\n", s->rt.entry_count);
> > +
> > +    for (i = 0; i < s->rt.entry_count; i++) {
> > +        hdr_copy_guid(rt_entry.guid, buffer, offset);
> > +
> > +        hdr_copy64(&rt_entry.file_offset,   buffer, offset);
> > +        hdr_copy32(&rt_entry.length,        buffer, offset);
> > +        hdr_copy32(&rt_entry.bitfield.data, buffer, offset);
> > +
> > +        /* see if we recognize the entry */
> > +        if (guid_cmp(rt_entry.guid, bat_guid)) {
> > +            s->bat_rt = rt_entry;
> > +            continue;
> > +        }
> > +
> > +        if (guid_cmp(rt_entry.guid, metadata_guid)) {
> > +            s->metadata_rt = rt_entry;
> > +            continue;
> > +        }
> > +
> > +        if (rt_entry.bitfield.bits.required) {
> > +            /* cannot read vhdx file - required region table entry that
> > +             * we do not understand.  per spec, we must fail to open */
> > +            printf("Found unknown region table entry that is REQUIRED!\n");
> > +            ret = -1;
> > +            goto fail;
> > +        }
> > +    }
> > +    ret = 0;
> > +
> > +fail:
> > +    g_free(buffer);
> > +    return ret;
> > +}
> > +
> > +
> > +static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
> > +{
> > +    int ret = 0;
> > +    uint8_t *buffer;
> > +    int offset = 0;
> > +    int i = 0;
> > +    uint32_t block_size, sectors_per_block, logical_sector_size;
> > +    uint64_t chunk_ratio;
> > +    vhdx_metadata_table_entry md_entry;
> > +
> > +    buffer = g_malloc(VHDX_METADATA_TABLE_MAX_SIZE);
> > +
> > +    printf("reading metadata at offset 0x%" PRIx64 "\n",
> > +            s->metadata_rt.file_offset);
> > +    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
> > +                     VHDX_METADATA_TABLE_MAX_SIZE);
> > +    if (ret < 0) {
> > +        goto fail_no_free;
> > +    }
> > +    hdr_copy64(&s->metadata_hdr.signature,   buffer, offset);
> > +    hdr_copy16(&s->metadata_hdr.reserved,    buffer, offset);
> > +    hdr_copy16(&s->metadata_hdr.entry_count, buffer, offset);
> > +    hdr_copy(&s->metadata_hdr.reserved2, buffer,
> > +             sizeof(s->metadata_hdr.reserved2), offset);
> > +
> > +    if (s->metadata_hdr.signature != VHDX_METADATA_MAGIC) {
> > +        ret = -1;
> > +        printf("metadata header signature did not match: 0x%" PRIx64 "\n",
> > +                s->metadata_hdr.signature);
> > +        goto fail_no_free;
> > +    }
> > +
> > +    printf("metadata section has %" PRId16 " entries\n",
> > +            s->metadata_hdr.entry_count);
> > +
> > +    s->metadata_entries.present = 0;
> > +
> > +    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
> > +        hdr_copy_guid(md_entry.item_id,     buffer, offset);
> > +        hdr_copy32(&md_entry.offset,        buffer, offset);
> > +        hdr_copy32(&md_entry.length,        buffer, offset);
> > +        hdr_copy32(&md_entry.bitfield.data, buffer, offset);
> > +        hdr_copy32(&md_entry.reserved2,     buffer, offset);
> > +
> > +        if (guid_cmp(md_entry.item_id, file_param_guid)) {
> > +            s->metadata_entries.file_parameters_entry = md_entry;
> > +            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_cmp(md_entry.item_id, virtual_size_guid)) {
> > +            s->metadata_entries.virtual_disk_size_entry = md_entry;
> > +            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_cmp(md_entry.item_id, page83_guid)) {
> > +            s->metadata_entries.page83_data_entry = md_entry;
> > +            s->metadata_entries.present |= META_PAGE_83_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_cmp(md_entry.item_id, logical_sector_guid)) {
> > +            s->metadata_entries.logical_sector_size_entry = md_entry;
> > +            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_cmp(md_entry.item_id, phys_sector_guid)) {
> > +            s->metadata_entries.phys_sector_size_entry = md_entry;
> > +            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (guid_cmp(md_entry.item_id, parent_locator_guid)) {
> > +            s->metadata_entries.parent_locator_entry = md_entry;
> > +            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
> > +            continue;
> > +        }
> > +
> > +        if (md_entry.bitfield.bits.is_required) {
> > +            /* cannot read vhdx file - required region table entry that
> > +             * we do not understand.  per spec, we must fail to open */
> > +            printf("Found unknown metadata table entry that is REQUIRED!\n");
> > +            ret = -1;
> > +            goto exit;
> > +        }
> > +    }
> > +
> > +    if (s->metadata_entries.present != META_ALL_PRESENT) {
> > +        printf("Did not find all required metadata entry fields\n");
> > +        ret = -1;
> > +        goto exit;
> > +    }
> > +
> > +    offset = 0;
> > +    buffer = g_realloc(buffer,
> > +                       s->metadata_entries.file_parameters_entry.length);
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.file_parameters_entry.offset
> > +                                         + s->metadata_rt.file_offset,
> > +                     buffer,
> > +                     s->metadata_entries.file_parameters_entry.length);
> > +
> > +    hdr_copy32(&s->params.block_size,    buffer, offset);
> > +    hdr_copy32(&s->params.bitfield.data, buffer, offset);
> > +
> > +
> > +    /* We now have the file parameters, so we can tell if this is a
> > +     * differencing file (i.e.. has_parent), is dynamic or fixed
> > +     * sized (leave_blocks_allocated), and the block size */
> > +
> > +    /* The parent locator required iff the file parameters has_parent set */
> > +    if (s->params.bitfield.bits.has_parent) {
> > +        if (s->metadata_entries.present & ~META_PARENT_LOCATOR_PRESENT) {
> > +            offset = 0;
> > +            buffer = g_realloc(buffer,
> > +                               s->metadata_entries.parent_locator_entry.length);
> > +            ret = bdrv_pread(bs->file,
> > +                             s->metadata_entries.parent_locator_entry.offset,
> > +                             buffer,
> > +                             s->metadata_entries.parent_locator_entry.length);
> > +
> > +            ret = -1; /* temp, until differencing files are supported */
> > +            /* TODO: parse  parent locator fields */
> > +
> > +        } else {
> > +            printf("Did not find all required metadata entry fields\n");
> > +            ret = -1;
> > +            goto exit;
> > +        }
> > +    }
> > +    /* determine virtual disk size, logical sector size,
> > +     * and phys sector size */
> > +
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.virtual_disk_size_entry.offset
> > +                                           + s->metadata_rt.file_offset,
> > +                     &s->virtual_disk_size,
> > +                     sizeof(uint64_t));
> 
> Error handling? The next two calls as well.
>

Yup, thanks.

> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.logical_sector_size_entry.offset
> > +                                             + s->metadata_rt.file_offset,
> > +                     &s->logical_sector_size,
> > +                     sizeof(uint32_t));
> > +    ret = bdrv_pread(bs->file,
> > +                     s->metadata_entries.phys_sector_size_entry.offset
> > +                                          + s->metadata_rt.file_offset,
> > +                     &s->physical_sector_size,
> > +                     sizeof(uint32_t));
> > +
> > +    le64_to_cpus(&s->virtual_disk_size);
> > +    le32_to_cpus(&s->logical_sector_size);
> > +    le32_to_cpus(&s->physical_sector_size);
> > +
> > +    /* both block_size and sector_size are guaranteed powers of 2 */
> > +    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
> > +    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
> > +                     (uint64_t)s->logical_sector_size /
> > +                     (uint64_t)s->params.block_size;
> > +
> > +
> > +
> > +    /* These values are ones we will want to division / multiplication
> > +     * with later on, and they are all guaranteed (per the spec) to be
> > +     * powers of 2, so we can take advantage of that */
> > +    logical_sector_size = s->logical_sector_size;
> > +    while (logical_sector_size >>= 1) {
> > +        s->logical_sector_size_bits++;
> > +    }
> > +    sectors_per_block = s->sectors_per_block;
> > +    while (sectors_per_block >>= 1) {
> > +        s->sectors_per_block_bits++;
> > +    }
> > +    chunk_ratio = s->chunk_ratio;
> > +    while (chunk_ratio >>= 1) {
> > +        s->chunk_ratio_bits++;
> > +    }
> > +    block_size = s->params.block_size;
> > +    while (block_size >>= 1) {
> > +        s->block_size_bits++;
> > +    }
> > +
> > +    printf("chunk ratio is %" PRId64 "\n", s->chunk_ratio);
> > +    printf("block size is %" PRId32 " MB\n", s->params.block_size/(1024*1024));
> > +    printf("virtual disk size is %" PRId64 " MB\n",
> > +            s->virtual_disk_size/(1024*1024));
> > +    printf("logical sector size is %" PRId32 " bytes\n",
> > +            s->logical_sector_size);
> > +    printf("sectors per block is %" PRId32 "\n", s->sectors_per_block);
> > +
> > +    if (s->logical_sector_size != BDRV_SECTOR_SIZE) {
> > +        printf("VHDX error - QEMU only supports 512 byte sector sizes\n");
> > +        ret = -1;
> > +        goto exit;
> > +    }
> > +
> > +    ret = 0;
> > +
> > +exit:
> > +    g_free(buffer);
> > +fail_no_free:
> > +    return ret;
> > +}
> > +
> > +static int vhdx_open(BlockDriverState *bs, int flags)
> > +{
> > +    BDRVVHDXState *s = bs->opaque;
> > +    int ret = 0;
> > +    int i;
> > +
> > +    s->bat = NULL;
> > +
> > +    ret = vhdx_parse_header(bs, s);
> > +    if (ret) {
> > +        goto fail;
> > +    }
> > +
> > +    ret = vhdx_open_region_tables(bs, s);
> > +    if (ret) {
> > +        goto fail;
> > +    }
> > +
> > +    ret = vhdx_parse_metadata(bs, s);
> > +    if (ret) {
> > +        goto fail;
> > +    }
> > +
> > +    /* the VHDX spec dictates that virtual_disk_size is always a multiple of
> > +     * logical_sector_size */
> > +    bs->total_sectors = s->virtual_disk_size / s->logical_sector_size;
> > +
> > +    s->bat_entries = s->bat_rt.length / VHDX_BAT_ENTRY_SIZE;
> > +    s->bat = g_malloc(s->bat_rt.length);
> > +
> > +    ret = bdrv_pread(bs->file, s->bat_rt.file_offset, s->bat, s->bat_rt.length);
> > +
> > +    /* as it is a uint64_t bitfield, we need to have the right endianness */
> > +    for (i = 0; i < s->bat_entries; i++) {
> > +        le64_to_cpus(&s->bat[i].bitfield.data);
> > +    }
> > +
> > +    /* TODO: differencing files, r/w */
> > +    /* the below is obviously temporary */
> > +    if (flags & BDRV_O_RDWR) {
> > +        ret = -1;
> > +        goto fail;
> > +    }
> > +fail:
> > +    ret = 0;
> 
> Don't you think the ret = 0; would be better before the fail: label? :-)
>

Uh, hmm... yes :) 

> In error cases s->bat may be leaked.
> 

OK

> > +    return ret;
> > +}
> > +
> > +static int vhdx_reopen_prepare(BDRVReopenState *state,
> > +                               BlockReopenQueue *queue, Error **errp)
> > +{
> > +    return 0;
> > +}
> > +
> > +
> > +
> > +static int vhdx_read(BlockDriverState *bs, int64_t sector_num,
> > +                    uint8_t *buf, int nb_sectors)
> 
> For new block drivers there's really no excuse for not implementing
> bdrv_co_readv instead.
>

I didn't plan on submitting the real patch series without
bdrv_co_readv() as the implementation.  But this let me test a bit
faster, and I figured I'd submit the RFC out as soon as I had
functional reads.  I was mainly doing it to prove out the data
parsing, and this seemed the quickest/easiest way.

The next series will have bdrv_co_readv/writev properly implemented.

> > +{
> > +    BDRVVHDXState *s = bs->opaque;
> > +    int ret = 0;
> > +    uint32_t sectors_avail;
> > +    uint32_t block_offset;
> > +    uint64_t offset;
> > +    uint32_t bat_idx;
> > +    uint32_t bytes_to_read;
> > +
> > +    while (nb_sectors > 0) {
> > +        /* We are a differencing file, so we need to inspect the sector bitmap
> 
> This function has a few lines > 80 characters.

Hmm... checkpatch.pl blessed it, are you sure?
> 
> > +         * to see if we have the data or not */
> > +        if (s->params.bitfield.bits.has_parent) {
> > +            /* not supported yet */
> > +            ret = -1;
> > +            goto exit;
> > +        } else {
> > +            bat_idx = sector_num >> s->sectors_per_block_bits;
> > +            /* effectively a modulo - this gives us the offset into the block
> > +             * (in sector sizes) for our sector number */
> > +            block_offset = sector_num - (bat_idx << s->sectors_per_block_bits);
> > +            /* the chunk ratio gives us the interleaving of the sector
> > +             * bitmaps, so we need to advance our page block index by the
> > +             * sector bitmaps entry number */
> > +            bat_idx += bat_idx >> s->chunk_ratio_bits;
> > +
> > +            /* the number of sectors we can read in this cycle */
> > +            sectors_avail = s->sectors_per_block - block_offset;
> > +
> > +            if (sectors_avail  > nb_sectors) {
> > +                sectors_avail = nb_sectors;
> > +            }
> > +
> > +            bytes_to_read = sectors_avail << s->logical_sector_size_bits;
> > +
> > +            /*
> > +            printf("bat_idx: %d\n", bat_idx);
> > +            printf("sectors_avail: %d\n", bat_idx);
> > +            printf("bytes_to_read: %d\n", bat_idx);
> > +            */
> > +            /* check the payload block state */
> > +            switch (s->bat[bat_idx].bitfield.bits.state) {
> > +            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
> > +            case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
> > +            case PAYLOAD_BLOCK_ZERO:
> > +                /* return zero */
> > +                memset(buf, 0, bytes_to_read);
> > +                break;
> > +            case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
> > +            case PAYLOAD_BLOCK_FULL_PRESENT:
> > +                offset = (block_offset << s->logical_sector_size_bits) +
> > +                         (s->bat[bat_idx].bitfield.bits.file_offset_mb
> > +                          * (1024 * 1024));
> > +                ret = bdrv_pread(bs->file, offset, buf, bytes_to_read);
> > +                if (ret != bytes_to_read) {
> > +                    ret = -1;
> > +                    goto exit;
> > +                }
> > +                break;
> > +            case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
> > +                /* we don't yet support difference files, fall through
> > +                 * to error */
> > +            default:
> > +                ret = -1;
> > +                goto exit;
> > +                break;
> > +            }
> > +            nb_sectors -= sectors_avail;
> > +            sector_num += sectors_avail;
> > +            buf += bytes_to_read;
> > +        }
> 
> This is a rather long else branch, and the then branch will probably get
> even longer when you implement. Probably they should become separate
> functions, but you'll see how it turns out when you implement it.
> 

+1

> > +    }
> > +    ret = 0;
> > +exit:
> > +    return ret;
> > +}
> > +
> > +static int vhdx_write(BlockDriverState *bs, int64_t sector_num,
> > +    const uint8_t *buf, int nb_sectors)
> > +{
> > +   /* BDRVVHDXState *s = bs->opaque; */
> > +
> > +    /* TODO */
> > +
> > +    return 0;
> > +}
> 
> Right, commenting on a stub that won't survive in the real series feels
> rather silly, but I wouldn't let a stub return success in any case...
> 
> > +static coroutine_fn int vhdx_co_write(BlockDriverState *bs, int64_t sector_num,
> > +                                     const uint8_t *buf, int nb_sectors)
> > +{
> > +    int ret;
> > +    BDRVVHDXState *s = bs->opaque;
> > +    qemu_co_mutex_lock(&s->lock);
> > +    ret = vhdx_write(bs, sector_num, buf, nb_sectors);
> > +    qemu_co_mutex_unlock(&s->lock);
> > +
> > +    /* TODO */
> > +
> > +    return ret;
> > +}
> 
> I see that you already removed the same pattern from reads, so havin
> separate vhdx_co_write/vhdx_write doesn't make sense. Also holding the
> mutex all the time doesn't appear to be a good idea. All of this is
> copied from drivers that were never properly converted.
>

Agreed.  I hadn't addressed writes at all, which is why this stub is
sitting here the way it is.  It'll be nuked, and a proper write
coroutine will be in its place.


> > +static int vhdx_create(const char *filename, QEMUOptionParameter *options)
> > +{
> > +
> > +    /* TODO */
> > +
> > +   return 0;
> > +}
> 
> You don't have to have a .bdrv_create function in the BlockDriver if you
> don't implement it. The block layer usually has a sane default
> implementation for everything that's missing (saner than doing nothing
> an returning success in any case).
> 

Yup.  I plan on implementing it, but will remove this until then, so
the series isn't held up by it.

> Kevin

Thanks for the reviews, I appreciate it.

Jeff
Kevin Wolf - Feb. 19, 2013, 3:20 p.m.
On Tue, Feb 19, 2013 at 10:04:07AM -0500, Jeff Cody wrote:
> On Tue, Feb 19, 2013 at 03:30:10PM +0100, Kevin Wolf wrote:
> > The whole point of packed structs is to enable the unaligned data
> > access, so yes, you're right with that. But in my opinion declaring the
> > struct packed and letting the compiler deal with unaligned data is
> > clearly superior to doing it manually. I mean, this is the kind of
> > things that compilers are for.
> 
> I have no doubt the compiler can deal with the unalignment.  But we
> can do things the compiler essentially can't, when it comes to data
> layout.  Maybe I am engaging in too much micro-optimization here, but
> the way the compiler has to deal with it is by additional
> instructions, since it can't do anything about the data structure
> itself.

We're talking about the header here. It's read exactly once and
presumably not updated very often. The performance of these operations
doesn't matter at all. Saving lines of code that could potentially
contain bugs does matter.

> That said, since both of the block maintainers prefer packed structs,
> I will change over to those (sigh - and I was so happy with those
> macros! :) ).

You can (or actually should) still copy those part to BDRVVHDXState that
the driver frequently accesses. Things just become a bit less magic when
you access members of a packed struct instead of random offsets.

Kevin

Patch

diff --git a/block/vhdx.c b/block/vhdx.c
new file mode 100644
index 0000000..96f8fc7
--- /dev/null
+++ b/block/vhdx.c
@@ -0,0 +1,814 @@ 
+/*
+ * Block driver for Hyper-V VHDX Images
+ *
+ * Copyright (c) 2013 Red Hat, Inc.,
+ *
+ * Authors:
+ *  Jeff Cody <jcody@redhat.com>
+ *
+ *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
+ *  by Microsoft:
+ *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
+#if defined(CONFIG_UUID)
+#include <uuid/uuid.h>
+#endif
+#include "qemu/crc32c.h"
+#include "block/vhdx.h"
+
+#define vhdx_nop(x) do { (void)(x); } while (0)
+
+/* Help macros to copy data from file buffers to header
+ * structures, with proper endianness.  These help avoid
+ * using packed structs */
+
+/* Do not use directly, see macros below */
+#define _hdr_copy(item, buf, size, offset, to_cpu) do { \
+    memcpy((item), (buf)+(offset), (size));        \
+    to_cpu((item));                                \
+    (offset) += (size); } while (0)
+
+/* for all of these, buf should be a uint8_t buffer */
+
+/* copy 16-bit header field */
+#define hdr_copy16(item, buf, offset) do { \
+    _hdr_copy((item), (buf), 2, (offset), (le16_to_cpus)); } while (0)
+
+/* copy 32-bit header field */
+#define hdr_copy32(item, buf, offset) do { \
+    _hdr_copy((item), (buf), 4, (offset), (le32_to_cpus)); } while (0)
+
+/* copy 64-bit header field */
+#define hdr_copy64(item, buf, offset) do { \
+    _hdr_copy((item), (buf), 8, (offset), (le64_to_cpus)); } while (0)
+
+/* copy variable-length header field, no endian swapping */
+#define hdr_copy(item, buf, size, offset) do { \
+    _hdr_copy((item), (buf), (size), (offset), vhdx_nop); } while (0)
+
+/* copies a defined msguid field, with correct endianness
+ * a msguid entry has 3 data types with endianness sensitivity,
+ * followed by a byte array */
+#define hdr_copy_guid(item, buf, offset) do {        \
+        hdr_copy32(&(item).data1, (buf), (offset));  \
+        hdr_copy16(&(item).data2, (buf), (offset));  \
+        hdr_copy16(&(item).data3, (buf), (offset));  \
+        hdr_copy(&(item).data4, (buf), sizeof((item).data4), (offset)); \
+        } while (0)
+
+
+/* Several metadata and region table data entries are identified by
+ * guids in  a MS-specific GUID format. */
+
+
+/* ------- Known Region Table GUIDs ---------------------- */
+static const ms_guid bat_guid =      { .data1 = 0x2dc27766,
+                                       .data2 = 0xf623,
+                                       .data3 = 0x4200,
+                                       .data4 = { 0x9d, 0x64, 0x11, 0x5e,
+                                                  0x9b, 0xfd, 0x4a, 0x08} };
+
+static const ms_guid metadata_guid = { .data1 = 0x8b7ca206,
+                                       .data2 = 0x4790,
+                                       .data3 = 0x4b9a,
+                                       .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
+                                                  0x05, 0x0f, 0x88, 0x6e} };
+
+
+
+/* ------- Known Metadata Entry GUIDs ---------------------- */
+static const ms_guid file_param_guid =   { .data1 = 0xcaa16737,
+                                           .data2 = 0xfa36,
+                                           .data3 = 0x4d43,
+                                           .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
+                                                      0xaa, 0x44, 0xe7, 0x6b} };
+
+static const ms_guid virtual_size_guid = { .data1 = 0x2FA54224,
+                                           .data2 = 0xcd1b,
+                                           .data3 = 0x4876,
+                                           .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
+                                                      0xd8, 0x3b, 0xf4, 0xb8} };
+
+static const ms_guid page83_guid =       { .data1 = 0xbeca12ab,
+                                           .data2 = 0xb2e6,
+                                           .data3 = 0x4523,
+                                           .data4 = { 0x93, 0xef, 0xc3, 0x09,
+                                                      0xe0, 0x00, 0xc7, 0x46} };
+
+static const ms_guid logical_sector_guid = {.data1 = 0x8141bf1d,
+                                            .data2 = 0xa96f,
+                                            .data3 = 0x4709,
+                                           .data4 = { 0xba, 0x47, 0xf2, 0x33,
+                                                      0xa8, 0xfa, 0xab, 0x5f} };
+
+static const ms_guid phys_sector_guid =  { .data1 = 0xcda348c7,
+                                           .data2 = 0x445d,
+                                           .data3 = 0x4471,
+                                           .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
+                                                      0x52, 0x51, 0xc5, 0x56} };
+
+static const ms_guid parent_locator_guid = {.data1 = 0xa8d35f2d,
+                                            .data2 = 0xb30b,
+                                            .data3 = 0x454d,
+                                           .data4 = { 0xab, 0xf7, 0xd3, 0xd8,
+                                                      0x48, 0x34, 0xab, 0x0c} };
+
+
+
+#define META_FILE_PARAMETER_PRESENT      0x01
+#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
+#define META_PAGE_83_PRESENT             0x04
+#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
+#define META_PHYS_SECTOR_SIZE_PRESENT    0x10
+#define META_PARENT_LOCATOR_PRESENT      0x20
+
+#define META_ALL_PRESENT    \
+    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
+     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
+     META_PHYS_SECTOR_SIZE_PRESENT)
+
+typedef struct vhdx_metadata_entries {
+    vhdx_metadata_table_entry file_parameters_entry;
+    vhdx_metadata_table_entry virtual_disk_size_entry;
+    vhdx_metadata_table_entry page83_data_entry;
+    vhdx_metadata_table_entry logical_sector_size_entry;
+    vhdx_metadata_table_entry phys_sector_size_entry;
+    vhdx_metadata_table_entry parent_locator_entry;
+    uint16_t present;
+} vhdx_metadata_entries;
+
+
+typedef struct BDRVVHDXState {
+    CoMutex lock;
+
+    int curr_header;
+    vhdx_header * headers[2];
+
+    vhdx_region_table_header rt;
+    vhdx_region_table_entry bat_rt;         /* region table for the BAT */
+    vhdx_region_table_entry metadata_rt;    /* region table for the metadata */
+    vhdx_region_table_entry *unknown_rt;
+    unsigned int unknown_rt_size;
+
+    vhdx_metadata_table_header  metadata_hdr;
+    vhdx_metadata_entries metadata_entries;
+
+    vhdx_file_parameters params;
+    uint32_t block_size_bits;
+    uint32_t sectors_per_block;
+    uint32_t sectors_per_block_bits;
+
+    uint64_t virtual_disk_size;
+    uint32_t logical_sector_size;
+    uint32_t physical_sector_size;
+
+    uint64_t chunk_ratio;
+    uint32_t chunk_ratio_bits;
+    uint32_t logical_sector_size_bits;
+
+    uint32_t bat_entries;
+    vhdx_bat_entry *bat;
+
+    /* TODO */
+
+} BDRVVHDXState;
+
+/* CRC-32C, Castagnoli polynomial, code 0x11EDC6F41 */
+static uint32_t vhdx_checksum(uint8_t *buf, size_t size)
+{
+    uint32_t chksum;
+    chksum =  crc32c(0xffffffff, buf, size);
+
+    return chksum;
+}
+
+/* validates the checksum of a region of table entries, substituting zero
+ * in for the in-place checksum field of the region.
+ * buf: buffer to compute crc32c over,
+ * size: size of the buffer to checksum
+ * crc_offset: offset into buf that contains the existing 4-byte checksum
+ */
+static int vhdx_validate_checksum(uint8_t *buf, size_t size, int crc_offset)
+{
+    uint32_t crc_orig;
+    uint32_t crc;
+
+    assert(buf != NULL);
+    assert(size > (crc_offset+4));
+
+    memcpy(&crc_orig, buf+crc_offset, sizeof(crc_orig));
+    memset(buf+crc_offset, 0, sizeof(crc_orig));
+
+    crc = vhdx_checksum(buf, size);
+
+    memcpy(buf+crc_offset, &crc_orig, sizeof(crc_orig));
+
+    crc_orig = le32_to_cpu(crc_orig);
+    return crc == crc_orig ? 0 : 1;
+}
+
+/*
+ * Per the MS VHDX Specification, for every VHDX file:
+ *      - The header section is fixed size - 1 MB
+ *      - The header section is always the first "object"
+ *      - The first 64KB of the header is the File Identifier
+ *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
+ *      - The following 512 bytes constitute a UTF-16 string identifiying the
+ *        software that created the file, and is optional and diagnostic only.
+ *
+ *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
+ */
+static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    if (buf_size >= 8 && !strncmp((char *)buf, "vhdxfile", 8)) {
+        return 100;
+    }
+    return 0;
+}
+
+
+static void vhdx_fill_header(vhdx_header *h, uint8_t *buffer)
+{
+    int offset = 0;
+    assert(h != NULL);
+    assert(buffer != NULL);
+
+    /* use memcpy to avoid unaligned data read */
+    hdr_copy32(&h->signature,       buffer, offset);
+    hdr_copy32(&h->checksum,        buffer, offset);
+    hdr_copy64(&h->sequence_number, buffer, offset);
+
+    hdr_copy_guid(h->file_write_guid, buffer, offset);
+    hdr_copy_guid(h->data_write_guid, buffer, offset);
+    hdr_copy_guid(h->log_guid, buffer, offset);
+
+    hdr_copy16(&h->log_version,     buffer,  offset);
+    hdr_copy16(&h->version,         buffer,  offset);
+    hdr_copy32(&h->log_length,      buffer,  offset);
+    hdr_copy64(&h->log_offset,      buffer,  offset);
+}
+
+
+/* opens the specified header block from the VHDX file header section */
+static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    vhdx_header *header1;
+    vhdx_header *header2;
+    uint64_t h1_seq = 0;
+    uint64_t h2_seq = 0;
+    uint8_t *buffer;
+
+    header1 = g_malloc(sizeof(vhdx_header));
+    header2 = g_malloc(sizeof(vhdx_header));
+
+    buffer = g_malloc(VHDX_HEADER_SIZE);
+
+    s->headers[0] = header1;
+    s->headers[1] = header2;
+
+    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    vhdx_fill_header(header1, buffer);
+
+    if (vhdx_validate_checksum(buffer, VHDX_HEADER_SIZE, 4) == 0 &&
+        header1->signature == VHDX_HDR_MAGIC) {
+        printf("header1 is valid!\n");
+        h1_seq = header1->sequence_number;
+    }
+
+    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    vhdx_fill_header(header2, buffer);
+
+    if (vhdx_validate_checksum(buffer, VHDX_HEADER_SIZE, 4) == 0 &&
+        header2->signature == VHDX_HDR_MAGIC) {
+        printf("header2 is valid!\n");
+        h2_seq = header2->sequence_number;
+    }
+
+    if (h1_seq > h2_seq) {
+        s->curr_header = 0;
+    } else if (h2_seq > h1_seq) {
+        s->curr_header = 1;
+    } else {
+        printf("NO VALID HEADER\n");
+        ret = -1;
+    }
+
+    ret = 0;
+
+    printf("current header is %d\n", s->curr_header);
+    goto exit;
+
+fail:
+    g_free(header1);
+    g_free(header2);
+    s->headers[0] = NULL;
+    s->headers[1] = NULL;
+exit:
+    g_free(buffer);
+    return ret;
+}
+
+static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    uint8_t *buffer;
+    int offset = 0;
+    vhdx_region_table_entry rt_entry;
+    int i;
+
+    /* We have to read the whole 64KB block, because the crc32 is over the
+     * whole block */
+    buffer = g_malloc(VHDX_HEADER_BLOCK_SIZE);
+
+    printf("reading region tables...\n");
+    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
+                    VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    hdr_copy32(&s->rt.signature,   buffer, offset);
+    hdr_copy32(&s->rt.checksum,    buffer, offset);
+    hdr_copy32(&s->rt.entry_count, buffer, offset);
+    hdr_copy32(&s->rt.reserved,    buffer, offset);
+
+    if (vhdx_validate_checksum(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
+        s->rt.signature != VHDX_RT_MAGIC) {
+        ret = -1;
+        printf("region table checksum and/or magic failure\n");
+        goto fail;
+    }
+
+    printf("Found %" PRId32 " region table entries\n", s->rt.entry_count);
+
+    for (i = 0; i < s->rt.entry_count; i++) {
+        hdr_copy_guid(rt_entry.guid, buffer, offset);
+
+        hdr_copy64(&rt_entry.file_offset,   buffer, offset);
+        hdr_copy32(&rt_entry.length,        buffer, offset);
+        hdr_copy32(&rt_entry.bitfield.data, buffer, offset);
+
+        /* see if we recognize the entry */
+        if (guid_cmp(rt_entry.guid, bat_guid)) {
+            s->bat_rt = rt_entry;
+            continue;
+        }
+
+        if (guid_cmp(rt_entry.guid, metadata_guid)) {
+            s->metadata_rt = rt_entry;
+            continue;
+        }
+
+        if (rt_entry.bitfield.bits.required) {
+            /* cannot read vhdx file - required region table entry that
+             * we do not understand.  per spec, we must fail to open */
+            printf("Found unknown region table entry that is REQUIRED!\n");
+            ret = -1;
+            goto fail;
+        }
+    }
+    ret = 0;
+
+fail:
+    g_free(buffer);
+    return ret;
+}
+
+
+static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    uint8_t *buffer;
+    int offset = 0;
+    int i = 0;
+    uint32_t block_size, sectors_per_block, logical_sector_size;
+    uint64_t chunk_ratio;
+    vhdx_metadata_table_entry md_entry;
+
+    buffer = g_malloc(VHDX_METADATA_TABLE_MAX_SIZE);
+
+    printf("reading metadata at offset 0x%" PRIx64 "\n",
+            s->metadata_rt.file_offset);
+    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
+                     VHDX_METADATA_TABLE_MAX_SIZE);
+    if (ret < 0) {
+        goto fail_no_free;
+    }
+    hdr_copy64(&s->metadata_hdr.signature,   buffer, offset);
+    hdr_copy16(&s->metadata_hdr.reserved,    buffer, offset);
+    hdr_copy16(&s->metadata_hdr.entry_count, buffer, offset);
+    hdr_copy(&s->metadata_hdr.reserved2, buffer,
+             sizeof(s->metadata_hdr.reserved2), offset);
+
+    if (s->metadata_hdr.signature != VHDX_METADATA_MAGIC) {
+        ret = -1;
+        printf("metadata header signature did not match: 0x%" PRIx64 "\n",
+                s->metadata_hdr.signature);
+        goto fail_no_free;
+    }
+
+    printf("metadata section has %" PRId16 " entries\n",
+            s->metadata_hdr.entry_count);
+
+    s->metadata_entries.present = 0;
+
+    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
+        hdr_copy_guid(md_entry.item_id,     buffer, offset);
+        hdr_copy32(&md_entry.offset,        buffer, offset);
+        hdr_copy32(&md_entry.length,        buffer, offset);
+        hdr_copy32(&md_entry.bitfield.data, buffer, offset);
+        hdr_copy32(&md_entry.reserved2,     buffer, offset);
+
+        if (guid_cmp(md_entry.item_id, file_param_guid)) {
+            s->metadata_entries.file_parameters_entry = md_entry;
+            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
+            continue;
+        }
+
+        if (guid_cmp(md_entry.item_id, virtual_size_guid)) {
+            s->metadata_entries.virtual_disk_size_entry = md_entry;
+            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_cmp(md_entry.item_id, page83_guid)) {
+            s->metadata_entries.page83_data_entry = md_entry;
+            s->metadata_entries.present |= META_PAGE_83_PRESENT;
+            continue;
+        }
+
+        if (guid_cmp(md_entry.item_id, logical_sector_guid)) {
+            s->metadata_entries.logical_sector_size_entry = md_entry;
+            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_cmp(md_entry.item_id, phys_sector_guid)) {
+            s->metadata_entries.phys_sector_size_entry = md_entry;
+            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_cmp(md_entry.item_id, parent_locator_guid)) {
+            s->metadata_entries.parent_locator_entry = md_entry;
+            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
+            continue;
+        }
+
+        if (md_entry.bitfield.bits.is_required) {
+            /* cannot read vhdx file - required region table entry that
+             * we do not understand.  per spec, we must fail to open */
+            printf("Found unknown metadata table entry that is REQUIRED!\n");
+            ret = -1;
+            goto exit;
+        }
+    }
+
+    if (s->metadata_entries.present != META_ALL_PRESENT) {
+        printf("Did not find all required metadata entry fields\n");
+        ret = -1;
+        goto exit;
+    }
+
+    offset = 0;
+    buffer = g_realloc(buffer,
+                       s->metadata_entries.file_parameters_entry.length);
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.file_parameters_entry.offset
+                                         + s->metadata_rt.file_offset,
+                     buffer,
+                     s->metadata_entries.file_parameters_entry.length);
+
+    hdr_copy32(&s->params.block_size,    buffer, offset);
+    hdr_copy32(&s->params.bitfield.data, buffer, offset);
+
+
+    /* We now have the file parameters, so we can tell if this is a
+     * differencing file (i.e.. has_parent), is dynamic or fixed
+     * sized (leave_blocks_allocated), and the block size */
+
+    /* The parent locator required iff the file parameters has_parent set */
+    if (s->params.bitfield.bits.has_parent) {
+        if (s->metadata_entries.present & ~META_PARENT_LOCATOR_PRESENT) {
+            offset = 0;
+            buffer = g_realloc(buffer,
+                               s->metadata_entries.parent_locator_entry.length);
+            ret = bdrv_pread(bs->file,
+                             s->metadata_entries.parent_locator_entry.offset,
+                             buffer,
+                             s->metadata_entries.parent_locator_entry.length);
+
+            ret = -1; /* temp, until differencing files are supported */
+            /* TODO: parse  parent locator fields */
+
+        } else {
+            printf("Did not find all required metadata entry fields\n");
+            ret = -1;
+            goto exit;
+        }
+    }
+    /* determine virtual disk size, logical sector size,
+     * and phys sector size */
+
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.virtual_disk_size_entry.offset
+                                           + s->metadata_rt.file_offset,
+                     &s->virtual_disk_size,
+                     sizeof(uint64_t));
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.logical_sector_size_entry.offset
+                                             + s->metadata_rt.file_offset,
+                     &s->logical_sector_size,
+                     sizeof(uint32_t));
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.phys_sector_size_entry.offset
+                                          + s->metadata_rt.file_offset,
+                     &s->physical_sector_size,
+                     sizeof(uint32_t));
+
+    le64_to_cpus(&s->virtual_disk_size);
+    le32_to_cpus(&s->logical_sector_size);
+    le32_to_cpus(&s->physical_sector_size);
+
+    /* both block_size and sector_size are guaranteed powers of 2 */
+    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
+    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
+                     (uint64_t)s->logical_sector_size /
+                     (uint64_t)s->params.block_size;
+
+
+
+    /* These values are ones we will want to division / multiplication
+     * with later on, and they are all guaranteed (per the spec) to be
+     * powers of 2, so we can take advantage of that */
+    logical_sector_size = s->logical_sector_size;
+    while (logical_sector_size >>= 1) {
+        s->logical_sector_size_bits++;
+    }
+    sectors_per_block = s->sectors_per_block;
+    while (sectors_per_block >>= 1) {
+        s->sectors_per_block_bits++;
+    }
+    chunk_ratio = s->chunk_ratio;
+    while (chunk_ratio >>= 1) {
+        s->chunk_ratio_bits++;
+    }
+    block_size = s->params.block_size;
+    while (block_size >>= 1) {
+        s->block_size_bits++;
+    }
+
+    printf("chunk ratio is %" PRId64 "\n", s->chunk_ratio);
+    printf("block size is %" PRId32 " MB\n", s->params.block_size/(1024*1024));
+    printf("virtual disk size is %" PRId64 " MB\n",
+            s->virtual_disk_size/(1024*1024));
+    printf("logical sector size is %" PRId32 " bytes\n",
+            s->logical_sector_size);
+    printf("sectors per block is %" PRId32 "\n", s->sectors_per_block);
+
+    if (s->logical_sector_size != BDRV_SECTOR_SIZE) {
+        printf("VHDX error - QEMU only supports 512 byte sector sizes\n");
+        ret = -1;
+        goto exit;
+    }
+
+    ret = 0;
+
+exit:
+    g_free(buffer);
+fail_no_free:
+    return ret;
+}
+
+static int vhdx_open(BlockDriverState *bs, int flags)
+{
+    BDRVVHDXState *s = bs->opaque;
+    int ret = 0;
+    int i;
+
+    s->bat = NULL;
+
+    ret = vhdx_parse_header(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_open_region_tables(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_parse_metadata(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    /* the VHDX spec dictates that virtual_disk_size is always a multiple of
+     * logical_sector_size */
+    bs->total_sectors = s->virtual_disk_size / s->logical_sector_size;
+
+    s->bat_entries = s->bat_rt.length / VHDX_BAT_ENTRY_SIZE;
+    s->bat = g_malloc(s->bat_rt.length);
+
+    ret = bdrv_pread(bs->file, s->bat_rt.file_offset, s->bat, s->bat_rt.length);
+
+    /* as it is a uint64_t bitfield, we need to have the right endianness */
+    for (i = 0; i < s->bat_entries; i++) {
+        le64_to_cpus(&s->bat[i].bitfield.data);
+    }
+
+    /* TODO: differencing files, r/w */
+    /* the below is obviously temporary */
+    if (flags & BDRV_O_RDWR) {
+        ret = -1;
+        goto fail;
+    }
+fail:
+    ret = 0;
+    return ret;
+}
+
+static int vhdx_reopen_prepare(BDRVReopenState *state,
+                               BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
+
+
+static int vhdx_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVVHDXState *s = bs->opaque;
+    int ret = 0;
+    uint32_t sectors_avail;
+    uint32_t block_offset;
+    uint64_t offset;
+    uint32_t bat_idx;
+    uint32_t bytes_to_read;
+
+    while (nb_sectors > 0) {
+        /* We are a differencing file, so we need to inspect the sector bitmap
+         * to see if we have the data or not */
+        if (s->params.bitfield.bits.has_parent) {
+            /* not supported yet */
+            ret = -1;
+            goto exit;
+        } else {
+            bat_idx = sector_num >> s->sectors_per_block_bits;
+            /* effectively a modulo - this gives us the offset into the block
+             * (in sector sizes) for our sector number */
+            block_offset = sector_num - (bat_idx << s->sectors_per_block_bits);
+            /* the chunk ratio gives us the interleaving of the sector
+             * bitmaps, so we need to advance our page block index by the
+             * sector bitmaps entry number */
+            bat_idx += bat_idx >> s->chunk_ratio_bits;
+
+            /* the number of sectors we can read in this cycle */
+            sectors_avail = s->sectors_per_block - block_offset;
+
+            if (sectors_avail  > nb_sectors) {
+                sectors_avail = nb_sectors;
+            }
+
+            bytes_to_read = sectors_avail << s->logical_sector_size_bits;
+
+            /*
+            printf("bat_idx: %d\n", bat_idx);
+            printf("sectors_avail: %d\n", bat_idx);
+            printf("bytes_to_read: %d\n", bat_idx);
+            */
+            /* check the payload block state */
+            switch (s->bat[bat_idx].bitfield.bits.state) {
+            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
+            case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
+            case PAYLOAD_BLOCK_ZERO:
+                /* return zero */
+                memset(buf, 0, bytes_to_read);
+                break;
+            case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
+            case PAYLOAD_BLOCK_FULL_PRESENT:
+                offset = (block_offset << s->logical_sector_size_bits) +
+                         (s->bat[bat_idx].bitfield.bits.file_offset_mb
+                          * (1024 * 1024));
+                ret = bdrv_pread(bs->file, offset, buf, bytes_to_read);
+                if (ret != bytes_to_read) {
+                    ret = -1;
+                    goto exit;
+                }
+                break;
+            case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
+                /* we don't yet support difference files, fall through
+                 * to error */
+            default:
+                ret = -1;
+                goto exit;
+                break;
+            }
+            nb_sectors -= sectors_avail;
+            sector_num += sectors_avail;
+            buf += bytes_to_read;
+        }
+    }
+    ret = 0;
+exit:
+    return ret;
+}
+
+static int vhdx_write(BlockDriverState *bs, int64_t sector_num,
+    const uint8_t *buf, int nb_sectors)
+{
+   /* BDRVVHDXState *s = bs->opaque; */
+
+    /* TODO */
+
+    return 0;
+}
+
+static coroutine_fn int vhdx_co_write(BlockDriverState *bs, int64_t sector_num,
+                                     const uint8_t *buf, int nb_sectors)
+{
+    int ret;
+    BDRVVHDXState *s = bs->opaque;
+    qemu_co_mutex_lock(&s->lock);
+    ret = vhdx_write(bs, sector_num, buf, nb_sectors);
+    qemu_co_mutex_unlock(&s->lock);
+
+    /* TODO */
+
+    return ret;
+}
+
+
+static int vhdx_create(const char *filename, QEMUOptionParameter *options)
+{
+
+    /* TODO */
+
+   return 0;
+}
+
+static void vhdx_close(BlockDriverState *bs)
+{
+    BDRVVHDXState *s = bs->opaque;
+
+    /* TODO */
+
+    g_free(s->headers[0]);
+    g_free(s->headers[1]);
+    g_free(s->bat);
+}
+
+static QEMUOptionParameter vhdx_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_SUBFMT,
+        .type = OPT_STRING,
+        .help =
+            ""
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_vhdx = {
+    .format_name    = "vhdx",
+    .instance_size  = sizeof(BDRVVHDXState),
+
+    .bdrv_probe             = vhdx_probe,
+    .bdrv_open              = vhdx_open,
+    .bdrv_close             = vhdx_close,
+    .bdrv_reopen_prepare    = vhdx_reopen_prepare,
+    .bdrv_create            = vhdx_create,
+    .bdrv_read              = vhdx_read,
+    .bdrv_write             = vhdx_co_write,
+    .create_options         = vhdx_create_options,
+};
+
+static void bdrv_vhdx_init(void)
+{
+    bdrv_register(&bdrv_vhdx);
+}
+
+block_init(bdrv_vhdx_init);