Patchwork [v4,2/5] qed: Add QEMU Enhanced Disk image format

login
register
mail settings
Submitter Stefan Hajnoczi
Date Oct. 28, 2010, 11:01 a.m.
Message ID <1288263684-18892-3-git-send-email-stefanha@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/69460/
State New
Headers show

Comments

Stefan Hajnoczi - Oct. 28, 2010, 11:01 a.m.
This patch introduces the qed on-disk layout and implements image
creation.  Later patches add read/write and other functionality.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 Makefile.objs |    1 +
 block/qed.c   |  548 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/qed.h   |  145 +++++++++++++++
 block_int.h   |    1 +
 4 files changed, 695 insertions(+), 0 deletions(-)
 create mode 100644 block/qed.c
 create mode 100644 block/qed.h
Kevin Wolf - Nov. 12, 2010, 3:43 p.m.
Am 28.10.2010 13:01, schrieb Stefan Hajnoczi:
> This patch introduces the qed on-disk layout and implements image
> creation.  Later patches add read/write and other functionality.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
>  Makefile.objs |    1 +
>  block/qed.c   |  548 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qed.h   |  145 +++++++++++++++
>  block_int.h   |    1 +
>  4 files changed, 695 insertions(+), 0 deletions(-)
>  create mode 100644 block/qed.c
>  create mode 100644 block/qed.h
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index f07fb01..7bae72a 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -14,6 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>  
>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
> +block-nested-y += qed.o
>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>  block-nested-$(CONFIG_POSIX) += raw-posix.o
> diff --git a/block/qed.c b/block/qed.c
> new file mode 100644
> index 0000000..8469cf0
> --- /dev/null
> +++ b/block/qed.c
> @@ -0,0 +1,548 @@
> +/*
> + * QEMU Enhanced Disk Format
> + *
> + * Copyright IBM, Corp. 2010
> + *
> + * Authors:
> + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
> + *  Anthony Liguori   <aliguori@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#include "qed.h"
> +
> +static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
> +                          const char *filename)
> +{
> +    const QEDHeader *header = (const QEDHeader *)buf;
> +
> +    if (buf_size < sizeof(*header)) {
> +        return 0;
> +    }
> +    if (le32_to_cpu(header->magic) != QED_MAGIC) {
> +        return 0;
> +    }
> +    return 100;
> +}
> +
> +/**
> + * Check whether an image format is raw
> + *
> + * @fmt:    Backing file format, may be NULL
> + */
> +static bool qed_fmt_is_raw(const char *fmt)
> +{
> +    return fmt && strcmp(fmt, "raw") == 0;
> +}

People shouldn't use them directly, but should we also consider file,
host_device, etc.?

> +
> +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
> +{
> +    cpu->magic = le32_to_cpu(le->magic);
> +    cpu->cluster_size = le32_to_cpu(le->cluster_size);
> +    cpu->table_size = le32_to_cpu(le->table_size);
> +    cpu->header_size = le32_to_cpu(le->header_size);
> +    cpu->features = le64_to_cpu(le->features);
> +    cpu->compat_features = le64_to_cpu(le->compat_features);
> +    cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
> +    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
> +    cpu->image_size = le64_to_cpu(le->image_size);
> +    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
> +    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
> +}
> +
> +static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
> +{
> +    le->magic = cpu_to_le32(cpu->magic);
> +    le->cluster_size = cpu_to_le32(cpu->cluster_size);
> +    le->table_size = cpu_to_le32(cpu->table_size);
> +    le->header_size = cpu_to_le32(cpu->header_size);
> +    le->features = cpu_to_le64(cpu->features);
> +    le->compat_features = cpu_to_le64(cpu->compat_features);
> +    le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
> +    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
> +    le->image_size = cpu_to_le64(cpu->image_size);
> +    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
> +    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
> +}
> +
> +static int qed_write_header_sync(BDRVQEDState *s)
> +{
> +    QEDHeader le;
> +    int ret;
> +
> +    qed_header_cpu_to_le(&s->header, &le);
> +    ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
> +    if (ret != sizeof(le)) {
> +        return ret;
> +    }
> +    return 0;
> +}
> +
> +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
> +{
> +    uint64_t table_entries;
> +    uint64_t l2_size;
> +
> +    table_entries = (table_size * cluster_size) / sizeof(uint64_t);
> +    l2_size = table_entries * cluster_size;
> +
> +    return l2_size * table_entries;
> +}
> +
> +static bool qed_is_cluster_size_valid(uint32_t cluster_size)
> +{
> +    if (cluster_size < QED_MIN_CLUSTER_SIZE ||
> +        cluster_size > QED_MAX_CLUSTER_SIZE) {
> +        return false;
> +    }
> +    if (cluster_size & (cluster_size - 1)) {
> +        return false; /* not power of 2 */
> +    }
> +    return true;
> +}
> +
> +static bool qed_is_table_size_valid(uint32_t table_size)
> +{
> +    if (table_size < QED_MIN_TABLE_SIZE ||
> +        table_size > QED_MAX_TABLE_SIZE) {
> +        return false;
> +    }
> +    if (table_size & (table_size - 1)) {
> +        return false; /* not power of 2 */
> +    }
> +    return true;
> +}
> +
> +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
> +                                    uint32_t table_size)
> +{
> +    if (image_size % BDRV_SECTOR_SIZE != 0) {
> +        return false; /* not multiple of sector size */
> +    }
> +    if (image_size > qed_max_image_size(cluster_size, table_size)) {
> +        return false; /* image is too large */
> +    }
> +    return true;
> +}
> +
> +/**
> + * Read a string of known length from the image file
> + *
> + * @file:       Image file
> + * @offset:     File offset to start of string, in bytes
> + * @n:          String length in bytes
> + * @buf:        Destination buffer
> + * @buflen:     Destination buffer length in bytes
> + * @ret:        0 on success, -errno on failure
> + *
> + * The string is NUL-terminated.
> + */
> +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
> +                           char *buf, size_t buflen)
> +{
> +    int ret;
> +    if (n >= buflen) {
> +        return -EINVAL;
> +    }
> +    ret = bdrv_pread(file, offset, buf, n);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    buf[n] = '\0';
> +    return 0;
> +}
> +
> +static int bdrv_qed_open(BlockDriverState *bs, int flags)
> +{
> +    BDRVQEDState *s = bs->opaque;
> +    QEDHeader le_header;
> +    int64_t file_size;
> +    int ret;
> +
> +    s->bs = bs;
> +
> +    ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    ret = 0; /* ret should always be 0 or -errno */
> +    qed_header_le_to_cpu(&le_header, &s->header);
> +
> +    if (s->header.magic != QED_MAGIC) {
> +        return -ENOENT;
> +    }

ENOENT seems a bit odd for a wrong magic number, especially if it's used
for the error message. Wouldn't EINVAL or ENOTSUP be a closer match?

> +    if (s->header.features & ~QED_FEATURE_MASK) {
> +        return -ENOTSUP; /* image uses unsupported feature bits */
> +    }
> +    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
> +        return -EINVAL;
> +    }
> +
> +    /* Round down file size to the last cluster */
> +    file_size = bdrv_getlength(bs->file);
> +    if (file_size < 0) {
> +        return file_size;
> +    }
> +    s->file_size = qed_start_of_cluster(s, file_size);
> +
> +    if (!qed_is_table_size_valid(s->header.table_size)) {
> +        return -EINVAL;
> +    }
> +    if (!qed_is_image_size_valid(s->header.image_size,
> +                                 s->header.cluster_size,
> +                                 s->header.table_size)) {
> +        return -EINVAL;
> +    }
> +    if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
> +        return -EINVAL;
> +    }
> +
> +    s->table_nelems = (s->header.cluster_size * s->header.table_size) /
> +                      sizeof(uint64_t);
> +    s->l2_shift = ffs(s->header.cluster_size) - 1;
> +    s->l2_mask = s->table_nelems - 1;
> +    s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
> +
> +    if ((s->header.features & QED_F_BACKING_FILE)) {
> +        ret = qed_read_string(bs->file, s->header.backing_filename_offset,
> +                              s->header.backing_filename_size, bs->backing_file,
> +                              sizeof(bs->backing_file));
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
> +            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
> +        }
> +    }
> +
> +    /* Reset unknown autoclear feature bits.  This is a backwards
> +     * compatibility mechanism that allows images to be opened by older
> +     * programs, which "knock out" unknown feature bits.  When an image is
> +     * opened by a newer program again it can detect that the autoclear
> +     * feature is no longer valid.
> +     */
> +    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
> +        !bdrv_is_read_only(bs->file)) {
> +        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
> +
> +        ret = qed_write_header_sync(s);
> +        if (ret) {
> +            return ret;
> +        }
> +
> +        /* From here on only known autoclear feature bits are valid */
> +        bdrv_flush(bs->file);
> +    }
> +
> +    return ret;
> +}
> +
> +static void bdrv_qed_close(BlockDriverState *bs)
> +{
> +}
> +
> +static void bdrv_qed_flush(BlockDriverState *bs)
> +{
> +    bdrv_flush(bs->file);
> +}

This conflicts with one of my recent changes. bdrv_flush should return
an int now.

> +
> +static int qed_create(const char *filename, uint32_t cluster_size,
> +                      uint64_t image_size, uint32_t table_size,
> +                      const char *backing_file, const char *backing_fmt)
> +{
> +    QEDHeader header = {
> +        .magic = QED_MAGIC,
> +        .cluster_size = cluster_size,
> +        .table_size = table_size,
> +        .header_size = 1,
> +        .features = 0,
> +        .compat_features = 0,
> +        .l1_table_offset = cluster_size,
> +        .image_size = image_size,
> +    };
> +    QEDHeader le_header;
> +    uint8_t *l1_table = NULL;
> +    size_t l1_size = header.cluster_size * header.table_size;
> +    int ret = 0;
> +    BlockDriverState *bs = NULL;
> +
> +    ret = bdrv_create_file(filename, NULL);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    if (backing_file) {
> +        header.features |= QED_F_BACKING_FILE;
> +        header.backing_filename_offset = sizeof(le_header);
> +        header.backing_filename_size = strlen(backing_file);
> +
> +        if (qed_fmt_is_raw(backing_fmt)) {
> +            header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
> +        }
> +    }
> +
> +    qed_header_cpu_to_le(&header, &le_header);
> +    ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
> +                      header.backing_filename_size);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    l1_table = qemu_mallocz(l1_size);
> +    ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = 0; /* success */
> +out:
> +    qemu_free(l1_table);
> +    bdrv_delete(bs);
> +    return ret;
> +}
> +
> +static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
> +{
> +    uint64_t image_size = 0;
> +    uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
> +    uint32_t table_size = QED_DEFAULT_TABLE_SIZE;
> +    const char *backing_file = NULL;
> +    const char *backing_fmt = NULL;
> +
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            image_size = options->value.n;
> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
> +            backing_file = options->value.s;
> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
> +            backing_fmt = options->value.s;

I'm not sure here. It doesn't really matter for QED, but I find it
strange that -o backing_fmt=foobar works and gives you a non-raw backing
file. Should we check if the format exists at least?

Kevin
Stefan Hajnoczi - Nov. 12, 2010, 4:34 p.m.
On Fri, Nov 12, 2010 at 3:43 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 28.10.2010 13:01, schrieb Stefan Hajnoczi:
>> +/**
>> + * Check whether an image format is raw
>> + *
>> + * @fmt:    Backing file format, may be NULL
>> + */
>> +static bool qed_fmt_is_raw(const char *fmt)
>> +{
>> +    return fmt && strcmp(fmt, "raw") == 0;
>> +}
>
> People shouldn't use them directly, but should we also consider file,
> host_device, etc.?

Hrm... I will look into it for v5.  I thought we always have a "raw"
format on top of the "file", "host_device", etc. protocols?

>> +    if (s->header.magic != QED_MAGIC) {
>> +        return -ENOENT;
>> +    }
>
> ENOENT seems a bit odd for a wrong magic number, especially if it's used
> for the error message. Wouldn't EINVAL or ENOTSUP be a closer match?

You're right, ENOENT is confusing for the user.

>> +static void bdrv_qed_flush(BlockDriverState *bs)
>> +{
>> +    bdrv_flush(bs->file);
>> +}
>
> This conflicts with one of my recent changes. bdrv_flush should return
> an int now.

Will fix and will also check for bdrv_flush() failures.

>> +    while (options && options->name) {
>> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
>> +            image_size = options->value.n;
>> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
>> +            backing_file = options->value.s;
>> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
>> +            backing_fmt = options->value.s;
>
> I'm not sure here. It doesn't really matter for QED, but I find it
> strange that -o backing_fmt=foobar works and gives you a non-raw backing
> file. Should we check if the format exists at least?

I see the same issue in QCOW2 so let's solve this generically in a
separate patch.

Stefan
Kevin Wolf - Nov. 12, 2010, 4:52 p.m.
Am 12.11.2010 17:34, schrieb Stefan Hajnoczi:
> On Fri, Nov 12, 2010 at 3:43 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> Am 28.10.2010 13:01, schrieb Stefan Hajnoczi:
>>> +/**
>>> + * Check whether an image format is raw
>>> + *
>>> + * @fmt:    Backing file format, may be NULL
>>> + */
>>> +static bool qed_fmt_is_raw(const char *fmt)
>>> +{
>>> +    return fmt && strcmp(fmt, "raw") == 0;
>>> +}
>>
>> People shouldn't use them directly, but should we also consider file,
>> host_device, etc.?
> 
> Hrm..I will look into it for v5.  I thought we always have a "raw"
> format on top of "file", "host_device", etc protocols?

That's how it's meant to be used. I think at the moment we still allow
using file etc. directly, and you know that if something is possible,
users will do it.

But instead of listing all of them here, maybe we should just make the
final step and return an error if they are used directly. Unfortunately,
that will bring us back to the discussion about formats and protocols... ;-)

>>> +    if (s->header.magic != QED_MAGIC) {
>>> +        return -ENOENT;
>>> +    }
>>
>> ENOENT seems a bit odd for a wrong magic number, especially if it's used
>> for the error message. Wouldn't EINVAL or ENOTSUP be a closer match?
> 
> You're right, ENOENT is confusing for the user.
> 
>>> +static void bdrv_qed_flush(BlockDriverState *bs)
>>> +{
>>> +    bdrv_flush(bs->file);
>>> +}
>>
>> This conflicts with one of my recent changes. bdrv_flush should return
>> an int now.
> 
> Will fix and will also check for bdrv_flush() failures.
> 
>>> +    while (options && options->name) {
>>> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
>>> +            image_size = options->value.n;
>>> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
>>> +            backing_file = options->value.s;
>>> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
>>> +            backing_fmt = options->value.s;
>>
>> I'm not sure here. It doesn't really matter for QED, but I find it
>> strange that -o backing_fmt=foobar works and gives you a non-raw backing
>> file. Should we check if the format exists at least?
> 
> I see the same issue in QCOW2 so let's solve this generically in a
> separate patch.

Okay, that makes sense.

Kevin
Markus Armbruster - Nov. 12, 2010, 5:24 p.m.
Kevin Wolf <kwolf@redhat.com> writes:

> Am 12.11.2010 17:34, schrieb Stefan Hajnoczi:
>> On Fri, Nov 12, 2010 at 3:43 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>>> Am 28.10.2010 13:01, schrieb Stefan Hajnoczi:
>>>> +/**
>>>> + * Check whether an image format is raw
>>>> + *
>>>> + * @fmt:    Backing file format, may be NULL
>>>> + */
>>>> +static bool qed_fmt_is_raw(const char *fmt)
>>>> +{
>>>> +    return fmt && strcmp(fmt, "raw") == 0;
>>>> +}
>>>
>>> People shouldn't use them directly, but should we also consider file,
>>> host_device, etc.?
>> 
>> Hrm..I will look into it for v5.  I thought we always have a "raw"
>> format on top of "file", "host_device", etc protocols?
>
> That's how it's meant to be used. I think at the moment we still allow
> to directly use file etc. and you know that if something is possible,
> users will do it.

It's possible, it's perfectly well-defined, and it works.

> But instead of listing all of them here, maybe we should just make the
> final step and return an error if they are used directly.

No.

>                                                           Unfortunately,
> that will bring us back to the discussion about formats and protocols... ;-)

/me runs away screaming

[...]
Kevin Wolf - Nov. 12, 2010, 5:34 p.m.
Am 12.11.2010 18:24, schrieb Markus Armbruster:
> Kevin Wolf <kwolf@redhat.com> writes:
> 
>> Am 12.11.2010 17:34, schrieb Stefan Hajnoczi:
>>> On Fri, Nov 12, 2010 at 3:43 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>>>> Am 28.10.2010 13:01, schrieb Stefan Hajnoczi:
>>>>> +/**
>>>>> + * Check whether an image format is raw
>>>>> + *
>>>>> + * @fmt:    Backing file format, may be NULL
>>>>> + */
>>>>> +static bool qed_fmt_is_raw(const char *fmt)
>>>>> +{
>>>>> +    return fmt && strcmp(fmt, "raw") == 0;
>>>>> +}
>>>>
>>>> People shouldn't use them directly, but should we also consider file,
>>>> host_device, etc.?
>>>
>>> Hrm..I will look into it for v5.  I thought we always have a "raw"
>>> format on top of "file", "host_device", etc protocols?
>>
>> That's how it's meant to be used. I think at the moment we still allow
>> to directly use file etc. and you know that if something is possible,
>> users will do it.
> 
> It's possible, it's perfectly well-defined, and it works.

Wow, I'm not used to being confirmed that quickly!

Yes, it's possible, it works, but it's only exposed because we didn't
pay attention when host_* was split out. There's no real reason to use
it (or do you have any example where format=raw doesn't work?) and it
should be considered an implementation detail.

Kevin

Patch

diff --git a/Makefile.objs b/Makefile.objs
index f07fb01..7bae72a 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -14,6 +14,7 @@  block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
+block-nested-y += qed.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
diff --git a/block/qed.c b/block/qed.c
new file mode 100644
index 0000000..8469cf0
--- /dev/null
+++ b/block/qed.c
@@ -0,0 +1,548 @@ 
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
+                          const char *filename)
+{
+    const QEDHeader *header = (const QEDHeader *)buf;
+
+    if (buf_size < sizeof(*header)) {
+        return 0;
+    }
+    if (le32_to_cpu(header->magic) != QED_MAGIC) {
+        return 0;
+    }
+    return 100;
+}
+
+/**
+ * Check whether an image format is raw
+ *
+ * @fmt:    Backing file format, may be NULL
+ */
+static bool qed_fmt_is_raw(const char *fmt)
+{
+    return fmt && strcmp(fmt, "raw") == 0;
+}
+
+static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
+{
+    cpu->magic = le32_to_cpu(le->magic);
+    cpu->cluster_size = le32_to_cpu(le->cluster_size);
+    cpu->table_size = le32_to_cpu(le->table_size);
+    cpu->header_size = le32_to_cpu(le->header_size);
+    cpu->features = le64_to_cpu(le->features);
+    cpu->compat_features = le64_to_cpu(le->compat_features);
+    cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
+    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
+    cpu->image_size = le64_to_cpu(le->image_size);
+    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
+    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
+}
+
+static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
+{
+    le->magic = cpu_to_le32(cpu->magic);
+    le->cluster_size = cpu_to_le32(cpu->cluster_size);
+    le->table_size = cpu_to_le32(cpu->table_size);
+    le->header_size = cpu_to_le32(cpu->header_size);
+    le->features = cpu_to_le64(cpu->features);
+    le->compat_features = cpu_to_le64(cpu->compat_features);
+    le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
+    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
+    le->image_size = cpu_to_le64(cpu->image_size);
+    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
+    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
+}
+
+static int qed_write_header_sync(BDRVQEDState *s)
+{
+    QEDHeader le;
+    int ret;
+
+    qed_header_cpu_to_le(&s->header, &le);
+    ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
+    if (ret != sizeof(le)) {
+        return ret;
+    }
+    return 0;
+}
+
+static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
+{
+    uint64_t table_entries;
+    uint64_t l2_size;
+
+    table_entries = (table_size * cluster_size) / sizeof(uint64_t);
+    l2_size = table_entries * cluster_size;
+
+    return l2_size * table_entries;
+}
+
+static bool qed_is_cluster_size_valid(uint32_t cluster_size)
+{
+    if (cluster_size < QED_MIN_CLUSTER_SIZE ||
+        cluster_size > QED_MAX_CLUSTER_SIZE) {
+        return false;
+    }
+    if (cluster_size & (cluster_size - 1)) {
+        return false; /* not power of 2 */
+    }
+    return true;
+}
+
+static bool qed_is_table_size_valid(uint32_t table_size)
+{
+    if (table_size < QED_MIN_TABLE_SIZE ||
+        table_size > QED_MAX_TABLE_SIZE) {
+        return false;
+    }
+    if (table_size & (table_size - 1)) {
+        return false; /* not power of 2 */
+    }
+    return true;
+}
+
+static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
+                                    uint32_t table_size)
+{
+    if (image_size % BDRV_SECTOR_SIZE != 0) {
+        return false; /* not multiple of sector size */
+    }
+    if (image_size > qed_max_image_size(cluster_size, table_size)) {
+        return false; /* image is too large */
+    }
+    return true;
+}
+
+/**
+ * Read a string of known length from the image file
+ *
+ * @file:       Image file
+ * @offset:     File offset to start of string, in bytes
+ * @n:          String length in bytes
+ * @buf:        Destination buffer
+ * @buflen:     Destination buffer length in bytes
+ * @ret:        0 on success, -errno on failure
+ *
+ * The string is NUL-terminated.
+ */
+static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
+                           char *buf, size_t buflen)
+{
+    int ret;
+    if (n >= buflen) {
+        return -EINVAL;
+    }
+    ret = bdrv_pread(file, offset, buf, n);
+    if (ret < 0) {
+        return ret;
+    }
+    buf[n] = '\0';
+    return 0;
+}
+
+static int bdrv_qed_open(BlockDriverState *bs, int flags)
+{
+    BDRVQEDState *s = bs->opaque;
+    QEDHeader le_header;
+    int64_t file_size;
+    int ret;
+
+    s->bs = bs;
+
+    ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
+    if (ret < 0) {
+        return ret;
+    }
+    ret = 0; /* ret should always be 0 or -errno */
+    qed_header_le_to_cpu(&le_header, &s->header);
+
+    if (s->header.magic != QED_MAGIC) {
+        return -ENOENT;
+    }
+    if (s->header.features & ~QED_FEATURE_MASK) {
+        return -ENOTSUP; /* image uses unsupported feature bits */
+    }
+    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
+        return -EINVAL;
+    }
+
+    /* Round down file size to the last cluster */
+    file_size = bdrv_getlength(bs->file);
+    if (file_size < 0) {
+        return file_size;
+    }
+    s->file_size = qed_start_of_cluster(s, file_size);
+
+    if (!qed_is_table_size_valid(s->header.table_size)) {
+        return -EINVAL;
+    }
+    if (!qed_is_image_size_valid(s->header.image_size,
+                                 s->header.cluster_size,
+                                 s->header.table_size)) {
+        return -EINVAL;
+    }
+    if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
+        return -EINVAL;
+    }
+
+    s->table_nelems = (s->header.cluster_size * s->header.table_size) /
+                      sizeof(uint64_t);
+    s->l2_shift = ffs(s->header.cluster_size) - 1;
+    s->l2_mask = s->table_nelems - 1;
+    s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
+
+    if ((s->header.features & QED_F_BACKING_FILE)) {
+        ret = qed_read_string(bs->file, s->header.backing_filename_offset,
+                              s->header.backing_filename_size, bs->backing_file,
+                              sizeof(bs->backing_file));
+        if (ret < 0) {
+            return ret;
+        }
+
+        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
+            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
+        }
+    }
+
+    /* Reset unknown autoclear feature bits.  This is a backwards
+     * compatibility mechanism that allows images to be opened by older
+     * programs, which "knock out" unknown feature bits.  When an image is
+     * opened by a newer program again it can detect that the autoclear
+     * feature is no longer valid.
+     */
+    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
+        !bdrv_is_read_only(bs->file)) {
+        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
+
+        ret = qed_write_header_sync(s);
+        if (ret) {
+            return ret;
+        }
+
+        /* From here on only known autoclear feature bits are valid */
+        bdrv_flush(bs->file);
+    }
+
+    return ret;
+}
+
+static void bdrv_qed_close(BlockDriverState *bs)
+{
+}
+
+static void bdrv_qed_flush(BlockDriverState *bs)
+{
+    bdrv_flush(bs->file);
+}
+
+static int qed_create(const char *filename, uint32_t cluster_size,
+                      uint64_t image_size, uint32_t table_size,
+                      const char *backing_file, const char *backing_fmt)
+{
+    QEDHeader header = {
+        .magic = QED_MAGIC,
+        .cluster_size = cluster_size,
+        .table_size = table_size,
+        .header_size = 1,
+        .features = 0,
+        .compat_features = 0,
+        .l1_table_offset = cluster_size,
+        .image_size = image_size,
+    };
+    QEDHeader le_header;
+    uint8_t *l1_table = NULL;
+    size_t l1_size = header.cluster_size * header.table_size;
+    int ret = 0;
+    BlockDriverState *bs = NULL;
+
+    ret = bdrv_create_file(filename, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (backing_file) {
+        header.features |= QED_F_BACKING_FILE;
+        header.backing_filename_offset = sizeof(le_header);
+        header.backing_filename_size = strlen(backing_file);
+
+        if (qed_fmt_is_raw(backing_fmt)) {
+            header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
+        }
+    }
+
+    qed_header_cpu_to_le(&header, &le_header);
+    ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
+    if (ret < 0) {
+        goto out;
+    }
+    ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
+                      header.backing_filename_size);
+    if (ret < 0) {
+        goto out;
+    }
+
+    l1_table = qemu_mallocz(l1_size);
+    ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = 0; /* success */
+out:
+    qemu_free(l1_table);
+    bdrv_delete(bs);
+    return ret;
+}
+
+/**
+ * Create a new QED image from creation options
+ *
+ * @filename:   Name of the image file to create
+ * @options:    Creation options, terminated by an entry whose name is NULL
+ *              (may itself be NULL)
+ *
+ * Recognized options are size, backing file, backing format, cluster size and
+ * table size; unrecognized options are ignored.  A cluster size or table size
+ * of zero leaves the default in place.
+ *
+ * Returns -EINVAL, after printing a message to stderr, if the cluster size,
+ * table size or image size is rejected.  Otherwise returns the result of
+ * qed_create().
+ */
+static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
+{
+    /* Sizes stay 64-bit until they have been range checked.  Narrowing an
+     * option value to 32 bits first would let an oversized value wrap around
+     * to one that looks valid.
+     */
+    uint64_t image_size = 0;
+    uint64_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
+    uint64_t table_size = QED_DEFAULT_TABLE_SIZE;
+    const char *backing_file = NULL;
+    const char *backing_fmt = NULL;
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            image_size = options->value.n;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_file = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
+            backing_fmt = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+            if (options->value.n) {
+                cluster_size = options->value.n;
+            }
+        } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) {
+            if (options->value.n) {
+                table_size = options->value.n;
+            }
+        }
+        options++;
+    }
+
+    /* The upper bound is tested here, in 64 bits, before the value is handed
+     * to helpers that take narrower arguments.
+     */
+    if (cluster_size > QED_MAX_CLUSTER_SIZE ||
+        !qed_is_cluster_size_valid(cluster_size)) {
+        fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n",
+                QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
+        return -EINVAL;
+    }
+    if (table_size > QED_MAX_TABLE_SIZE ||
+        !qed_is_table_size_valid(table_size)) {
+        fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n",
+                QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
+        return -EINVAL;
+    }
+    if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) {
+        fprintf(stderr, "QED image size must be a non-zero multiple of "
+                        "cluster size and less than %" PRIu64 " bytes\n",
+                qed_max_image_size(cluster_size, table_size));
+        return -EINVAL;
+    }
+
+    return qed_create(filename, cluster_size, image_size, table_size,
+                      backing_file, backing_fmt);
+}
+
+/**
+ * Allocation status query (placeholder)
+ *
+ * No arguments are examined and *pnum is left untouched; the function always
+ * returns -ENOTSUP.
+ */
+static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                                  int nb_sectors, int *pnum)
+{
+    return -ENOTSUP;
+}
+
+/**
+ * Empty the image (placeholder)
+ *
+ * The image is not modified; the function always returns -ENOTSUP.
+ */
+static int bdrv_qed_make_empty(BlockDriverState *bs)
+{
+    return -ENOTSUP;
+}
+
+/**
+ * Asynchronous vectored read (placeholder)
+ *
+ * No request is started and this function never invokes @cb; it always
+ * returns NULL.  The patch description states that read/write support is
+ * added by later patches.
+ */
+static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
+                                            int64_t sector_num,
+                                            QEMUIOVector *qiov, int nb_sectors,
+                                            BlockDriverCompletionFunc *cb,
+                                            void *opaque)
+{
+    return NULL;
+}
+
+/**
+ * Asynchronous vectored write (placeholder)
+ *
+ * No request is started and this function never invokes @cb; it always
+ * returns NULL.  The patch description states that read/write support is
+ * added by later patches.
+ */
+static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
+                                             int64_t sector_num,
+                                             QEMUIOVector *qiov, int nb_sectors,
+                                             BlockDriverCompletionFunc *cb,
+                                             void *opaque)
+{
+    return NULL;
+}
+
+/**
+ * Asynchronous flush
+ *
+ * The request is forwarded unchanged to bs->file via bdrv_aio_flush(), and
+ * that call's return value is handed back to the caller.
+ */
+static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs,
+                                            BlockDriverCompletionFunc *cb,
+                                            void *opaque)
+{
+    return bdrv_aio_flush(bs->file, cb, opaque);
+}
+
+/**
+ * Resize the image (placeholder)
+ *
+ * @offset is ignored and the image is not modified; the function always
+ * returns -ENOTSUP.
+ */
+static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset)
+{
+    return -ENOTSUP;
+}
+
+/**
+ * Report the image length
+ *
+ * Returns the image_size field of the in-memory header, which the header
+ * definition describes as the total logical image size in bytes.
+ */
+static int64_t bdrv_qed_getlength(BlockDriverState *bs)
+{
+    const BDRVQEDState *state = bs->opaque;
+
+    return state->header.image_size;
+}
+
+/**
+ * Fill in driver information
+ *
+ * @bdi is zeroed in full and then its cluster_size is set from the in-memory
+ * header.  Always returns 0.
+ */
+static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    const BDRVQEDState *state = bs->opaque;
+
+    /* Every field other than cluster_size is reported as zero */
+    memset(bdi, 0, sizeof(*bdi));
+    bdi->cluster_size = state->header.cluster_size;
+
+    return 0;
+}
+
+/**
+ * Change the backing file recorded in the image header
+ *
+ * @bs:             Image to update
+ * @backing_file:   New backing filename, or NULL to record no backing file
+ * @backing_fmt:    Backing file format name, passed to qed_fmt_is_raw() to
+ *                  decide whether the no-probe feature bit is set
+ *
+ * The header and the filename that follows it are rewritten with a single
+ * bdrv_pwrite_sync() call at offset 0.  The in-memory header is only updated
+ * once that write has returned 0.
+ *
+ * Returns 0 on success, -ENOTSUP if a filename is given while unknown compat
+ * feature bits are set, -EINVAL if the filename length does not fit the
+ * 32-bit header field, -ENOSPC if header plus filename exceed the header
+ * clusters, or the error returned by bdrv_pwrite_sync().
+ */
+static int bdrv_qed_change_backing_file(BlockDriverState *bs,
+                                        const char *backing_file,
+                                        const char *backing_fmt)
+{
+    BDRVQEDState *s = bs->opaque;
+    QEDHeader new_header, le_header;
+    uint8_t *buffer;
+    size_t buffer_len, backing_file_len;
+    int ret;
+
+    /* Refuse to set backing filename if unknown compat feature bits are
+     * active.  If the image uses an unknown compat feature then we may not
+     * know the layout of data following the header structure and cannot safely
+     * add a new string.
+     */
+    if (backing_file && (s->header.compat_features &
+                         ~QED_COMPAT_FEATURE_MASK)) {
+        return -ENOTSUP;
+    }
+
+    memcpy(&new_header, &s->header, sizeof(new_header));
+
+    new_header.features &= ~(QED_F_BACKING_FILE |
+                             QED_F_BACKING_FORMAT_NO_PROBE);
+
+    /* Adjust feature flags */
+    if (backing_file) {
+        new_header.features |= QED_F_BACKING_FILE;
+
+        if (qed_fmt_is_raw(backing_fmt)) {
+            new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
+        }
+    }
+
+    /* Calculate new header size */
+    backing_file_len = 0;
+
+    if (backing_file) {
+        backing_file_len = strlen(backing_file);
+    }
+
+    /* The length is stored in a 32-bit header field; do not truncate it */
+    if ((uint32_t)backing_file_len != backing_file_len) {
+        return -EINVAL;
+    }
+
+    buffer_len = sizeof(new_header);
+    new_header.backing_filename_offset = buffer_len;
+    new_header.backing_filename_size = backing_file_len;
+    buffer_len += backing_file_len;
+
+    /* Make sure we can rewrite header without failing.  The capacity is
+     * computed in 64 bits so the product of two 32-bit fields cannot wrap.
+     */
+    if (buffer_len > (uint64_t)new_header.header_size *
+                     new_header.cluster_size) {
+        return -ENOSPC;
+    }
+
+    /* Prepare new header */
+    buffer = qemu_malloc(buffer_len);
+
+    qed_header_cpu_to_le(&new_header, &le_header);
+    memcpy(buffer, &le_header, sizeof(le_header));
+    buffer_len = sizeof(le_header);
+
+    /* memcpy() must not be given a NULL source, even for zero bytes */
+    if (backing_file) {
+        memcpy(buffer + buffer_len, backing_file, backing_file_len);
+        buffer_len += backing_file_len;
+    }
+
+    /* Write new header */
+    ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
+    qemu_free(buffer);
+    if (ret == 0) {
+        memcpy(&s->header, &new_header, sizeof(new_header));
+    }
+    return ret;
+}
+
+/**
+ * Consistency check (placeholder)
+ *
+ * @result is left untouched; the function always returns -ENOTSUP.
+ */
+static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result)
+{
+    return -ENOTSUP;
+}
+
+/* Options accepted when creating an image.  These are the option names that
+ * bdrv_qed_create() looks for; the list ends with an all-zero entry.
+ */
+static QEMUOptionParameter qed_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size (in bytes)"
+    }, {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    }, {
+        .name = BLOCK_OPT_BACKING_FMT,
+        .type = OPT_STRING,
+        .help = "Image format of the base image"
+    }, {
+        .name = BLOCK_OPT_CLUSTER_SIZE,
+        .type = OPT_SIZE,
+        .help = "Cluster size (in bytes)"
+    }, {
+        .name = BLOCK_OPT_TABLE_SIZE,
+        .type = OPT_SIZE,
+        .help = "L1/L2 table size (in clusters)"
+    },
+    { /* end of list */ }
+};
+
+/* Driver description for the "qed" format: per-image state size, creation
+ * options and the callbacks implemented in this file.  It is passed to
+ * bdrv_register() by bdrv_qed_init().
+ */
+static BlockDriver bdrv_qed = {
+    .format_name              = "qed",
+    .instance_size            = sizeof(BDRVQEDState),
+    .create_options           = qed_create_options,
+
+    .bdrv_probe               = bdrv_qed_probe,
+    .bdrv_open                = bdrv_qed_open,
+    .bdrv_close               = bdrv_qed_close,
+    .bdrv_create              = bdrv_qed_create,
+    .bdrv_flush               = bdrv_qed_flush,
+    .bdrv_is_allocated        = bdrv_qed_is_allocated,
+    .bdrv_make_empty          = bdrv_qed_make_empty,
+    .bdrv_aio_readv           = bdrv_qed_aio_readv,
+    .bdrv_aio_writev          = bdrv_qed_aio_writev,
+    .bdrv_aio_flush           = bdrv_qed_aio_flush,
+    .bdrv_truncate            = bdrv_qed_truncate,
+    .bdrv_getlength           = bdrv_qed_getlength,
+    .bdrv_get_info            = bdrv_qed_get_info,
+    .bdrv_change_backing_file = bdrv_qed_change_backing_file,
+    .bdrv_check               = bdrv_qed_check,
+};
+
+/* Register the QED driver description with the block layer */
+static void bdrv_qed_init(void)
+{
+    bdrv_register(&bdrv_qed);
+}
+
+/* NOTE(review): block_init() presumably arranges for bdrv_qed_init() to run
+ * during block layer start-up -- confirm against its definition.
+ */
+block_init(bdrv_qed_init);
diff --git a/block/qed.h b/block/qed.h
new file mode 100644
index 0000000..b53e9a0
--- /dev/null
+++ b/block/qed.h
@@ -0,0 +1,145 @@ 
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_QED_H
+#define BLOCK_QED_H
+
+#include "block_int.h"
+
+/* The layout of a QED file is as follows:
+ *
+ * +--------+----------+----------+----------+-----+
+ * | header | L1 table | cluster0 | cluster1 | ... |
+ * +--------+----------+----------+----------+-----+
+ *
+ * There is a 2-level pagetable for cluster allocation:
+ *
+ *                     +----------+
+ *                     | L1 table |
+ *                     +----------+
+ *                ,------'  |  '------.
+ *           +----------+   |    +----------+
+ *           | L2 table |  ...   | L2 table |
+ *           +----------+        +----------+
+ *       ,------'  |  '------.
+ *  +----------+   |    +----------+
+ *  |   Data   |  ...   |   Data   |
+ *  +----------+        +----------+
+ *
+ * The L1 table is fixed size and always present.  L2 tables are allocated on
+ * demand.  The L1 table size determines the maximum possible image size; it
+ * can be influenced using the cluster_size and table_size values.
+ *
+ * All fields are little-endian on disk.
+ */
+
+/* Format constants: magic number, feature bits and the limits and defaults
+ * for cluster size and table size.
+ */
+enum {
+    /* Header magic: the bytes 'Q', 'E', 'D', '\0' when stored little-endian */
+    QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24,
+
+    /* The image supports a backing file */
+    QED_F_BACKING_FILE = 0x01,
+
+    /* The backing file format must not be probed, treat as raw image */
+    QED_F_BACKING_FORMAT_NO_PROBE = 0x04,
+
+    /* Feature bits must be used when the on-disk format changes */
+    QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */
+                       QED_F_BACKING_FORMAT_NO_PROBE,
+    QED_COMPAT_FEATURE_MASK = 0,            /* supported compat feature bits */
+    QED_AUTOCLEAR_FEATURE_MASK = 0,         /* supported autoclear feature bits */
+
+    /* Data is stored in groups of sectors called clusters.  Cluster size must
+     * be large to avoid keeping too much metadata.  I/O requests that have
+     * sub-cluster size will require read-modify-write.
+     */
+    QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */
+    QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024,
+    QED_DEFAULT_CLUSTER_SIZE = 64 * 1024,
+
+    /* Allocated clusters are tracked using a 2-level pagetable.  Table size is
+     * a multiple of clusters so large maximum image sizes can be supported
+     * without jacking up the cluster size too much.
+     */
+    QED_MIN_TABLE_SIZE = 1,        /* in clusters */
+    QED_MAX_TABLE_SIZE = 16,
+    QED_DEFAULT_TABLE_SIZE = 4,
+};
+
+/* Image header.  Fields are little-endian on disk; the copy held in
+ * BDRVQEDState is always cpu-endian.
+ */
+typedef struct {
+    uint32_t magic;                 /* QED\0 */
+
+    uint32_t cluster_size;          /* in bytes */
+    uint32_t table_size;            /* for L1 and L2 tables, in clusters */
+    uint32_t header_size;           /* in clusters */
+
+    uint64_t features;              /* format feature bits */
+    uint64_t compat_features;       /* compatible feature bits */
+    uint64_t autoclear_features;    /* self-resetting feature bits */
+
+    uint64_t l1_table_offset;       /* in bytes */
+    uint64_t image_size;            /* total logical image size, in bytes */
+
+    /* if (features & QED_F_BACKING_FILE) */
+    uint32_t backing_filename_offset; /* in bytes from start of header */
+    uint32_t backing_filename_size;   /* in bytes */
+} QEDHeader;
+
+/* Per-image driver state, reached through bs->opaque */
+typedef struct {
+    BlockDriverState *bs;           /* device */
+    uint64_t file_size;             /* length of image file, in bytes */
+
+    QEDHeader header;               /* always cpu-endian */
+
+    /* NOTE(review): the fields below are presumably derived from the header
+     * when the image is opened (entries per table, and shift/mask values for
+     * L1/L2 index lookup) -- confirm against the code that sets them.
+     */
+    uint32_t table_nelems;
+    uint32_t l1_shift;
+    uint32_t l2_shift;
+    uint32_t l2_mask;
+} BDRVQEDState;
+
+/**
+ * Round an offset down to a cluster boundary
+ *
+ * Clears the bits of @offset that are set in (cluster size - 1).
+ */
+static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset)
+{
+    const uint64_t in_cluster_mask = s->header.cluster_size - 1;
+
+    return offset & ~in_cluster_mask;
+}
+
+/**
+ * Test if a cluster offset is valid
+ *
+ * An offset is accepted when it has no bits set in (cluster size - 1), does
+ * not lie within the header clusters and is below the image file length.
+ */
+static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset)
+{
+    const uint64_t in_cluster_mask = s->header.cluster_size - 1;
+    const uint64_t first_usable = (uint64_t)s->header.header_size *
+                                  s->header.cluster_size;
+
+    if ((offset & in_cluster_mask) != 0) {
+        return false;
+    }
+    if (offset < first_usable) {
+        return false;
+    }
+    return offset < s->file_size;
+}
+
+/**
+ * Test if a table offset is valid
+ *
+ * A table occupies table_size consecutive clusters starting at @offset.  The
+ * offset is accepted when both the first and the last cluster of the table
+ * pass qed_check_cluster_offset().
+ */
+static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset)
+{
+    uint64_t last_cluster_delta;
+    uint64_t end_offset;
+
+    /* A table of zero clusters is never valid, and would underflow below */
+    if (s->header.table_size == 0) {
+        return false;
+    }
+
+    /* Distance from the first to the last cluster, computed in 64 bits so
+     * the product of two 32-bit header fields cannot wrap.
+     */
+    last_cluster_delta = (uint64_t)(s->header.table_size - 1) *
+                         s->header.cluster_size;
+    end_offset = offset + last_cluster_delta;
+
+    /* Overflow check.  Equality is legitimate: a single-cluster table
+     * (table_size == QED_MIN_TABLE_SIZE) starts and ends in the same cluster.
+     */
+    if (end_offset < offset) {
+        return false;
+    }
+
+    return qed_check_cluster_offset(s, offset) &&
+           qed_check_cluster_offset(s, end_offset);
+}
+
+#endif /* BLOCK_QED_H */
diff --git a/block_int.h b/block_int.h
index 87e60b8..fa58000 100644
--- a/block_int.h
+++ b/block_int.h
@@ -38,6 +38,7 @@ 
 #define BLOCK_OPT_BACKING_FILE  "backing_file"
 #define BLOCK_OPT_BACKING_FMT   "backing_fmt"
 #define BLOCK_OPT_CLUSTER_SIZE  "cluster_size"
+#define BLOCK_OPT_TABLE_SIZE    "table_size"
 #define BLOCK_OPT_PREALLOC      "preallocation"
 
 typedef struct AIOPool {