Message ID | 1283767478-16740-1-git-send-email-stefanha@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
On 06.09.2010, at 12:04, Stefan Hajnoczi wrote: > QEMU Enhanced Disk format is a disk image format that forgoes features > found in qcow2 in favor of better levels of performance and data > integrity. Due to its simpler on-disk layout, it is possible to safely > perform metadata updates more efficiently. > > Installations, suspend-to-disk, and other allocation-heavy I/O workloads > will see increased performance due to fewer I/Os and syncs. Workloads > that do not cause new clusters to be allocated will perform similar to > raw images due to in-memory metadata caching. > > The format supports sparse disk images. It does not rely on the host > filesystem holes feature, making it a good choice for sparse disk images > that need to be transferred over channels where holes are not supported. > > Backing files are supported so only deltas against a base image can be > stored. > > The file format is extensible so that additional features can be added > later with graceful compatibility handling. > > Internal snapshots are not supported. This eliminates the need for > additional metadata to track copy-on-write clusters. > > Compression and encryption are not supported. They add complexity and > can be implemented at other layers in the stack (i.e. inside the guest > or on the host). > > The format is currently functional with the following features missing: > * Resizing the disk image. The capability has been designed in but the > code has not been written yet. > * Resetting the image after backing file commit completes. > * Changing the backing filename. > * Consistency check (fsck). This is simple due to the on-disk layout. Yippie - yet another disk format :). Let's hope this one survives. 
> > Signed-off-by: Anthony Liguori <aliguori@us.ibm.com> > Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> > --- > This code is also available from git (for development and testing the tracing > and blkverify features are pulled in, whereas this single squashed patch > applies to mainline qemu.git): > > http://repo.or.cz/w/qemu/stefanha.git/shortlog/refs/heads/qed > just looked at it and stumbled over two simple nits. [snip] > +/** > + * Get the number of bits for a power of 2 > + * > + * The following is true for powers of 2: > + * n == 1 << get_bits_from_size(n) > + */ > +int get_bits_from_size(size_t size) > +{ > + int res = 0; > + > + if (size == 0) { > + return -1; > + } > + > + while (size != 1) { > + /* Not a power of two */ > + if (size & 1) { > + return -1; > + } > + > + size >>= 1; > + res++; > + } > + > + return res; > +} Should be an extra patch - it doesn't hurt to send an RFC patch set. This thing is so big that it's no fun to review :). > + > +const char *bytes_to_str(uint64_t size) > +{ > + static char buffer[64]; > + > + if (size < (1ULL << 10)) { > + snprintf(buffer, sizeof(buffer), "%" PRIu64 " byte(s)", size); > + } else if (size < (1ULL << 20)) { > + snprintf(buffer, sizeof(buffer), "%" PRIu64 " KB(s)", size >> 10); > + } else if (size < (1ULL << 30)) { > + snprintf(buffer, sizeof(buffer), "%" PRIu64 " MB(s)", size >> 20); > + } else if (size < (1ULL << 40)) { > + snprintf(buffer, sizeof(buffer), "%" PRIu64 " GB(s)", size >> 30); > + } else { > + snprintf(buffer, sizeof(buffer), "%" PRIu64 " TB(s)", size >> 40); > + } > + > + return buffer; This returns a variable from the stack! Please make the target buffer caller defined. 
> +} > diff --git a/qemu-common.h b/qemu-common.h > index dfd3dc0..754b107 100644 > --- a/qemu-common.h > +++ b/qemu-common.h > @@ -137,6 +137,8 @@ time_t mktimegm(struct tm *tm); > int qemu_fls(int i); > int qemu_fdatasync(int fd); > int fcntl_setfl(int fd, int flag); > +int get_bits_from_size(size_t size); > +const char *bytes_to_str(uint64_t size); > > /* path.c */ > void init_paths(const char *prefix); > @@ -283,6 +285,7 @@ void qemu_iovec_destroy(QEMUIOVector *qiov); > void qemu_iovec_reset(QEMUIOVector *qiov); > void qemu_iovec_to_buffer(QEMUIOVector *qiov, void *buf); > void qemu_iovec_from_buffer(QEMUIOVector *qiov, const void *buf, size_t count); > +void qemu_iovec_zero(QEMUIOVector *qiov); separate patch please. Alex
Am 06.09.2010 12:04, schrieb Stefan Hajnoczi: > QEMU Enhanced Disk format is a disk image format that forgoes features > found in qcow2 in favor of better levels of performance and data > integrity. Due to its simpler on-disk layout, it is possible to safely > perform metadata updates more efficiently. > > Installations, suspend-to-disk, and other allocation-heavy I/O workloads > will see increased performance due to fewer I/Os and syncs. Workloads > that do not cause new clusters to be allocated will perform similar to > raw images due to in-memory metadata caching. > > The format supports sparse disk images. It does not rely on the host > filesystem holes feature, making it a good choice for sparse disk images > that need to be transferred over channels where holes are not supported. > > Backing files are supported so only deltas against a base image can be > stored. > > The file format is extensible so that additional features can be added > later with graceful compatibility handling. > > Internal snapshots are not supported. This eliminates the need for > additional metadata to track copy-on-write clusters. > > Compression and encryption are not supported. They add complexity and > can be implemented at other layers in the stack (i.e. inside the guest > or on the host). > > The format is currently functional with the following features missing: > * Resizing the disk image. The capability has been designed in but the > code has not been written yet. > * Resetting the image after backing file commit completes. > * Changing the backing filename. > * Consistency check (fsck). This is simple due to the on-disk layout. > > Signed-off-by: Anthony Liguori <aliguori@us.ibm.com> > Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> Okay, so before I actually look at the patch longer than a couple of seconds let me just ask the obvious question... Before inventing yet another image format, you certainly have checked the existing ones. 
Except for not implementing compression and encryption this looks a lot like qcow1 to me. I see that you even retained the two-level cluster tables. So if we ignore the implementation for a moment and just compare the formats, what's the crucial difference between qcow1 and qed that I'm missing? And if it's not qcow1, why not improve our support for another existing format like VHD? Kevin
On Mon, Sep 6, 2010 at 11:25 AM, Alexander Graf <agraf@suse.de> wrote:
> Should be an extra patch - it doesn't hurt to send an RFC patch set. This thing is so big that it's no fun to review :).
I'll start consolidating commits so the next round will be easier to review.
Stefan
On Mon, Sep 06, 2010 at 11:04:38AM +0100, Stefan Hajnoczi wrote: > QEMU Enhanced Disk format is a disk image format that forgoes features > found in qcow2 in favor of better levels of performance and data > integrity. Due to its simpler on-disk layout, it is possible to safely > perform metadata updates more efficiently. > > Installations, suspend-to-disk, and other allocation-heavy I/O workloads > will see increased performance due to fewer I/Os and syncs. Workloads > that do not cause new clusters to be allocated will perform similar to > raw images due to in-memory metadata caching. > > The format supports sparse disk images. It does not rely on the host > filesystem holes feature, making it a good choice for sparse disk images > that need to be transferred over channels where holes are not supported. > > Backing files are supported so only deltas against a base image can be > stored. > > The file format is extensible so that additional features can be added > later with graceful compatibility handling. > > Internal snapshots are not supported. This eliminates the need for > additional metadata to track copy-on-write clusters. > > Compression and encryption are not supported. They add complexity and > can be implemented at other layers in the stack (i.e. inside the guest > or on the host). I agree with ditching compression, but encryption is an important capability which cannot be satisfactorily added at other layers in the stack. While block devices / local filesystems can layer in dm-crypt in the host, this is not possible with network/cluster filesystems which account for a non-trivial target audience. Adding encryption inside the guest is sub-optimal because you cannot do secure automation of guest startup. Either you require manual intervention to start every guest to enter the key, or if you hardcode the key, then anyone who can access the guest disk image can start the guest. 
The qcow2 encryption is the perfect solution for this problem, guaranteeing the data security even when the storage system / network transport offers no security, and allowing for secure control over guest startup. Further, adding encryption does not add any serious complexity to the on disk format - just 1 extra header field, nor to the implementation - just pass the data block through an encrypt/decrypt filter, with no extra I/O paths. > diff --git a/block/qed-cluster.c b/block/qed-cluster.c > new file mode 100644 > index 0000000..6deea27 > --- /dev/null > +++ b/block/qed-cluster.c > @@ -0,0 +1,136 @@ > +/* > + * QEMU Enhanced Disk Format Cluster functions > + * > + * Copyright IBM, Corp. 2010 > + * > + * Authors: > + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> > + * Anthony Liguori <aliguori@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU LGPL, version 2 or later. > + * See the COPYING file in the top-level directory. > + * > + */ > + > +#include "qed.h" > + > +/** > + * Count the number of contiguous data clusters > + * > + * @s: QED state > + * @table: L2 table > + * @index: First cluster index > + * @n: Maximum number of clusters > + * @offset: Set to first cluster offset > + * > + * This function scans tables for contiguous allocated or free clusters. 
> + */ > +static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, > + QEDTable *table, > + unsigned int index, > + unsigned int n, > + uint64_t *offset) > +{ > + unsigned int end = MIN(index + n, s->table_nelems); > + uint64_t last = table->offsets[index]; > + unsigned int i; > + > + *offset = last; > + > + for (i = index + 1; i < end; i++) { > + if (last == 0) { > + /* Counting free clusters */ > + if (table->offsets[i] != 0) { > + break; > + } > + } else { > + /* Counting allocated clusters */ > + if (table->offsets[i] != last + s->header.cluster_size) { > + break; > + } > + last = table->offsets[i]; > + } > + } > + return i - index; > +} > + > +typedef struct { > + BDRVQEDState *s; > + uint64_t pos; > + size_t len; > + > + QEDRequest *request; > + > + /* User callback */ > + QEDFindClusterFunc *cb; > + void *opaque; > +} QEDFindClusterCB; > + > +static void qed_find_cluster_cb(void *opaque, int ret) > +{ > + QEDFindClusterCB *find_cluster_cb = opaque; > + BDRVQEDState *s = find_cluster_cb->s; > + QEDRequest *request = find_cluster_cb->request; > + uint64_t offset = 0; > + size_t len = 0; > + unsigned int index; > + unsigned int n; > + > + if (ret) { > + ret = QED_CLUSTER_ERROR; > + goto out; > + } > + > + index = qed_l2_index(s, find_cluster_cb->pos); > + n = qed_bytes_to_clusters(s, > + qed_offset_into_cluster(s, find_cluster_cb->pos) + > + find_cluster_cb->len); > + n = qed_count_contiguous_clusters(s, request->l2_table->table, > + index, n, &offset); > + > + ret = offset ? 
QED_CLUSTER_FOUND : QED_CLUSTER_L2; > + len = MIN(find_cluster_cb->len, n * s->header.cluster_size - > + qed_offset_into_cluster(s, find_cluster_cb->pos)); > + > +out: > + find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); > + qemu_free(find_cluster_cb); > +} > + > +/** > + * Find the offset of a data cluster > + * > + * @s: QED state > + * @pos: Byte position in device > + * @len: Number of bytes > + * @cb: Completion function > + * @opaque: User data for completion function > + */ > +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, > + size_t len, QEDFindClusterFunc *cb, void *opaque) > +{ > + QEDFindClusterCB *find_cluster_cb; > + uint64_t l2_offset; > + > + /* Limit length to L2 boundary. Requests are broken up at the L2 boundary > + * so that a request acts on one L2 table at a time. > + */ > + len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); > + > + l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)]; > + if (!l2_offset) { > + cb(opaque, QED_CLUSTER_L1, 0, len); > + return; > + } > + > + find_cluster_cb = qemu_malloc(sizeof(*find_cluster_cb)); > + find_cluster_cb->s = s; > + find_cluster_cb->pos = pos; > + find_cluster_cb->len = len; > + find_cluster_cb->cb = cb; > + find_cluster_cb->opaque = opaque; > + find_cluster_cb->request = request; > + > + qed_read_l2_table(s, request, l2_offset, > + qed_find_cluster_cb, find_cluster_cb); > +} > diff --git a/block/qed-gencb.c b/block/qed-gencb.c > new file mode 100644 > index 0000000..d389e12 > --- /dev/null > +++ b/block/qed-gencb.c > @@ -0,0 +1,32 @@ > +/* > + * QEMU Enhanced Disk Format > + * > + * Copyright IBM, Corp. 2010 > + * > + * Authors: > + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> > + * > + * This work is licensed under the terms of the GNU LGPL, version 2 or later. > + * See the COPYING file in the top-level directory. 
> + * > + */ > + > +#include "qed.h" > + > +void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque) > +{ > + GenericCB *gencb = qemu_malloc(len); > + gencb->cb = cb; > + gencb->opaque = opaque; > + return gencb; > +} > + > +void gencb_complete(void *opaque, int ret) > +{ > + GenericCB *gencb = opaque; > + BlockDriverCompletionFunc *cb = gencb->cb; > + void *user_opaque = gencb->opaque; > + > + qemu_free(gencb); > + cb(user_opaque, ret); > +} > diff --git a/block/qed-l2-cache.c b/block/qed-l2-cache.c > new file mode 100644 > index 0000000..747a629 > --- /dev/null > +++ b/block/qed-l2-cache.c > @@ -0,0 +1,131 @@ > +/* > + * QEMU Enhanced Disk Format L2 Cache > + * > + * Copyright IBM, Corp. 2010 > + * > + * Authors: > + * Anthony Liguori <aliguori@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU LGPL, version 2 or later. > + * See the COPYING file in the top-level directory. > + * > + */ > + > +#include "qed.h" > + > +/* Each L2 holds 2GB so this let's us fully cache a 100GB disk */ > +#define MAX_L2_CACHE_SIZE 50 > + > +/** > + * Initialize the L2 cache > + */ > +void qed_init_l2_cache(L2TableCache *l2_cache, > + L2TableAllocFunc *alloc_l2_table, > + void *alloc_l2_table_opaque) > +{ > + QTAILQ_INIT(&l2_cache->entries); > + l2_cache->n_entries = 0; > + l2_cache->alloc_l2_table = alloc_l2_table; > + l2_cache->alloc_l2_table_opaque = alloc_l2_table_opaque; > +} > + > +/** > + * Free the L2 cache > + */ > +void qed_free_l2_cache(L2TableCache *l2_cache) > +{ > + CachedL2Table *entry, *next_entry; > + > + QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) { > + qemu_free(entry->table); > + qemu_free(entry); > + } > +} > + > +/** > + * Allocate an uninitialized entry from the cache > + * > + * The returned entry has a reference count of 1 and is owned by the caller. 
> + */ > +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache) > +{ > + CachedL2Table *entry; > + > + entry = qemu_mallocz(sizeof(*entry)); > + entry->table = l2_cache->alloc_l2_table(l2_cache->alloc_l2_table_opaque); > + entry->ref++; > + > + return entry; > +} > + > +/** > + * Decrease an entry's reference count and free if necessary when the reference > + * count drops to zero. > + */ > +void qed_unref_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *entry) > +{ > + if (!entry) { > + return; > + } > + > + entry->ref--; > + if (entry->ref == 0) { > + qemu_free(entry->table); > + qemu_free(entry); > + } > +} > + > +/** > + * Find an entry in the L2 cache. This may return NULL and it's up to the > + * caller to satisfy the cache miss. > + * > + * For a cached entry, this function increases the reference count and returns > + * the entry. > + */ > +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) > +{ > + CachedL2Table *entry; > + > + QTAILQ_FOREACH(entry, &l2_cache->entries, node) { > + if (entry->offset == offset) { > + entry->ref++; > + return entry; > + } > + } > + return NULL; > +} > + > +/** > + * Commit an L2 cache entry into the cache. This is meant to be used as part of > + * the process to satisfy a cache miss. A caller would allocate an entry which > + * is not actually in the L2 cache and then once the entry was valid and > + * present on disk, the entry can be committed into the cache. > + * > + * Since the cache is write-through, it's important that this function is not > + * called until the entry is present on disk and the L1 has been updated to > + * point to the entry. > + * > + * This function will take a reference to the entry so the caller is still > + * responsible for unreferencing the entry. 
> + */ > +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table) > +{ > + CachedL2Table *entry; > + > + entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset); > + if (entry) { > + qed_unref_l2_cache_entry(l2_cache, entry); > + return; > + } > + > + if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) { > + entry = QTAILQ_FIRST(&l2_cache->entries); > + QTAILQ_REMOVE(&l2_cache->entries, entry, node); > + l2_cache->n_entries--; > + qed_unref_l2_cache_entry(l2_cache, entry); > + } > + > + l2_table->ref++; > + l2_cache->n_entries++; > + QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node); > +} > diff --git a/block/qed-table.c b/block/qed-table.c > new file mode 100644 > index 0000000..9a72582 > --- /dev/null > +++ b/block/qed-table.c > @@ -0,0 +1,242 @@ > +/* > + * QEMU Enhanced Disk Format Table I/O > + * > + * Copyright IBM, Corp. 2010 > + * > + * Authors: > + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> > + * Anthony Liguori <aliguori@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU LGPL, version 2 or later. > + * See the COPYING file in the top-level directory. 
> + * > + */ > + > +#include "qed.h" > + > +typedef struct { > + GenericCB gencb; > + BDRVQEDState *s; > + QEDTable *table; > + > + struct iovec iov; > + QEMUIOVector qiov; > +} QEDReadTableCB; > + > +static void qed_read_table_cb(void *opaque, int ret) > +{ > + QEDReadTableCB *read_table_cb = opaque; > + QEDTable *table = read_table_cb->table; > + int noffsets = read_table_cb->iov.iov_len / sizeof(uint64_t); > + int i; > + > + /* Handle I/O error */ > + if (ret) { > + goto out; > + } > + > + /* Byteswap and verify offsets */ > + for (i = 0; i < noffsets; i++) { > + table->offsets[i] = le64_to_cpu(table->offsets[i]); > + } > + > +out: > + /* Completion */ > + gencb_complete(&read_table_cb->gencb, ret); > +} > + > +static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), > + cb, opaque); > + QEMUIOVector *qiov = &read_table_cb->qiov; > + BlockDriverAIOCB *aiocb; > + > + read_table_cb->s = s; > + read_table_cb->table = table; > + read_table_cb->iov.iov_base = table->offsets, > + read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, > + > + qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); > + aiocb = bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, > + read_table_cb->iov.iov_len / BDRV_SECTOR_SIZE, > + qed_read_table_cb, read_table_cb); > + if (!aiocb) { > + qed_read_table_cb(read_table_cb, -EIO); > + } > +} > + > +typedef struct { > + GenericCB gencb; > + BDRVQEDState *s; > + QEDTable *orig_table; > + bool flush; /* flush after write? 
*/ > + > + struct iovec iov; > + QEMUIOVector qiov; > + > + QEDTable table; > +} QEDWriteTableCB; > + > +static void qed_write_table_cb(void *opaque, int ret) > +{ > + QEDWriteTableCB *write_table_cb = opaque; > + > + if (ret) { > + goto out; > + } > + > + if (write_table_cb->flush) { > + /* We still need to flush first */ > + write_table_cb->flush = false; > + bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, > + write_table_cb); > + return; > + } > + > +out: > + gencb_complete(&write_table_cb->gencb, ret); > + return; > +} > + > +/** > + * Write out an updated part or all of a table > + * > + * @s: QED state > + * @offset: Offset of table in image file, in bytes > + * @table: Table > + * @index: Index of first element > + * @n: Number of elements > + * @flush: Whether or not to sync to disk > + * @cb: Completion function > + * @opaque: Argument for completion function > + */ > +static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, > + unsigned int index, unsigned int n, bool flush, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + QEDWriteTableCB *write_table_cb; > + BlockDriverAIOCB *aiocb; > + unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; > + unsigned int start, end, i; > + size_t len_bytes; > + > + /* Calculate indices of the first and one after last elements */ > + start = index & ~sector_mask; > + end = (index + n + sector_mask) & ~sector_mask; > + > + len_bytes = (end - start) * sizeof(uint64_t); > + > + write_table_cb = gencb_alloc(sizeof(*write_table_cb) + len_bytes, > + cb, opaque); > + write_table_cb->s = s; > + write_table_cb->orig_table = table; > + write_table_cb->flush = flush; > + write_table_cb->iov.iov_base = write_table_cb->table.offsets; > + write_table_cb->iov.iov_len = len_bytes; > + qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); > + > + /* Byteswap table */ > + for (i = start; i < end; i++) { > + write_table_cb->table.offsets[i - start] = 
cpu_to_le64(table->offsets[i]); > + } > + > + /* Adjust for offset into table */ > + offset += start * sizeof(uint64_t); > + > + aiocb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, > + &write_table_cb->qiov, > + write_table_cb->iov.iov_len / BDRV_SECTOR_SIZE, > + qed_write_table_cb, write_table_cb); > + if (!aiocb) { > + qed_write_table_cb(write_table_cb, -EIO); > + } > +} > + > +static void qed_read_l1_table_cb(void *opaque, int ret) > +{ > + *(int *)opaque = ret; > +} > + > +/** > + * Read the L1 table synchronously > + */ > +int qed_read_l1_table(BDRVQEDState *s) > +{ > + int ret = -EINPROGRESS; > + > + /* TODO push/pop async context? */ > + > + qed_read_table(s, s->header.l1_table_offset, > + s->l1_table, qed_read_l1_table_cb, &ret); > + while (ret == -EINPROGRESS) { > + qemu_aio_wait(); > + } > + return ret; > +} > + > +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + qed_write_table(s, s->header.l1_table_offset, > + s->l1_table, index, n, false, cb, opaque); > +} > + > +typedef struct { > + GenericCB gencb; > + BDRVQEDState *s; > + uint64_t l2_offset; > + QEDRequest *request; > +} QEDReadL2TableCB; > + > +static void qed_read_l2_table_cb(void *opaque, int ret) > +{ > + QEDReadL2TableCB *read_l2_table_cb = opaque; > + QEDRequest *request = read_l2_table_cb->request; > + BDRVQEDState *s = read_l2_table_cb->s; > + > + if (ret) { > + /* can't trust loaded L2 table anymore */ > + qed_unref_l2_cache_entry(&s->l2_cache, request->l2_table); > + request->l2_table = NULL; > + } else { > + request->l2_table->offset = read_l2_table_cb->l2_offset; > + qed_commit_l2_cache_entry(&s->l2_cache, request->l2_table); > + } > + > + gencb_complete(&read_l2_table_cb->gencb, ret); > +} > + > +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + QEDReadL2TableCB *read_l2_table_cb; > + > + 
qed_unref_l2_cache_entry(&s->l2_cache, request->l2_table); > + > + /* Check for cached L2 entry */ > + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); > + if (request->l2_table) { > + cb(opaque, 0); > + return; > + } > + > + request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); > + > + read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); > + read_l2_table_cb->s = s; > + read_l2_table_cb->l2_offset = offset; > + read_l2_table_cb->request = request; > + > + qed_read_table(s, offset, request->l2_table->table, > + qed_read_l2_table_cb, read_l2_table_cb); > +} > + > +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, > + unsigned int index, unsigned int n, bool flush, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + qed_write_table(s, request->l2_table->offset, > + request->l2_table->table, index, n, flush, cb, opaque); > +} > diff --git a/block/qed.c b/block/qed.c > new file mode 100644 > index 0000000..cf64418 > --- /dev/null > +++ b/block/qed.c > @@ -0,0 +1,1103 @@ > +/* > + * QEMU Enhanced Disk Format > + * > + * Copyright IBM, Corp. 2010 > + * > + * Authors: > + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> > + * Anthony Liguori <aliguori@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU LGPL, version 2 or later. > + * See the COPYING file in the top-level directory. > + * > + */ > + > +#include "qed.h" > + > +/* TODO blkdebug support */ > +/* TODO BlockDriverState::buffer_alignment */ > +/* TODO check L2 table sizes before accessing them? */ > +/* TODO skip zero prefill since the filesystem should zero the sectors anyway */ > +/* TODO if a table element's offset is invalid then the image is broken. If > + * there was a power failure and the table update reached storage but the data > + * being pointed to did not, forget about the lost data by clearing the offset. 
> + * However, need to be careful to detect invalid offsets for tables that are > + * read *after* more clusters have been allocated. */ > + > +enum { > + QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, > + > + /* The image supports a backing file */ > + QED_F_BACKING_FILE = 0x01, > + > + /* The image has the backing file format */ > + QED_CF_BACKING_FORMAT = 0x01, > + > + /* Feature bits must be used when the on-disk format changes */ > + QED_FEATURE_MASK = QED_F_BACKING_FILE, /* supported feature bits */ > + QED_COMPAT_FEATURE_MASK = QED_CF_BACKING_FORMAT, /* supported compat feature bits */ > + > + /* Data is stored in groups of sectors called clusters. Cluster size must > + * be large to avoid keeping too much metadata. I/O requests that have > + * sub-cluster size will require read-modify-write. > + */ > + QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ > + QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, > + QED_DEFAULT_CLUSTER_SIZE = 64 * 1024, > + > + /* Allocated clusters are tracked using a 2-level pagetable. Table size is > + * a multiple of clusters so large maximum image sizes can be supported > + * without jacking up the cluster size too much. 
> + */ > + QED_MIN_TABLE_SIZE = 1, /* in clusters */ > + QED_MAX_TABLE_SIZE = 16, > + QED_DEFAULT_TABLE_SIZE = 4, > +}; > + > +static void qed_aio_cancel(BlockDriverAIOCB *acb) > +{ > + qemu_aio_release(acb); > +} > + > +static AIOPool qed_aio_pool = { > + .aiocb_size = sizeof(QEDAIOCB), > + .cancel = qed_aio_cancel, > +}; > + > +/** > + * Allocate memory that satisfies image file and backing file alignment requirements > + * > + * TODO make this common and consider propagating max buffer_alignment to the root image > + */ > +static void *qed_memalign(BDRVQEDState *s, size_t len) > +{ > + size_t align = s->bs->file->buffer_alignment; > + BlockDriverState *backing_hd = s->bs->backing_hd; > + > + if (backing_hd && backing_hd->buffer_alignment > align) { > + align = backing_hd->buffer_alignment; > + } > + > + return qemu_memalign(align, len); > +} > + > +static int bdrv_qed_probe(const uint8_t *buf, int buf_size, > + const char *filename) > +{ > + const QEDHeader *header = (const void *)buf; > + > + if (buf_size < sizeof(*header)) { > + return 0; > + } > + if (le32_to_cpu(header->magic) != QED_MAGIC) { > + return 0; > + } > + return 100; > +} > + > +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) > +{ > + cpu->magic = le32_to_cpu(le->magic); > + cpu->cluster_size = le32_to_cpu(le->cluster_size); > + cpu->table_size = le32_to_cpu(le->table_size); > + cpu->first_cluster = le32_to_cpu(le->first_cluster); > + cpu->features = le64_to_cpu(le->features); > + cpu->compat_features = le64_to_cpu(le->compat_features); > + cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); > + cpu->image_size = le64_to_cpu(le->image_size); > + cpu->backing_file_offset = le32_to_cpu(le->backing_file_offset); > + cpu->backing_file_size = le32_to_cpu(le->backing_file_size); > + cpu->backing_fmt_offset = le32_to_cpu(le->backing_fmt_offset); > + cpu->backing_fmt_size = le32_to_cpu(le->backing_fmt_size); > +} > + > +static void qed_header_cpu_to_le(const QEDHeader *cpu, 
QEDHeader *le) > +{ > + le->magic = cpu_to_le32(cpu->magic); > + le->cluster_size = cpu_to_le32(cpu->cluster_size); > + le->table_size = cpu_to_le32(cpu->table_size); > + le->first_cluster = cpu_to_le32(cpu->first_cluster); > + le->features = cpu_to_le64(cpu->features); > + le->compat_features = cpu_to_le64(cpu->compat_features); > + le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); > + le->image_size = cpu_to_le64(cpu->image_size); > + le->backing_file_offset = cpu_to_le32(cpu->backing_file_offset); > + le->backing_file_size = cpu_to_le32(cpu->backing_file_size); > + le->backing_fmt_offset = cpu_to_le32(cpu->backing_fmt_offset); > + le->backing_fmt_size = cpu_to_le32(cpu->backing_fmt_size); > +} > + > +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) > +{ > + uint64_t table_entries; > + uint64_t l2_size; > + > + table_entries = (table_size * cluster_size) / 8; > + l2_size = table_entries * cluster_size; > + > + return l2_size * table_entries; > +} > + > +static bool qed_is_cluster_size_valid(uint32_t cluster_size) > +{ > + if (cluster_size < QED_MIN_CLUSTER_SIZE || > + cluster_size > QED_MAX_CLUSTER_SIZE) { > + return false; > + } > + if (cluster_size & (cluster_size - 1)) { > + return false; /* not power of 2 */ > + } > + return true; > +} > + > +static bool qed_is_table_size_valid(uint32_t table_size) > +{ > + if (table_size < QED_MIN_TABLE_SIZE || > + table_size > QED_MAX_TABLE_SIZE) { > + return false; > + } > + if (table_size & (table_size - 1)) { > + return false; /* not power of 2 */ > + } > + return true; > +} > + > +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, > + uint32_t table_size) > +{ > + if (image_size == 0) { > + /* Supporting zero size images makes life harder because even the L1 > + * table is not needed. Make life simple and forbid zero size images. 
> + */ > + return false; > + } > + if (image_size & (cluster_size - 1)) { > + return false; /* not multiple of cluster size */ > + } > + if (image_size > qed_max_image_size(cluster_size, table_size)) { > + return false; /* image is too large */ > + } > + return true; > +} > + > +/** > + * Test if a byte offset is cluster aligned and within the image file > + */ > +static bool qed_check_byte_offset(BDRVQEDState *s, uint64_t offset) > +{ > + if (offset & (s->header.cluster_size - 1)) { > + return false; > + } > + if (offset == 0) { > + return false; /* first cluster contains the header and is not valid */ > + } > + return offset < s->file_size; > +} > + > +/** > + * Read a string of known length from the image file > + * > + * @file: Image file > + * @offset: File offset to start of string, in bytes > + * @n: String length in bytes > + * @buf: Destination buffer > + * @buflen: Destination buffer length in bytes > + * > + * The string is NUL-terminated. > + */ > +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, > + char *buf, size_t buflen) > +{ > + int ret; > + if (n >= buflen) { > + return -EINVAL; > + } > + ret = bdrv_pread(file, offset, buf, n); > + if (ret != n) { > + return ret; > + } > + buf[n] = '\0'; > + return 0; > +} > + > +/** > + * Allocate new clusters > + * > + * @s: QED state > + * @n: Number of contiguous clusters to allocate > + * @offset: Offset of first allocated cluster, filled in on success > + */ > +static int qed_alloc_clusters(BDRVQEDState *s, unsigned int n, uint64_t *offset) > +{ > + *offset = s->file_size; > + s->file_size += n * s->header.cluster_size; > + return 0; > +} > + > +static QEDTable *qed_alloc_table(void *opaque) > +{ > + BDRVQEDState *s = opaque; > + > + /* Honor O_DIRECT memory alignment requirements */ > + return qed_memalign(s, s->header.cluster_size * s->header.table_size); > +} > + > +/** > + * Allocate a new zeroed L2 table > + */ > +static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) > +{ 
> + uint64_t offset; > + int ret; > + CachedL2Table *l2_table; > + > + ret = qed_alloc_clusters(s, s->header.table_size, &offset); > + if (ret) { > + return NULL; > + } > + > + l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); > + l2_table->offset = offset; > + > + memset(l2_table->table->offsets, 0, > + s->header.cluster_size * s->header.table_size); > + return l2_table; > +} > + > +static int bdrv_qed_open(BlockDriverState *bs, int flags) > +{ > + BDRVQEDState *s = bs->opaque; > + QEDHeader le_header; > + int64_t file_size; > + int ret; > + > + s->bs = bs; > + QSIMPLEQ_INIT(&s->allocating_write_reqs); > + > + ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); > + if (ret != sizeof(le_header)) { > + return ret; > + } > + qed_header_le_to_cpu(&le_header, &s->header); > + > + if (s->header.magic != QED_MAGIC) { > + return -ENOENT; > + } > + if (s->header.features & ~QED_FEATURE_MASK) { > + return -ENOTSUP; /* image uses unsupported feature bits */ > + } > + if (!qed_is_cluster_size_valid(s->header.cluster_size)) { > + return -EINVAL; > + } > + > + /* Round up file size to the next cluster */ > + file_size = bdrv_getlength(bs->file); > + if (file_size < 0) { > + return file_size; > + } > + s->file_size = qed_start_of_cluster(s, file_size + s->header.cluster_size - 1); > + > + if (!qed_is_table_size_valid(s->header.table_size)) { > + return -EINVAL; > + } > + if (!qed_is_image_size_valid(s->header.image_size, > + s->header.cluster_size, > + s->header.table_size)) { > + return -EINVAL; > + } > + if (!qed_check_byte_offset(s, s->header.l1_table_offset)) { > + return -EINVAL; > + } > + > + s->table_nelems = (s->header.cluster_size * s->header.table_size) / > + sizeof(s->l1_table->offsets[0]); > + s->l2_shift = get_bits_from_size(s->header.cluster_size); > + s->l2_mask = s->table_nelems - 1; > + s->l1_shift = s->l2_shift + get_bits_from_size(s->l2_mask + 1); > + > + if ((s->header.features & QED_F_BACKING_FILE)) { > + ret = qed_read_string(bs->file, 
s->header.backing_file_offset, > + s->header.backing_file_size, bs->backing_file, > + sizeof(bs->backing_file)); > + if (ret < 0) { > + return ret; > + } > + > + if ((s->header.compat_features & QED_CF_BACKING_FORMAT)) { > + ret = qed_read_string(bs->file, s->header.backing_fmt_offset, > + s->header.backing_fmt_size, > + bs->backing_format, > + sizeof(bs->backing_format)); > + if (ret < 0) { > + return ret; > + } > + } IMHO we should make the backing format compulsory with use of the backing file. The only time probing is required is when initially creating the child image, thereafter there's no benefit to probing again. Regards, Daniel
On Mon, Sep 6, 2010 at 11:27 AM, Kevin Wolf <kwolf@redhat.com> wrote: > Am 06.09.2010 12:04, schrieb Stefan Hajnoczi: >> QEMU Enhanced Disk format is a disk image format that forgoes features >> found in qcow2 in favor of better levels of performance and data >> integrity. Â Due to its simpler on-disk layout, it is possible to safely >> perform metadata updates more efficiently. >> >> Installations, suspend-to-disk, and other allocation-heavy I/O workloads >> will see increased performance due to fewer I/Os and syncs. Â Workloads >> that do not cause new clusters to be allocated will perform similar to >> raw images due to in-memory metadata caching. >> >> The format supports sparse disk images. Â It does not rely on the host >> filesystem holes feature, making it a good choice for sparse disk images >> that need to be transferred over channels where holes are not supported. >> >> Backing files are supported so only deltas against a base image can be >> stored. >> >> The file format is extensible so that additional features can be added >> later with graceful compatibility handling. >> >> Internal snapshots are not supported. Â This eliminates the need for >> additional metadata to track copy-on-write clusters. >> >> Compression and encryption are not supported. Â They add complexity and >> can be implemented at other layers in the stack (i.e. inside the guest >> or on the host). >> >> The format is currently functional with the following features missing: >> Â * Resizing the disk image. Â The capability has been designed in but the >> Â Â code has not been written yet. >> Â * Resetting the image after backing file commit completes. >> Â * Changing the backing filename. >> Â * Consistency check (fsck). Â This is simple due to the on-disk layout. 
>> >> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com> >> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> > > Okay, so before I actually look at the patch longer than a couple of > seconds let me just ask the obvious question... > > Before inventing yet another image format, you certainly have checked > the existing ones. Except for not implementing compression and > encryption this looks a lot like qcow1 to me. I see that you even > retained the two-level cluster tables. > > So if we ignore the implementation for a moment and just compare the > formats, what's the crucial difference between qcow1 and qed that I'm > missing? And if it's not qcow1, why not improving our support for > another existing format like VHD? Is this a subset of existing on-disk formats? Yes. The motivation is to have an image format that performs well and is safe, with backing image support. Currently no image format in QEMU meets these requirements. Perhaps it is appropriate to use an existing on-disk format. I actually considered in-place migration (compatibility) with qcow2 to make life easier for users and avoid a new format. However, there is baggage to doing this and the focus should be on building a solid image format instead of fitting into a legacy format that qemu-img convert can take care of. Stefan
On 09/06/2010 05:27 AM, Kevin Wolf wrote: > Okay, so before I actually look at the patch longer than a couple of > seconds let me just ask the obvious question... > > Before inventing yet another image format, you certainly have checked > the existing ones. Obviously, yes. Here are the issues: cow.c: it's cow of an otherwise sparse file. An important reason for implementing a format is the ability to copy (or scp) an image without special tools. qcow2.c: the refcount, cow cluster, and compression make an implementation seeking integrity and performance challenging. vmdk.c: we feel it's important for qemu to have a block format with a gpl friendly specification that we have a say in vhd/vpc.c: same as vmdk with the addition that the OSP is known to not be gpl friendly vdi.c: uses a bitmap instead of a two level table. An advantage of a two level table is that it allows image resize without much fuss. qcow.c: it lacks extensibility and compression means that there's no guarantee that blocks are a fixed size. This makes it very difficult to implement a high performance block format without having two separate code paths. > Except for not implementing compression and > encryption this looks a lot like qcow1 to me. I see that you even > retained the two-level cluster tables. > > So if we ignore the implementation for a moment and just compare the > formats, what's the crucial difference between qcow1 and qed that I'm > missing? And if it's not qcow1, why not improving our support for > another existing format like VHD? > Block formats are easy to get wrong. QED is an existence proof that given the right constraints, we can build a full asynchronous, high performance image format with proper data integrity. You could get to QED by incrementally improving qcow but you'd have to break the format to make it extensible and disable support for compression. But at that point, why not just make a new format since you're breaking compatibility. 
You would have to fully rewrite the code so what's the point of keeping the format? Regards, Anthony Liguori > Kevin > >
On 09/06/2010 06:18 AM, Daniel P. Berrange wrote:
> I agree with ditching compression, but encryption is an important
> capability which cannot be satisfactorily added at other layers
> in the stack. While block devices / local filesystems can layer
> in dm-crypt in the host, this is not possible with network/cluster
> filesystems which account for a non-trivial target audience.

ecryptfs should work with NFS these days.  If it still doesn't, it will
in the not too distant future.

> Adding
> encryption inside the guest is sub-optimal because you cannot do
> secure automation of guest startup. Either you require manaual
> intervention to start every guest to enter the key, or if you
> hardcode the key, then anyone who can access the guest disk image
> can start the guest.

I think this belongs in the VFS level but from a format perspective, an
encryption feature would be easy to add.

>> +
>> +    if ((s->header.compat_features&  QED_CF_BACKING_FORMAT)) {
>> +        ret = qed_read_string(bs->file, s->header.backing_fmt_offset,
>> +                              s->header.backing_fmt_size,
>> +                              bs->backing_format,
>> +                              sizeof(bs->backing_format));
>> +        if (ret<  0) {
>> +            return ret;
>> +        }
>> +    }
>>
> IMHO we should make the backing format compulsory with use of
> the backing file. The only time probing is required is when
> initially creating the child image, thereafter there's no
> benefit to probing again.
>

Stefan originally made it mandatory but I asked to make it optional.

From a format specification perspective, backing_fmt introduces some
problems.  What does a backing_fmt of 'vmdk' mean outside of qemu?

More importantly, humans don't create image formats by hand.  Instead, they
use tools like qemu-img.  If you think we should force the specification
of a backing file format in qemu-img, that's the place we should do it.

Regards,

Anthony Liguori

> Regards,
> Daniel
>
On 09/06/2010 07:40 AM, Stefan Hajnoczi wrote: > On Mon, Sep 6, 2010 at 11:27 AM, Kevin Wolf<kwolf@redhat.com> wrote: > >> Am 06.09.2010 12:04, schrieb Stefan Hajnoczi: >> >>> QEMU Enhanced Disk format is a disk image format that forgoes features >>> found in qcow2 in favor of better levels of performance and data >>> integrity. Due to its simpler on-disk layout, it is possible to safely >>> perform metadata updates more efficiently. >>> >>> Installations, suspend-to-disk, and other allocation-heavy I/O workloads >>> will see increased performance due to fewer I/Os and syncs. Workloads >>> that do not cause new clusters to be allocated will perform similar to >>> raw images due to in-memory metadata caching. >>> >>> The format supports sparse disk images. It does not rely on the host >>> filesystem holes feature, making it a good choice for sparse disk images >>> that need to be transferred over channels where holes are not supported. >>> >>> Backing files are supported so only deltas against a base image can be >>> stored. >>> >>> The file format is extensible so that additional features can be added >>> later with graceful compatibility handling. >>> >>> Internal snapshots are not supported. This eliminates the need for >>> additional metadata to track copy-on-write clusters. >>> >>> Compression and encryption are not supported. They add complexity and >>> can be implemented at other layers in the stack (i.e. inside the guest >>> or on the host). >>> >>> The format is currently functional with the following features missing: >>> * Resizing the disk image. The capability has been designed in but the >>> code has not been written yet. >>> * Resetting the image after backing file commit completes. >>> * Changing the backing filename. >>> * Consistency check (fsck). This is simple due to the on-disk layout. 
>>> >>> Signed-off-by: Anthony Liguori<aliguori@us.ibm.com> >>> Signed-off-by: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com> >>> >> Okay, so before I actually look at the patch longer than a couple of >> seconds let me just ask the obvious question... >> >> Before inventing yet another image format, you certainly have checked >> the existing ones. Except for not implementing compression and >> encryption this looks a lot like qcow1 to me. I see that you even >> retained the two-level cluster tables. >> >> So if we ignore the implementation for a moment and just compare the >> formats, what's the crucial difference between qcow1 and qed that I'm >> missing? And if it's not qcow1, why not improving our support for >> another existing format like VHD? >> > Is this a subset of existing on-disk formats? Yes. The motivation is > to have an image format that performs well and is safe, with backing > image support. Currently no image format in QEMU meets these > requirements. > > Perhaps it is appropriate to use an existing on-disk format. If you implement a subset of functionality for an existing on-disk format, I think you damage user's expectations. If we claim to support qcow images, then given any old qcow image I have laying around for 5 years ago, I should be able to run it without qemu throwing an error. There's some really ugly stuff in qcow. Nothing is actually aligned. This makes implementing things like O_DIRECT very challenging since you basically have to handle bouncing any possible buffer. Since the L1 table occurs immediately after the header, there's really no room to play any kind of tricks to add features. Regards, Anthony Liguori > I > actually considered in-place migration (compatibility) with qcow2 to > make life easier for users and avoid a new format. However, there is > baggage to doing this and the focus should be on building a solid > image format instead of fitting into a legacy format that qemu-img > convert can take care of. > > Stefan > >
On Mon, Sep 6, 2010 at 1:57 PM, Anthony Liguori <anthony@codemonkey.ws> wrote: > On 09/06/2010 07:40 AM, Stefan Hajnoczi wrote: >> >> On Mon, Sep 6, 2010 at 11:27 AM, Kevin Wolf<kwolf@redhat.com> Â wrote: >> >>> >>> Am 06.09.2010 12:04, schrieb Stefan Hajnoczi: >>> >>>> >>>> QEMU Enhanced Disk format is a disk image format that forgoes features >>>> found in qcow2 in favor of better levels of performance and data >>>> integrity. Â Due to its simpler on-disk layout, it is possible to safely >>>> perform metadata updates more efficiently. >>>> >>>> Installations, suspend-to-disk, and other allocation-heavy I/O workloads >>>> will see increased performance due to fewer I/Os and syncs. Â Workloads >>>> that do not cause new clusters to be allocated will perform similar to >>>> raw images due to in-memory metadata caching. >>>> >>>> The format supports sparse disk images. Â It does not rely on the host >>>> filesystem holes feature, making it a good choice for sparse disk images >>>> that need to be transferred over channels where holes are not supported. >>>> >>>> Backing files are supported so only deltas against a base image can be >>>> stored. >>>> >>>> The file format is extensible so that additional features can be added >>>> later with graceful compatibility handling. >>>> >>>> Internal snapshots are not supported. Â This eliminates the need for >>>> additional metadata to track copy-on-write clusters. >>>> >>>> Compression and encryption are not supported. Â They add complexity and >>>> can be implemented at other layers in the stack (i.e. inside the guest >>>> or on the host). >>>> >>>> The format is currently functional with the following features missing: >>>> Â * Resizing the disk image. Â The capability has been designed in but the >>>> Â Â code has not been written yet. >>>> Â * Resetting the image after backing file commit completes. >>>> Â * Changing the backing filename. >>>> Â * Consistency check (fsck). Â This is simple due to the on-disk layout. 
>>>> >>>> Signed-off-by: Anthony Liguori<aliguori@us.ibm.com> >>>> Signed-off-by: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com> >>>> >>> >>> Okay, so before I actually look at the patch longer than a couple of >>> seconds let me just ask the obvious question... >>> >>> Before inventing yet another image format, you certainly have checked >>> the existing ones. Except for not implementing compression and >>> encryption this looks a lot like qcow1 to me. I see that you even >>> retained the two-level cluster tables. >>> >>> So if we ignore the implementation for a moment and just compare the >>> formats, what's the crucial difference between qcow1 and qed that I'm >>> missing? And if it's not qcow1, why not improving our support for >>> another existing format like VHD? >>> >> >> Is this a subset of existing on-disk formats? Â Yes. Â The motivation is >> to have an image format that performs well and is safe, with backing >> image support. Â Currently no image format in QEMU meets these >> requirements. >> >> Perhaps it is appropriate to use an existing on-disk format. > > If you implement a subset of functionality for an existing on-disk format, I > think you damage user's expectations. > > If we claim to support qcow images, then given any old qcow image I have > laying around for 5 years ago, I should be able to run it without qemu > throwing an error. > > There's some really ugly stuff in qcow. Â Nothing is actually aligned. Â This > makes implementing things like O_DIRECT very challenging since you basically > have to handle bouncing any possible buffer. Â Since the L1 table occurs > immediately after the header, there's really no room to play any kind of > tricks to add features. These are the details that are baggage. Ultimately it may be hard to deal with them without just bumping the qcow version number and thereby having a new format anyway. Stefan
On 09/06/2010 05:04 AM, Stefan Hajnoczi wrote: > QEMU Enhanced Disk format is a disk image format that forgoes features > found in qcow2 in favor of better levels of performance and data > integrity. Due to its simpler on-disk layout, it is possible to safely > perform metadata updates more efficiently. > > Installations, suspend-to-disk, and other allocation-heavy I/O workloads > will see increased performance due to fewer I/Os and syncs. Workloads > that do not cause new clusters to be allocated will perform similar to > raw images due to in-memory metadata caching. > > The format supports sparse disk images. It does not rely on the host > filesystem holes feature, making it a good choice for sparse disk images > that need to be transferred over channels where holes are not supported. > > Backing files are supported so only deltas against a base image can be > stored. > > The file format is extensible so that additional features can be added > later with graceful compatibility handling. > > Internal snapshots are not supported. This eliminates the need for > additional metadata to track copy-on-write clusters. > > Compression and encryption are not supported. They add complexity and > can be implemented at other layers in the stack (i.e. inside the guest > or on the host). > > The format is currently functional with the following features missing: > * Resizing the disk image. The capability has been designed in but the > code has not been written yet. > * Resetting the image after backing file commit completes. > * Changing the backing filename. > * Consistency check (fsck). This is simple due to the on-disk layout. > > Signed-off-by: Anthony Liguori<aliguori@us.ibm.com> > Signed-off-by: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com> > Another point worth mentioning is that our intention is to have a formal specification of the format before merging. A start of that is located at http://wiki.qemu.org/Features/QED Regards, Anthony Liguori
On Mon, Sep 06, 2010 at 07:52:41AM -0500, Anthony Liguori wrote: > On 09/06/2010 06:18 AM, Daniel P. Berrange wrote: > >I agree with ditching compression, but encryption is an important > >capability which cannot be satisfactorily added at other layers > >in the stack. While block devices / local filesystems can layer > >in dm-crypt in the host, this is not possible with network/cluster > >filesystems which account for a non-trivial target audience. > > ecryptfs should work with NFS these days. If it still doesn't, it will > in the not too distant future. Assuming it does work with NFS, IIUC, that still requires the user to have root privileges to setup ecryptfs for the NFS mount in question. So it takes care of the use case where the host admin doesn't trust the network/remote fs admin, but doesn't work for the case of local unprivileged users with NFS home dirs & a host admin who doesnt help. > > Adding > >encryption inside the guest is sub-optimal because you cannot do > >secure automation of guest startup. Either you require manaual > >intervention to start every guest to enter the key, or if you > >hardcode the key, then anyone who can access the guest disk image > >can start the guest. > > I think this belongs in the VFS level but from a format perspective, an > encryption feature would be easy to add. > > >>+ > >>+ if ((s->header.compat_features& QED_CF_BACKING_FORMAT)) { > >>+ ret = qed_read_string(bs->file, s->header.backing_fmt_offset, > >>+ s->header.backing_fmt_size, > >>+ bs->backing_format, > >>+ sizeof(bs->backing_format)); > >>+ if (ret< 0) { > >>+ return ret; > >>+ } > >>+ } > >> > >IMHO we should make the backing format compulsory with use of > >the backing file. The only time probing is required is when > >initially creating the child image, thereafter there's no > >benefit to probing again. > > > > Stefan originally made it mandatory but I asked to make it optional. 
> >  From a format specification perspective, backing_fmt introduces some
> problems.  What does a backing_fmt of 'vmdk' mean outside of qemu?

As currently implemented the string refers to a QEMU block driver
which is perhaps not the best choice for a general purpose file
format, if we want this applicable to other non-QEMU apps. Perhaps
it would be better if we explicitly declared backing format as an
enumerated int that represents specific file formats, thus decoupling
it from a specific driver.

Another related idea is perhaps to specify that if backing_fmt is
omitted in the metadata, the backing file must be treated as a QED
format file, rather than probed. Arguably qemu's VMDK driver should
be treating all VMDK backing files as VMDK format rather than probing
since I'm fairly sure VMware has no idea of a backing file in qcow or
any other format.

> More importantly, humans to create image formats by hand.  Instead, they
> use tools like qemu-img.  If you think we should for the specification
> of a backing file format in qemu-img, that's the place we should do it.

Certainly qemu-img can always add a format, even if the specification
declared it optional, but I think it's worth considering declaring it
compulsory in the spec, to take that variable out of the equation
for apps using the images.

Regards,
Daniel
Am 06.09.2010 14:57, schrieb Anthony Liguori: > On 09/06/2010 07:40 AM, Stefan Hajnoczi wrote: >> On Mon, Sep 6, 2010 at 11:27 AM, Kevin Wolf<kwolf@redhat.com> wrote: >> >>> Am 06.09.2010 12:04, schrieb Stefan Hajnoczi: >>> >>>> QEMU Enhanced Disk format is a disk image format that forgoes features >>>> found in qcow2 in favor of better levels of performance and data >>>> integrity. Due to its simpler on-disk layout, it is possible to safely >>>> perform metadata updates more efficiently. >>>> >>>> Installations, suspend-to-disk, and other allocation-heavy I/O workloads >>>> will see increased performance due to fewer I/Os and syncs. Workloads >>>> that do not cause new clusters to be allocated will perform similar to >>>> raw images due to in-memory metadata caching. >>>> >>>> The format supports sparse disk images. It does not rely on the host >>>> filesystem holes feature, making it a good choice for sparse disk images >>>> that need to be transferred over channels where holes are not supported. >>>> >>>> Backing files are supported so only deltas against a base image can be >>>> stored. >>>> >>>> The file format is extensible so that additional features can be added >>>> later with graceful compatibility handling. >>>> >>>> Internal snapshots are not supported. This eliminates the need for >>>> additional metadata to track copy-on-write clusters. >>>> >>>> Compression and encryption are not supported. They add complexity and >>>> can be implemented at other layers in the stack (i.e. inside the guest >>>> or on the host). >>>> >>>> The format is currently functional with the following features missing: >>>> * Resizing the disk image. The capability has been designed in but the >>>> code has not been written yet. >>>> * Resetting the image after backing file commit completes. >>>> * Changing the backing filename. >>>> * Consistency check (fsck). This is simple due to the on-disk layout. 
>>>> >>>> Signed-off-by: Anthony Liguori<aliguori@us.ibm.com> >>>> Signed-off-by: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com> >>>> >>> Okay, so before I actually look at the patch longer than a couple of >>> seconds let me just ask the obvious question... >>> >>> Before inventing yet another image format, you certainly have checked >>> the existing ones. Except for not implementing compression and >>> encryption this looks a lot like qcow1 to me. I see that you even >>> retained the two-level cluster tables. >>> >>> So if we ignore the implementation for a moment and just compare the >>> formats, what's the crucial difference between qcow1 and qed that I'm >>> missing? And if it's not qcow1, why not improving our support for >>> another existing format like VHD? >>> >> Is this a subset of existing on-disk formats? Yes. The motivation is >> to have an image format that performs well and is safe, with backing >> image support. Currently no image format in QEMU meets these >> requirements. >> >> Perhaps it is appropriate to use an existing on-disk format. > > If you implement a subset of functionality for an existing on-disk > format, I think you damage user's expectations. I don't really buy that implementing compression/encryption wouldn't have been possible if it was the only problem. Of course, if you don't implement it, you can't use an on-disk format that supports them. > If we claim to support qcow images, then given any old qcow image I have > laying around for 5 years ago, I should be able to run it without qemu > throwing an error. > > There's some really ugly stuff in qcow. Nothing is actually aligned. > This makes implementing things like O_DIRECT very challenging since you > basically have to handle bouncing any possible buffer. Since the L1 > table occurs immediately after the header, there's really no room to > play any kind of tricks to add features. That's a good point actually. I didn't remember that. Kevin
On Mon, Sep 6, 2010 at 12:25 PM, Alexander Graf <agraf@suse.de> wrote: > On 06.09.2010, at 12:04, Stefan Hajnoczi wrote: >> + >> +const char *bytes_to_str(uint64_t size) >> +{ >> + Â Â static char buffer[64]; >> + >> + Â Â if (size < (1ULL << 10)) { >> + Â Â Â Â snprintf(buffer, sizeof(buffer), "%" PRIu64 " byte(s)", size); >> + Â Â } else if (size < (1ULL << 20)) { >> + Â Â Â Â snprintf(buffer, sizeof(buffer), "%" PRIu64 " KB(s)", size >> 10); >> + Â Â } else if (size < (1ULL << 30)) { >> + Â Â Â Â snprintf(buffer, sizeof(buffer), "%" PRIu64 " MB(s)", size >> 20); >> + Â Â } else if (size < (1ULL << 40)) { >> + Â Â Â Â snprintf(buffer, sizeof(buffer), "%" PRIu64 " GB(s)", size >> 30); >> + Â Â } else { >> + Â Â Â Â snprintf(buffer, sizeof(buffer), "%" PRIu64 " TB(s)", size >> 40); >> + Â Â } >> + >> + Â Â return buffer; > > This returns a variable from the stack! Please make the target buffer caller defined. It's static, so it's formally correct. But probably not a good idea :) Luca
On 06.09.2010, at 16:21, Luca Tettamanti wrote: > On Mon, Sep 6, 2010 at 12:25 PM, Alexander Graf <agraf@suse.de> wrote: >> On 06.09.2010, at 12:04, Stefan Hajnoczi wrote: >>> + >>> +const char *bytes_to_str(uint64_t size) >>> +{ >>> + static char buffer[64]; >>> + >>> + if (size < (1ULL << 10)) { >>> + snprintf(buffer, sizeof(buffer), "%" PRIu64 " byte(s)", size); >>> + } else if (size < (1ULL << 20)) { >>> + snprintf(buffer, sizeof(buffer), "%" PRIu64 " KB(s)", size >> 10); >>> + } else if (size < (1ULL << 30)) { >>> + snprintf(buffer, sizeof(buffer), "%" PRIu64 " MB(s)", size >> 20); >>> + } else if (size < (1ULL << 40)) { >>> + snprintf(buffer, sizeof(buffer), "%" PRIu64 " GB(s)", size >> 30); >>> + } else { >>> + snprintf(buffer, sizeof(buffer), "%" PRIu64 " TB(s)", size >> 40); >>> + } >>> + >>> + return buffer; >> >> This returns a variable from the stack! Please make the target buffer caller defined. > > It's static, so it's formally correct. But probably not a good idea :) Oh - I missed the static there. Yeah, it's even worse. This is racy. Alex
On 09/06/2010 09:24 AM, Alexander Graf wrote: > > Oh - I missed the static there. Yeah, it's even worse. This is racy. > It's easy to refactor away so I'll just do that but it's not actually racy. It's just not re-entrant and the lifetime of the returned result is only until the next call. Regards, Anthony Liguori > Alex > > >
On 09/06/2010 08:35 AM, Daniel P. Berrange wrote: > On Mon, Sep 06, 2010 at 07:52:41AM -0500, Anthony Liguori wrote: > >> On 09/06/2010 06:18 AM, Daniel P. Berrange wrote: >> >>> I agree with ditching compression, but encryption is an important >>> capability which cannot be satisfactorily added at other layers >>> in the stack. While block devices / local filesystems can layer >>> in dm-crypt in the host, this is not possible with network/cluster >>> filesystems which account for a non-trivial target audience. >>> >> ecryptfs should work with NFS these days. If it still doesn't, it will >> in the not too distant future. >> > Assuming it does work with NFS, IIUC, that still requires the user to > have root privileges to setup ecryptfs for the NFS mount in question. > So it takes care of the use case where the host admin doesn't trust > the network/remote fs admin, but doesn't work for the case of local > unprivileged users with NFS home dirs& a host admin who doesnt help. > There's talk of moving ecryptfs from a stackable file system to a VFS feature. Among other things, this would make it usable by non-privileged users since there's really no reason for it to not be. Let's take a step back though as I'd like to point out two things. The first has feature support which means that if it's just a matter of adding something to the header and encrypting blocks, then it's super easy to add. Furthermore, you get graceful detection of failure when using an encrypted image with a version of QEMU that doesn't support encryption in QED. When creating new images that aren't encrypted with the new QEMU, the images still work with old QEMUs. So really, there's little rush to add encryption (or any feature) to QED. The main focus ATM is making we achieve good performance and good reliability. But encryption is never simple. 
If you want anything more than a toy, you really need to integrate into a
key ring system, make use of a crypto API to leverage cryptographic
accelerators, etc.  This is why relying on a filesystem (or VFS
feature) makes so much sense.

> As currently implemented the string refers to a QEMU block driver
> which is perhaps not the best choice for a general purpose file
> format, if we want this applicable to other non-QEMU apps. Perhaps
> it would be better if we explicitly declared backing format as an
> enumerated int that represents specific file formats, thus decoupling
> it from a specific driver.
>

That's one of the reasons I made this an optional feature.  I think we're
going to have to revisit the backing format in the future to be something
more meaningful.

For the purposes of the spec, I was going to say that backing_fmt was a
suggestion to an implementation on how to interpret backing_file and leave
it at that.

In terms of making something that's strictly enforced, I would suggest not
specifying the format but rather having something like is_backing_raw.
IOW, a boolean that would be set if the backing file was raw (and not
probe-able).  Otherwise, the backing format can be safely probed.  I would
then say that backing file cannot be raw unless that bit is set or
something like that.

> Another related idea is perhaps to specify that if backing_fmt is
> omitted in the metadata, the backing file must be treated as a QED
> format file, rather than probed.

!raw would be a better way of specifying it but yeah, I think it's a
reasonable idea.

Regards,

Anthony Liguori

>> More importantly, humans to create image formats by hand.  Instead, they
>> use tools like qemu-img.  If you think we should for the specification
>> of a backing file format in qemu-img, that's the place we should do it.
>> > Certainly qemu-img can always add a format, even if the specification > declared it optional, but I think its worth considering declaring it > it compulsory in the spec, to take that variable out of the equation > for apps using the images. > > Regards, > Daniel >
On 09/06/2010 09:10 AM, Kevin Wolf wrote: > >> If you implement a subset of functionality for an existing on-disk >> format, I think you damage user's expectations. >> > I don't really buy that implementing compression/encryption wouldn't > have been possible if it was the only problem. Of course, if you don't > implement it, you can't use an on-disk format that supports them. > The trouble with compression is that you don't have fixed size clusters any more. In order to support writes, you either have to write uncompressed data to the EOF leaking the compressed version or write compressed data and attempt to use a free list to avoid leaking clusters. Since cluster size isn't fixed, the free list is of variable size which means you'd have to do something sophisticated like a buddy algorithm to allocate from the free list. It's just not worth it since there's no easy way to do it correctly. Encryption is straight forward. Lack of features is a killer though. The only thing you could really do is the same type of trickery we did with qcow2 where we detect whether there's room between the header and the L1. Of course, there's nothing in qcow that really says if the L1 doesn't start at sizeof(old_header) then you have new_header so this is not technically backwards compatible. But even assuming it is, the new features introduced in new_header are undiscoverable to older version of QEMU. So if you do something that makes the image unreadable to older QEMUs (like adding a new encryption algorithm), instead of getting a nice error, you get silent corruption. qcow has had more than the QEMU implementation too so we're not the only ones that have been creating qcow images so we can't just rely on our historic behavior. IMHO, this alone justifies a new format. Regards, Anthony Liguori >> If we claim to support qcow images, then given any old qcow image I have >> laying around for 5 years ago, I should be able to run it without qemu >> throwing an error. 
>> >> There's some really ugly stuff in qcow. Nothing is actually aligned. >> This makes implementing things like O_DIRECT very challenging since you >> basically have to handle bouncing any possible buffer. Since the L1 >> table occurs immediately after the header, there's really no room to >> play any kind of tricks to add features. >> > That's a good point actually. I didn't remember that. > > Kevin >
On 09/06/2010 01:04 PM, Stefan Hajnoczi wrote: > QEMU Enhanced Disk format is a disk image format that forgoes features > found in qcow2 in favor of better levels of performance and data > integrity. Due to its simpler on-disk layout, it is possible to safely > perform metadata updates more efficiently. > > Installations, suspend-to-disk, and other allocation-heavy I/O workloads > will see increased performance due to fewer I/Os and syncs. Workloads > that do not cause new clusters to be allocated will perform similar to > raw images due to in-memory metadata caching. > > The format supports sparse disk images. It does not rely on the host > filesystem holes feature, making it a good choice for sparse disk images > that need to be transferred over channels where holes are not supported. > > Backing files are supported so only deltas against a base image can be > stored. > > The file format is extensible so that additional features can be added > later with graceful compatibility handling. > > Internal snapshots are not supported. This eliminates the need for > additional metadata to track copy-on-write clusters. > > Compression and encryption are not supported. They add complexity and > can be implemented at other layers in the stack (i.e. inside the guest > or on the host). > > The format is currently functional with the following features missing: > * Resizing the disk image. The capability has been designed in but the > code has not been written yet. > * Resetting the image after backing file commit completes. > * Changing the backing filename. > * Consistency check (fsck). This is simple due to the on-disk layout. 
> > Signed-off-by: Anthony Liguori<aliguori@us.ibm.com> > Signed-off-by: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com> > --- > This code is also available from git (for development and testing the tracing > and blkverify features are pulled in, whereas this single squashed patch > applies to mainline qemu.git): > > http://repo.or.cz/w/qemu/stefanha.git/shortlog/refs/heads/qed > > Numbers for RHEL6 install, cache=none disk image on ext3. This is an > interactive install on my laptop, so not a proper benchmark but I want to show > there is real difference today: > * raw: 4m4s > * qed: 4m21s (107%) > * qcow2: 4m46s (117%) > > Makefile.objs | 1 + > block/qcow2.c | 22 - > block/qed-cluster.c | 136 +++++++ > block/qed-gencb.c | 32 ++ > block/qed-l2-cache.c | 131 ++++++ > block/qed-table.c | 242 +++++++++++ > block/qed.c | 1103 ++++++++++++++++++++++++++++++++++++++++++++++++++ > block/qed.h | 212 ++++++++++ docs/qed.txt ++++++++++++++++++++++++++++++++++++++++++++ ?
On 09/06/2010 04:06 PM, Anthony Liguori wrote: > > Another point worth mentioning is that our intention is to have a > formal specification of the format before merging. A start of that is > located at http://wiki.qemu.org/Features/QED > > =Specification= > > The file format looks like this: > > +---------+---------+---------+-----+ > | extent0 | extent1 | extent2 | ... | > +---------+---------+---------+-----+ > > The first extent contains a header. The header contains information > about the first data extent. A data extent may be a data cluster, an > L2, or an L1 table. L1 and L2 tables are composed of one or more > contiguous extents. > > ==Header== > Header { > uint32_t magic; /* QED\0 */ Endianness? > > uint32_t cluster_size; /* in bytes */ Does cluster == extent? If so, use the same terminology. If not, explain. Usually extent is a variable size structure. > uint32_t table_size; /* table size, in clusters */ Presumably L1 table size? Or any table size? Hm. It would be nicer not to require contiguous sectors anywhere. How about a variable- or fixed-height tree? > uint32_t first_cluster; /* in clusters */ First cluster of what? > > uint64_t features; /* format feature bits */ > uint64_t compat_features; /* compat feature bits */ > uint64_t l1_table_offset; /* L1 table offset, in clusters */ > uint64_t image_size; /* total image size, in clusters */ Logical, yes? Is the physical image size always derived from the host file metadata? Is this always safe? > /* if (features & QED_F_BACKING_FILE) */ > uint32_t backing_file_offset; /* in bytes from start of header */ > uint32_t backing_file_size; /* in bytes */ It's really the filename size, not the file size. Also, make a note that it is not zero terminated. > > /* if (compat_features & QED_CF_BACKING_FORMAT) */ > uint32_t backing_fmt_offset; /* in bytes from start of header */ > uint32_t backing_fmt_size; /* in bytes */ Why not make it mandatory? > } Need a checksum for the header. 
> > ==Extent table== > > #define TABLE_NOFFSETS (table_size * cluster_size / sizeof(uint64_t)) > > Table { > uint64_t offsets[TABLE_NOFFSETS]; > } It's fashionable to put checksums here. Do we want a real extent-based format like modern filesystems? So after defragmentation a full image has O(1) metadata? > > The extent tables are organized as follows: > > +----------+ > | L1 table | > +----------+ > ,------' | '------. > +----------+ | +----------+ > | L2 table | ... | L2 table | > +----------+ +----------+ > ,------' | '------. > +----------+ | +----------+ > | Data | ... | Data | > +----------+ +----------+ > > The table_size field allows tables to be multiples of the cluster > size. For example, cluster_size=64 KB and table_size=4 results in 256 > KB tables. > > =Operations= > > ==Read== > # If L2 table is not present in L1, read from backing image. > # If data cluster is not present in L2, read from backing image. > # Otherwise read data from cluster. If not in backing image, provide zeros > > ==Write== > # If L2 table is not present in L1, allocate new cluster and L2. > Perform L2 and L1 link after writing data. > # If data cluster is not present in L2, allocate new cluster. Perform > L1 link after writing data. > # Otherwise overwrite data cluster. Detail copy-on-write from backing image. On a partial write without a backing file, do we recommend zero-filling the cluster (to avoid intra-cluster fragmentation)? > > The L2 link '''should''' be made after the data is in place on > storage. However, when no ordering is enforced the worst case > scenario is an L2 link to an unwritten cluster. Or it may cause corruption if the physical file size is not committed, and L2 now points at a free cluster. > > The L1 link '''must''' be made after the L2 cluster is in place on > storage. If the order is reversed then the L1 table may point to a > bogus L2 table. (Is this a problem since clusters are allocated at > the end of the file?) 
> > ==Grow== > # If table_size * TABLE_NOFFSETS < new_image_size, fail -EOVERFLOW. > The L1 table is not big enough. With a variable-height tree, we allocate a new root, link its first entry to the old root, and write the new header with updated root and height. > # Write new image_size header field. > > =Data integrity= > ==Write== > Writes that complete before a flush must be stable when the flush > completes. > > If storage is interrupted (e.g. power outage) then writes in progress > may be lost, stable, or partially completed. The storage must not be > otherwise corrupted or inaccessible after it is restarted. We can remove this requirement by copying-on-write any metadata write, and keeping two copies of the header (with version numbers and checksums). Enterprise storage will not corrupt on writes, but commodity storage may.
On 09/07/2010 09:51 AM, Avi Kivity wrote: I'll let Stefan address most of this. >> uint32_t first_cluster; /* in clusters */ > > First cluster of what? This should probably be header_size /* in clusters */ because that's what it really means. > > Need a checksum for the header. Is that not a bit overkill for what we're doing? What's the benefit? >> >> The L2 link '''should''' be made after the data is in place on >> storage. However, when no ordering is enforced the worst case >> scenario is an L2 link to an unwritten cluster. > > Or it may cause corruption if the physical file size is not committed, > and L2 now points at a free cluster. An fsync() will make sure the physical file size is committed. The metadata does not carry an additional integrity guarantees over the actual disk data except that in order to avoid internal corruption, we have to order the L2 and L1 writes. As part of the read process, it's important to validate that the L2 entries don't point to blocks beyond EOF. This is an indication of a corrupted I/O operation and we need to treat that as an unallocated cluster. >> >> The L1 link '''must''' be made after the L2 cluster is in place on >> storage. If the order is reversed then the L1 table may point to a >> bogus L2 table. (Is this a problem since clusters are allocated at >> the end of the file?) >> >> ==Grow== >> # If table_size * TABLE_NOFFSETS < new_image_size, fail -EOVERFLOW. >> The L1 table is not big enough. > > With a variable-height tree, we allocate a new root, link its first > entry to the old root, and write the new header with updated root and > height. > >> # Write new image_size header field. >> >> =Data integrity= >> ==Write== >> Writes that complete before a flush must be stable when the flush >> completes. >> >> If storage is interrupted (e.g. power outage) then writes in progress >> may be lost, stable, or partially completed. The storage must not be >> otherwise corrupted or inaccessible after it is restarted. 
> > We can remove this requirement by copying-on-write any metadata write, > and keeping two copies of the header (with version numbers and checksums). QED has a property today that all metadata or cluster locations have a single location on the disk format that is immutable. Defrag would relax this but defrag can be slow. Having an immutable on-disk location is a powerful property which eliminates a lot of complexity with respect to reference counting and dealing with free lists. For the initial design I would avoid introducing something like this. One of the nice things about features is that we can introduce multi-level trees as a future feature if we really think it's the right thing to do. But we should start at a simple design with high confidence and high performance, and then introduce features with the burden that we're absolutely sure that we don't regress integrity or performance. Regards, Anthony Liguori > Enterprise storage will not corrupt on writes, but commodity storage > may. >
On 09/07/2010 06:40 PM, Anthony Liguori wrote: >> >> Need a checksum for the header. > > Is that not a bit overkill for what we're doing? What's the benefit? Make sure we're not looking at a header write interrupted by a crash. >>> >>> The L2 link '''should''' be made after the data is in place on >>> storage. However, when no ordering is enforced the worst case >>> scenario is an L2 link to an unwritten cluster. >> >> Or it may cause corruption if the physical file size is not >> committed, and L2 now points at a free cluster. > > An fsync() will make sure the physical file size is committed. The > metadata does not carry an additional integrity guarantees over the > actual disk data except that in order to avoid internal corruption, we > have to order the L2 and L1 writes. I was referring to "when no ordering is enforced, the worst case scenario is an L2 link to an unwritten cluster". This isn't true - worst case you point to an unallocated cluster which can then be claimed by data or metadata. > > As part of the read process, it's important to validate that the L2 > entries don't point to blocks beyond EOF. This is an indication of a > corrupted I/O operation and we need to treat that as an unallocated > cluster. Right, but what if the first operation referring to that cluster is an allocation? >> We can remove this requirement by copying-on-write any metadata >> write, and keeping two copies of the header (with version numbers and >> checksums). > > QED has a property today that all metadata or cluster locations have a > single location on the disk format that is immutable. Defrag would > relax this but defrag can be slow. > > Having an immutable on-disk location is a powerful property which > eliminates a lot of complexity with respect to reference counting and > dealing with free lists. However, it exposes the format to "writes may corrupt overwritten data". > > For the initial design I would avoid introducing something like this. 
> One of the nice things about features is that we can introduce > multi-level trees as a future feature if we really think it's the > right thing to do. > > But we should start at a simple design with high confidence and high > performance, and then introduce features with the burden that we're > absolutely sure that we don't regress integrity or performance. For most things, yes. Metadata checksums should be designed in though (since we need to double the pointer size). Variable height trees have the nice property that you don't need multi cluster allocation. It's nice to avoid large L2s for very large disks.
On 09/07/2010 09:51 AM, Avi Kivity wrote: >> /* if (features & QED_F_BACKING_FILE) */ >> uint32_t backing_file_offset; /* in bytes from start of header */ >> uint32_t backing_file_size; /* in bytes */ > > It's really the filename size, not the file size. Also, make a note > that it is not zero terminated. > >> >> /* if (compat_features & QED_CF_BACKING_FORMAT) */ >> uint32_t backing_fmt_offset; /* in bytes from start of header */ >> uint32_t backing_fmt_size; /* in bytes */ > > Why not make it mandatory? You mean, why not make it: /* if (features & QED_F_BACKING_FILE) */ As opposed to an independent compat feature. Mandatory features mean that you cannot read an image format if you don't understand the feature. In the context of backing_format, it means you have to have all of the possible values fully defined. IOW, what are valid values for backing_fmt? "raw" and "qed" are obvious but what does it mean from a formal specification perspective to have "vmdk"? Is that VMDK v3 or v4, what if there's a v5? If we make backing_fmt a suggestion, it gives us flexibility to leave this poorly defined whereas implementation can fall back to probing if there's any doubt. For the spec, I'd like to define "raw" and "qed". I'd like to modify the qemu implementation to refuse to load an image as raw unless backing_fmt is raw but otherwise just probing. For image creation, if an explicit backing format isn't specified by the user, I'd like to insert backing_fmt=raw for probed raw images and otherwise, not specify a backing_fmt. Regards, Anthony Liguori > >> } > > Need a checksum for the header. > >> >> ==Extent table== >> >> #define TABLE_NOFFSETS (table_size * cluster_size / sizeof(uint64_t)) >> >> Table { >> uint64_t offsets[TABLE_NOFFSETS]; >> } > > It's fashionable to put checksums here. > > Do we want a real extent-based format like modern filesystems? So > after defragmentation a full image has O(1) metadata? 
> >> >> The extent tables are organized as follows: >> >> +----------+ >> | L1 table | >> +----------+ >> ,------' | '------. >> +----------+ | +----------+ >> | L2 table | ... | L2 table | >> +----------+ +----------+ >> ,------' | '------. >> +----------+ | +----------+ >> | Data | ... | Data | >> +----------+ +----------+ >> >> The table_size field allows tables to be multiples of the cluster >> size. For example, cluster_size=64 KB and table_size=4 results in >> 256 KB tables. >> >> =Operations= >> >> ==Read== >> # If L2 table is not present in L1, read from backing image. >> # If data cluster is not present in L2, read from backing image. >> # Otherwise read data from cluster. > > If not in backing image, provide zeros > >> >> ==Write== >> # If L2 table is not present in L1, allocate new cluster and L2. >> Perform L2 and L1 link after writing data. >> # If data cluster is not present in L2, allocate new cluster. >> Perform L1 link after writing data. >> # Otherwise overwrite data cluster. > > Detail copy-on-write from backing image. > > On a partial write without a backing file, do we recommend > zero-filling the cluster (to avoid intra-cluster fragmentation)? > >> >> The L2 link '''should''' be made after the data is in place on >> storage. However, when no ordering is enforced the worst case >> scenario is an L2 link to an unwritten cluster. > > Or it may cause corruption if the physical file size is not committed, > and L2 now points at a free cluster. > >> >> The L1 link '''must''' be made after the L2 cluster is in place on >> storage. If the order is reversed then the L1 table may point to a >> bogus L2 table. (Is this a problem since clusters are allocated at >> the end of the file?) >> >> ==Grow== >> # If table_size * TABLE_NOFFSETS < new_image_size, fail -EOVERFLOW. >> The L1 table is not big enough. > > With a variable-height tree, we allocate a new root, link its first > entry to the old root, and write the new header with updated root and > height. 
> >> # Write new image_size header field. >> >> =Data integrity= >> ==Write== >> Writes that complete before a flush must be stable when the flush >> completes. >> >> If storage is interrupted (e.g. power outage) then writes in progress >> may be lost, stable, or partially completed. The storage must not be >> otherwise corrupted or inaccessible after it is restarted. > > We can remove this requirement by copying-on-write any metadata write, > and keeping two copies of the header (with version numbers and > checksums). Enterprise storage will not corrupt on writes, but > commodity storage may. >
On 09/07/2010 11:09 AM, Avi Kivity wrote: > On 09/07/2010 06:40 PM, Anthony Liguori wrote: >>> >>> Need a checksum for the header. >> >> Is that not a bit overkill for what we're doing? What's the benefit? > > Make sure we're not looking at a header write interrupted by a crash. Couldn't hurt I guess. I don't think it's actually needed for L1/L2 tables FWIW. >>>> The L2 link '''should''' be made after the data is in place on >>>> storage. However, when no ordering is enforced the worst case >>>> scenario is an L2 link to an unwritten cluster. >>> >>> Or it may cause corruption if the physical file size is not >>> committed, and L2 now points at a free cluster. >> >> An fsync() will make sure the physical file size is committed. The >> metadata does not carry an additional integrity guarantees over the >> actual disk data except that in order to avoid internal corruption, >> we have to order the L2 and L1 writes. > > I was referring to "when no ordering is enforced, the worst case > scenario is an L2 link to an unwritten cluster". This isn't true - > worst case you point to an unallocated cluster which can then be > claimed by data or metadata. Right, it's necessary to do an fsync to protect against this. To make this user friendly, we could have a dirty bit in the header which gets set on first metadata write and then cleared on clean shutdown. Upon startup, if the dirty bit is set, we do an fsck. >>> We can remove this requirement by copying-on-write any metadata >>> write, and keeping two copies of the header (with version numbers >>> and checksums). >> >> QED has a property today that all metadata or cluster locations have >> a single location on the disk format that is immutable. Defrag would >> relax this but defrag can be slow. >> >> Having an immutable on-disk location is a powerful property which >> eliminates a lot of complexity with respect to reference counting and >> dealing with free lists. 
> > However, it exposes the format to "writes may corrupt overwritten data". No, you never write an L2 entry once it's been set. If an L2 entry isn't set, the contents of the cluster is all zeros. If you write data to allocate an L2 entry, until you do a flush(), the data can either be what was written or all zeros. >> For the initial design I would avoid introducing something like >> this. One of the nice things about features is that we can introduce >> multi-level trees as a future feature if we really think it's the >> right thing to do. >> >> But we should start at a simple design with high confidence and high >> performance, and then introduce features with the burden that we're >> absolutely sure that we don't regress integrity or performance. > > For most things, yes. Metadata checksums should be designed in though > (since we need to double the pointer size). > > Variable height trees have the nice property that you don't need multi > cluster allocation. It's nice to avoid large L2s for very large disks. FWIW, L2s are 256K at the moment and with a two level table, it can support 5PB of data. If we changed the tables to 128K, we could support 1PB and with 64K tables we would support 256TB. So we could definitely reduce the table sizes now to be a single cluster and it would probably cover us for the foreseeable future. Regards, Anthony Liguori
On Mon, Sep 6, 2010 at 10:04 AM, Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> wrote: > QEMU Enhanced Disk format is a disk image format that forgoes features > found in qcow2 in favor of better levels of performance and data > integrity.  Due to its simpler on-disk layout, it is possible to safely > perform metadata updates more efficiently. > > Installations, suspend-to-disk, and other allocation-heavy I/O workloads > will see increased performance due to fewer I/Os and syncs.  Workloads > that do not cause new clusters to be allocated will perform similar to > raw images due to in-memory metadata caching. > > The format supports sparse disk images.  It does not rely on the host > filesystem holes feature, making it a good choice for sparse disk images > that need to be transferred over channels where holes are not supported. > > Backing files are supported so only deltas against a base image can be > stored. > > The file format is extensible so that additional features can be added > later with graceful compatibility handling. > > Internal snapshots are not supported.  This eliminates the need for > additional metadata to track copy-on-write clusters. It would be nice to support external snapshots, so another file besides the disk images can store the snapshots. Then snapshotting would be available even with raw or QED disk images. This is of course not QED specific. > + * > + * +--------+----------+----------+----------+-----+ > + * | header | L1 table | cluster0 | cluster1 | ... | > + * +--------+----------+----------+----------+-----+ > + * > + * There is a 2-level pagetable for cluster allocation: > + * > + *           +----------+ > + *           | L1 table | > + *           +----------+ > + *         ,------'  |  '------. > + *      +----------+  |   +----------+ > + *      | L2 table |  ...  | L2 table | > + *      +----------+     +----------+ > + *    ,------'  |  '------. > + *  +----------+  |   +----------+ > + *  |  Data  |  ...  
|  Data  | > + *  +----------+     +----------+ > + * > + * The L1 table is fixed size and always present.  L2 tables are allocated on > + * demand.  The L1 table size determines the maximum possible image size; it > + * can be influenced using the cluster_size and table_size values. The formula for calculating the maximum size would be nice. Is the image_size the limit? How many clusters can there be? What happens if the image_size is not equal to multiple of cluster size? Wouldn't image_size be redundant if cluster_size and table_size determine the image size? > + * > + * All fields are little-endian on disk. > + */ > + > +typedef struct { > +   uint32_t magic;         /* QED */ > + > +   uint32_t cluster_size;      /* in bytes */ Doesn't cluster_size need to be a power of two? > +   uint32_t table_size;       /* table size, in clusters */ > +   uint32_t first_cluster;     /* first usable cluster */ This introduces some limits to the location of first cluster, with 4k clusters it must reside within the first 16TB. I guess it doesn't matter. > + > +   uint64_t features;        /* format feature bits */ > +   uint64_t compat_features;    /* compatible feature bits */ > +   uint64_t l1_table_offset;    /* L1 table offset, in bytes */ > +   uint64_t image_size;       /* total image size, in bytes */ > + > +   uint32_t backing_file_offset;  /* in bytes from start of header */ > +   uint32_t backing_file_size;   /* in bytes */ > +   uint32_t backing_fmt_offset;   /* in bytes from start of header */ > +   uint32_t backing_fmt_size;    /* in bytes */ > +} QEDHeader; > + > +typedef struct { > +   uint64_t offsets[0];       /* in bytes */ > +} QEDTable; Is this for both L1 and L2 tables?
On 09/07/2010 02:25 PM, Blue Swirl wrote: > On Mon, Sep 6, 2010 at 10:04 AM, Stefan Hajnoczi > <stefanha@linux.vnet.ibm.com> wrote: > >> QEMU Enhanced Disk format is a disk image format that forgoes features >> found in qcow2 in favor of better levels of performance and data >> integrity. Due to its simpler on-disk layout, it is possible to safely >> perform metadata updates more efficiently. >> >> Installations, suspend-to-disk, and other allocation-heavy I/O workloads >> will see increased performance due to fewer I/Os and syncs. Workloads >> that do not cause new clusters to be allocated will perform similar to >> raw images due to in-memory metadata caching. >> >> The format supports sparse disk images. It does not rely on the host >> filesystem holes feature, making it a good choice for sparse disk images >> that need to be transferred over channels where holes are not supported. >> >> Backing files are supported so only deltas against a base image can be >> stored. >> >> The file format is extensible so that additional features can be added >> later with graceful compatibility handling. >> >> Internal snapshots are not supported. This eliminates the need for >> additional metadata to track copy-on-write clusters. >> > It would be nice to support external snapshots, so another file > besides the disk images can store the snapshots. Then snapshotting > would be available even with raw or QED disk images. This is of course > not QED specific. > There's two types of snapshots that I think can cause confusion. There's CPU/device state snapshots and then there's a block device snapshot. qcow2 and qed both support block device snapshots. qed only supports external snapshots (via backing_file) whereas qcow2 supports external and internal snapshots. The internal snapshots are the source of an incredible amount of complexity in the format. qcow2 can also store CPU/device state snapshots and correlate them to block device snapshots (within a single block device). 
It only supports doing non-live CPU/device state snapshots. OTOH, qemu can support live snapshotting via live migration. Today, it can be used to snapshot CPU/device state to a file on the filesystem with minimum downtime. Combined with an external block snapshot and correlating data, this could be used to implement a single "snapshot" command that would behave like savevm but would not pause a guest's execution. It's really just a matter of plumbing to expose an interface for this today. We have all of the infrastructure we need. >> + * >> + * +--------+----------+----------+----------+-----+ >> + * | header | L1 table | cluster0 | cluster1 | ... | >> + * +--------+----------+----------+----------+-----+ >> + * >> + * There is a 2-level pagetable for cluster allocation: >> + * >> + * +----------+ >> + * | L1 table | >> + * +----------+ >> + * ,------' | '------. >> + * +----------+ | +----------+ >> + * | L2 table | ... | L2 table | >> + * +----------+ +----------+ >> + * ,------' | '------. >> + * +----------+ | +----------+ >> + * | Data | ... | Data | >> + * +----------+ +----------+ >> + * >> + * The L1 table is fixed size and always present. L2 tables are allocated on >> + * demand. The L1 table size determines the maximum possible image size; it >> + * can be influenced using the cluster_size and table_size values. >> > The formula for calculating the maximum size would be nice. table_entries = (table_size * cluster_size / 8) max_size = (table_entries) * table_entries * cluster_size it's a hell of a lot easier to do powers-of-two math though: table_entries = 2^2 * 2^16 / 2^3 = 2^15 max_size = 2^15 * 2^15 * 2^16 = 2^46 = 64TB > Is the > image_size the limit? No. > How many clusters can there be? table_entries * table_entries > What happens if > the image_size is not equal to multiple of cluster size? The code checks this and fails at open() or create() time. > Wouldn't > image_size be redundant if cluster_size and table_size determine the > image size? 
> In a two level table, if you make table_size the determining factor, the image has to be a multiple of the space spanned by the L2 tables which in the default case for qed is 2GB. >> + * >> + * All fields are little-endian on disk. >> + */ >> + >> +typedef struct { >> + uint32_t magic; /* QED */ >> + >> + uint32_t cluster_size; /* in bytes */ >> > Doesn't cluster_size need to be a power of two? > Yes. It's enforced at open() and create() time but needs to be in the spec. >> + uint32_t table_size; /* table size, in clusters */ >> + uint32_t first_cluster; /* first usable cluster */ >> > This introduces some limits to the location of first cluster, with 4k > clusters it must reside within the first 16TB. I guess it doesn't > matter. > first_cluster is a bad name. It should be header_size and yeah, there is a limit on header_size. >> + >> + uint64_t features; /* format feature bits */ >> + uint64_t compat_features; /* compatible feature bits */ >> + uint64_t l1_table_offset; /* L1 table offset, in bytes */ >> + uint64_t image_size; /* total image size, in bytes */ >> + >> + uint32_t backing_file_offset; /* in bytes from start of header */ >> + uint32_t backing_file_size; /* in bytes */ >> + uint32_t backing_fmt_offset; /* in bytes from start of header */ >> + uint32_t backing_fmt_size; /* in bytes */ >> +} QEDHeader; >> + >> +typedef struct { >> + uint64_t offsets[0]; /* in bytes */ >> +} QEDTable; >> > Is this for both L1 and L2 tables? > Yes, which has the nice advantage of simplifying the code quite a bit. Regards, Anthony Liguori
On Tue, Sep 07, 2010 at 11:12:15AM -0500, Anthony Liguori wrote: > IOW, what are valid values for backing_fmt? "raw" and "qed" are obvious > but what does it mean from a formal specification perspective to have > "vmdk"? Is that VMDK v3 or v4, what if there's a v5? It might be better to just use a uint16_t field for the backing format, where each valid format gets a bit position assigned. For now just raw, qed and qcow2 would be enough.
On 09/07/2010 11:25 AM, Anthony Liguori wrote: > On 09/07/2010 11:09 AM, Avi Kivity wrote: >> On 09/07/2010 06:40 PM, Anthony Liguori wrote: >>>> >>>> Need a checksum for the header. >>> >>> Is that not a bit overkill for what we're doing? What's the benefit? >> >> Make sure we're not looking at a header write interrupted by a crash. > > Couldn't hurt I guess. I don't think it's actually needed for L1/L2 > tables FWIW. > >>>>> The L2 link '''should''' be made after the data is in place on >>>>> storage. However, when no ordering is enforced the worst case >>>>> scenario is an L2 link to an unwritten cluster. >>>> >>>> Or it may cause corruption if the physical file size is not >>>> committed, and L2 now points at a free cluster. >>> >>> An fsync() will make sure the physical file size is committed. The >>> metadata does not carry an additional integrity guarantees over the >>> actual disk data except that in order to avoid internal corruption, >>> we have to order the L2 and L1 writes. >> >> I was referring to "when no ordering is enforced, the worst case >> scenario is an L2 link to an unwritten cluster". This isn't true - >> worst case you point to an unallocated cluster which can then be >> claimed by data or metadata. > > Right, it's necessary to do an fsync to protect against this. To make > this user friendly, we could have a dirty bit in the header which gets > set on first metadata write and then cleared on clean shutdown. > > Upon startup, if the dirty bit is set, we do an fsck. > >>>> We can remove this requirement by copying-on-write any metadata >>>> write, and keeping two copies of the header (with version numbers >>>> and checksums). >>> >>> QED has a property today that all metadata or cluster locations have >>> a single location on the disk format that is immutable. Defrag >>> would relax this but defrag can be slow. 
>>> >>> Having an immutable on-disk location is a powerful property which >>> eliminates a lot of complexity with respect to reference counting >>> and dealing with free lists. >> >> However, it exposes the format to "writes may corrupt overwritten data". > > No, you never write an L2 entry once it's been set. If an L2 entry > isn't set, the contents of the cluster is all zeros. > > If you write data to allocate an L2 entry, until you do a flush(), the > data can either be what was written or all zeros. > >>> For the initial design I would avoid introducing something like >>> this. One of the nice things about features is that we can >>> introduce multi-level trees as a future feature if we really think >>> it's the right thing to do. >>> >>> But we should start at a simple design with high confidence and high >>> performance, and then introduce features with the burden that we're >>> absolutely sure that we don't regress integrity or performance. >> >> For most things, yes. Metadata checksums should be designed in >> though (since we need to double the pointer size). >> >> Variable height trees have the nice property that you don't need >> multi cluster allocation. It's nice to avoid large L2s for very >> large disks. > > FWIW, L2s are 256K at the moment and with a two level table, it can > support 5PB of data. I clearly suck at basic math today. The image supports 64TB today. Dropping to 128K tables would reduce it to 16TB and 64k tables would be 4TB. BTW, I don't think your checksumming idea is sound. If you store a 64-bit checksum alongside each pointer, it becomes necessary to update the parent pointer every time the table changes. This introduces an ordering requirement which means you need to sync() the file every time you update an L2 entry. Today, we only need to sync() when we first allocate an L2 entry (because their locations never change). From a performance perspective, it's the difference between an fsync() every 64k vs. every 2GB. 
Plus, doesn't btrfs do block level checksumming? IOW, if you run a workload where you care about this level of data integrity validation, if you did btrfs + qed, you would be fine. Since the majority of file systems don't do metadata checksumming, it's not obvious to me that we should be. I think one of the critical flaws in qcow2 was trying to invent a better filesystem within qemu instead of just sticking to a very simple and obviously correct format and letting the FS folks do the really fancy stuff. Regards, Anthony Liguori
On 09/07/2010 04:35 PM, Christoph Hellwig wrote: > On Tue, Sep 07, 2010 at 11:12:15AM -0500, Anthony Liguori wrote: > >> IOW, what are valid values for backing_fmt? "raw" and "qed" are obvious >> but what does it mean from a formal specification perspective to have >> "vmdk"? Is that VMDK v3 or v4, what if there's a v5? >> > It might be better to just use a uint16_t field for the backing format, > where each valid format gets a bit position assigned. For now just raw, > qed and qcow2 would be enough. > If it were just one bit for just raw or not raw, wouldn't that be enough? Everything that isn't raw can be probed reliably so we really only need to distinguish between things that are probe-able and things that are not probe-able. Regards, Anthony Liguori
On Tue, Sep 07, 2010 at 05:29:53PM -0500, Anthony Liguori wrote: > If it were just one bit for just raw or not raw, wouldn't that be enough? > > Everything that isn't raw can be probed reliably so we really only need > to distinguish between things that are probe-able and things that are > not probe-able. That might work as well. The important point is to not encode the formats as strings, which is not a very useful portable encoding.
Am 07.09.2010 22:41, schrieb Anthony Liguori: > There's two types of snapshots that I think can cause confusion. > There's CPU/device state snapshots and then there's a block device snapshot. > > qcow2 and qed both support block device snapshots. qed only supports > external snapshots (via backing_file) whereas qcow2 supports external > and internal snapshots. The internal snapshots are the source of an > incredible amount of complexity in the format. > > qcow2 can also store CPU/device state snapshots and correlate them to > block device snapshots (within a single block device). It only supports > doing non-live CPU/device state snapshots. Which is not a property of the format, but of the implementation. I think it shouldn't be too hard to introduce live snapshots. > OTOH, qemu can support live snapshotting via live migration. Today, it > can be used to snapshot CPU/device state to a file on the filesystem > with minimum downtime. > > Combined with an external block snapshot and correlating data, this > could be used to implement a single "snapshot" command that would behave > like savevm but would not pause a guest's execution. We'd need fields for referencing a VM state file from a QED image, just like it's already done for backing files. Kevin
On 09/08/2010 01:27 AM, Anthony Liguori wrote: >> FWIW, L2s are 256K at the moment and with a two level table, it can >> support 5PB of data. > > > I clearly suck at basic math today. The image supports 64TB today. > Dropping to 128K tables would reduce it to 16TB and 64k tables would > be 4TB. Maybe we should do three levels then. Some users are bound to complain about 64TB. > > BTW, I don't think your checksumming idea is sound. If you store a > 64-bit checksum along side each point, it becomes necessary to update > the parent pointer every time the table changes. This introduces an > ordering requirement which means you need to sync() the file every > time you update and L2 entry. Even worse, if the crash happens between an L2 update and an L1 checksum update, the entire cluster goes away. You really want allocate-on-write for this. > Today, we only need to sync() when we first allocate an L2 entry > (because their locations never change). From a performance > perspective, it's the difference between an fsync() every 64k vs. > every 2GB. Yup. From a correctness perspective, it's the difference between a corrupted filesystem on almost every crash and a corrupted filesystem in some very rare cases. > > Plus, doesn't btrfs do block level checksumming? IOW, if you run a > workload where you care about this level of data integrity validation, > if you did btrfs + qed, you would be fine. Or just btrfs by itself (use btrfs for snapshots and base images, use qemu-img convert for shipping). > > Since the majority of file systems don't do metadata checksumming, > it's not obvious to me that we should be. The logic is that as data sizes increase, the probablity of error increases. > I think one of the critical flaws in qcow2 was trying to invent a > better filesystem within qemu instead of just sticking to a very > simple and obviously correct format and letting the FS folks do the > really fancy stuff. 
Well, if we introduce a minimal format, we need to make sure it isn't too minimal. I'm still not sold on the idea. What we're doing now is pushing the qcow2 complexity to users. We don't have to worry about refcounts now, but users have to worry whether the machine they're copying the image to supports qed or not. The performance problems with qcow2 are solvable. If we preallocate clusters, the performance characteristics become essentially the same as qed.
On 08.09.2010, at 10:23, Avi Kivity wrote: > On 09/08/2010 01:27 AM, Anthony Liguori wrote: >>> FWIW, L2s are 256K at the moment and with a two level table, it can support 5PB of data. >> >> >> I clearly suck at basic math today. The image supports 64TB today. Dropping to 128K tables would reduce it to 16TB and 64k tables would be 4TB. > > Maybe we should do three levels then. Some users are bound to complain about 64TB. Why 3 levels? Can't the L2 size be dynamic? Then big images get a big L2 map while small images get a smaller one. Alex
On 09/08/2010 11:41 AM, Alexander Graf wrote: > On 08.09.2010, at 10:23, Avi Kivity wrote: > >> On 09/08/2010 01:27 AM, Anthony Liguori wrote: >>>> FWIW, L2s are 256K at the moment and with a two level table, it can support 5PB of data. >>> >>> I clearly suck at basic math today. The image supports 64TB today. Dropping to 128K tables would reduce it to 16TB and 64k tables would be 4TB. >> Maybe we should do three levels then. Some users are bound to complain about 64TB. > Why 3 levels? Can't the L2 size be dynamic? Then big images get a big L2 map while small images get a smaller one. > Dunno, just seems more regular to me. Image resize doesn't need to relocate the L2 table in case it overflows. The overhead from three levels is an extra table, which is negligible. With 64K tables, the maximum image size is 32PiB, which is 14 bits away from a 2TB disk, giving us about 30 years.
Here is a summary of how qed images can be accessed safely after a crash or power loss. First off, we only need to consider write operations since read operations do not change the state of the image file and cannot lead to metadata corruption. There are two types of writes. Allocating writes which are necessary when no cluster has been allocated for this logical block and in-place writes when a cluster has previously been allocated. In-place writes overwrite old data in the image file. They do not allocate new clusters or update any metadata. This is why write performance is comparable to raw in the long run. Once you've done the hard work of allocating a cluster you can write and re-write its sectors because the cluster stays put. The failure scenario here is the same as for a raw image: power loss means that data may or may not be written to disk and perhaps not all sectors were written. It is up to the guest to handle recovery and the qed metadata has not been corrupted. Allocating writes fall into two cases: 1. There is no existing L2 table to link the data cluster into. Allocate and write the data cluster, allocate an L2 table, link up the data cluster in the L2 table, fsync(), and link up the L2 table in the L1 table. Notice the fsync() between the L2 update and L1 update ensures that the L1 table always points to a complete L2 table. 2. There is an existing L2 table to link the data cluster into. Allocate and write the data cluster, link up the data cluster in the L2 table. Notice that there is no flush operation between writing the data and updating the metadata. Since there is no ordering imposed between the data write and metadata update, the following scenarios may occur on crash: 1. Neither data write nor metadata update reach the disk. This is fine, qed metadata has not been corrupted. 2. Data reaches disk but metadata update does not. We have leaked a cluster but not corrupted metadata. Leaked clusters can be detected with qemu-img check. 
Note that if file size is not a multiple of cluster size, then the file size is rounded down by cluster size. That means the next cluster allocation will claim the partial write at the end of the file. 3. Metadata update reaches disk but data does not. The interesting case! The L2 table now points to a cluster which is beyond the last cluster in the image file. Remember that file size is rounded down by cluster size, so partial data writes are discarded and this case applies. Now we're in trouble. The image cannot be accessed without some sanity checking because not only do table entries point to invalid clusters, but new allocating writes might make previously invalid cluster offsets valid again (then there would be two or more table entries pointing to the same cluster)! Anthony's suggestion is to use a "mounted" or "dirty" bit in the qed header to detect a crashed image when opening the image file. If no crash has occurred, then the mounted bit is unset and normal operation is safe. If the mounted bit is set, then a check of the L1/L2 tables must be performed and any invalid cluster offsets must be cleared to zero. When an invalid cluster is cleared to zero, we arrive back at case 1 above: neither data write nor metadata update reached the disk, and we are in a safe state. 4. Both data and metadata reach disk. No problem. Have I missed anything? Stefan
On 09/08/2010 03:23 AM, Avi Kivity wrote: > On 09/08/2010 01:27 AM, Anthony Liguori wrote: >>> FWIW, L2s are 256K at the moment and with a two level table, it can >>> support 5PB of data. >> >> >> I clearly suck at basic math today. The image supports 64TB today. >> Dropping to 128K tables would reduce it to 16TB and 64k tables would >> be 4TB. > > Maybe we should do three levels then. Some users are bound to > complain about 64TB. That's just the default size. The table size and cluster sizes are configurable. Without changing the cluster size, the image can support up to 1PB. >> >> BTW, I don't think your checksumming idea is sound. If you store a >> 64-bit checksum along side each point, it becomes necessary to update >> the parent pointer every time the table changes. This introduces an >> ordering requirement which means you need to sync() the file every >> time you update and L2 entry. > > Even worse, if the crash happens between an L2 update and an L1 > checksum update, the entire cluster goes away. You really want > allocate-on-write for this. > >> Today, we only need to sync() when we first allocate an L2 entry >> (because their locations never change). From a performance >> perspective, it's the difference between an fsync() every 64k vs. >> every 2GB. > > Yup. From a correctness perspective, it's the difference between a > corrupted filesystem on almost every crash and a corrupted filesystem > in some very rare cases. I'm not sure I understand your corruption comment. Are you claiming that without checksumming, you'll often get corruption or are you claiming that without checksums, if you don't sync metadata updates you'll get corruption? qed is very careful about ensuring that we don't need to do syncs and we don't get corruption because of data loss. I don't necessarily buy your checksumming argument. >> Plus, doesn't btrfs do block level checksumming? 
IOW, if you run a >> workload where you care about this level of data integrity >> validation, if you did btrfs + qed, you would be fine. > > Or just btrfs by itself (use btrfs for snapshots and base images, use > qemu-img convert for shipping). > >> >> Since the majority of file systems don't do metadata checksumming, >> it's not obvious to me that we should be. > > The logic is that as data sizes increase, the probablity of error > increases. > >> I think one of the critical flaws in qcow2 was trying to invent a >> better filesystem within qemu instead of just sticking to a very >> simple and obviously correct format and letting the FS folks do the >> really fancy stuff. > > Well, if we introduce a minimal format, we need to make sure it isn't > too minimal. > > I'm still not sold on the idea. What we're doing now is pushing the > qcow2 complexity to users. We don't have to worry about refcounts > now, but users have to worry whether they're the machine they're > copying the image to supports qed or not. > > The performance problems with qcow2 are solvable. If we preallocate > clusters, the performance characteristics become essentially the same > as qed. By creating two code paths within qcow2. It's not just the reference counts, it's the lack of guaranteed alignment, compression, and some of the other poor decisions in the format. If you have two code paths in qcow2, you have non-deterministic performance because users that do reasonable things with their images will end up getting catastrophically bad performance. A new format doesn't introduce much additional complexity. We provide image conversion tool and we can almost certainly provide an in-place conversion tool that makes the process very fast. Regards, Anthony Liguori
On 09/08/2010 03:53 AM, Avi Kivity wrote: > On 09/08/2010 11:41 AM, Alexander Graf wrote: >> On 08.09.2010, at 10:23, Avi Kivity wrote: >> >>> On 09/08/2010 01:27 AM, Anthony Liguori wrote: >>>>> FWIW, L2s are 256K at the moment and with a two level table, it >>>>> can support 5PB of data. >>>> >>>> I clearly suck at basic math today. The image supports 64TB >>>> today. Dropping to 128K tables would reduce it to 16TB and 64k >>>> tables would be 4TB. >>> Maybe we should do three levels then. Some users are bound to >>> complain about 64TB. >> Why 3 levels? Can't the L2 size be dynamic? Then big images get a big >> L2 map while small images get a smaller one. >> > > Dunno, just seems more regular to me. Image resize doesn't need to > relocate the L2 table in case it overflows. > > The overhead from three levels is an extra table, which is negligible. It means an extra I/O request in the degenerate case whereas increasing the table size only impacts the size of the metadata. A 10GB image currently has 1.2MB of metadata in QED today. A 1TB image uses 128MB of metadata. The ratio of metadata is about 0.01%. A three level table adds an additional I/O request in order to reduce metadata. But the metadata is small enough today that I don't see the point. Regards, Anthony Liguori > With 64K tables, the maximum image size is 32PiB, which is 14 bits > away from a 2TB disk, giving us about 30 years. >
Am 08.09.2010 14:48, schrieb Anthony Liguori: > On 09/08/2010 03:23 AM, Avi Kivity wrote: >> On 09/08/2010 01:27 AM, Anthony Liguori wrote: >>>> FWIW, L2s are 256K at the moment and with a two level table, it can >>>> support 5PB of data. >>> >>> >>> I clearly suck at basic math today. The image supports 64TB today. >>> Dropping to 128K tables would reduce it to 16TB and 64k tables would >>> be 4TB. >> >> Maybe we should do three levels then. Some users are bound to >> complain about 64TB. > > That's just the default size. The table size and cluster sizes are > configurable. Without changing the cluster size, the image can support > up to 1PB. > >>> >>> BTW, I don't think your checksumming idea is sound. If you store a >>> 64-bit checksum along side each point, it becomes necessary to update >>> the parent pointer every time the table changes. This introduces an >>> ordering requirement which means you need to sync() the file every >>> time you update and L2 entry. >> >> Even worse, if the crash happens between an L2 update and an L1 >> checksum update, the entire cluster goes away. You really want >> allocate-on-write for this. >> >>> Today, we only need to sync() when we first allocate an L2 entry >>> (because their locations never change). From a performance >>> perspective, it's the difference between an fsync() every 64k vs. >>> every 2GB. >> >> Yup. From a correctness perspective, it's the difference between a >> corrupted filesystem on almost every crash and a corrupted filesystem >> in some very rare cases. > > I'm not sure I understand you're corruption comment. Are you claiming > that without checksumming, you'll often get corruption or are you > claiming that without checksums, if you don't sync metadata updates > you'll get corruption? > > qed is very careful about ensuring that we don't need to do syncs and we > don't get corruption because of data loss. I don't necessarily buy your > checksumming argument. 
> >>> Plus, doesn't btrfs do block level checksumming? IOW, if you run a >>> workload where you care about this level of data integrity >>> validation, if you did btrfs + qed, you would be fine. >> >> Or just btrfs by itself (use btrfs for snapshots and base images, use >> qemu-img convert for shipping). >> >>> >>> Since the majority of file systems don't do metadata checksumming, >>> it's not obvious to me that we should be. >> >> The logic is that as data sizes increase, the probablity of error >> increases. >> >>> I think one of the critical flaws in qcow2 was trying to invent a >>> better filesystem within qemu instead of just sticking to a very >>> simple and obviously correct format and letting the FS folks do the >>> really fancy stuff. >> >> Well, if we introduce a minimal format, we need to make sure it isn't >> too minimal. >> >> I'm still not sold on the idea. What we're doing now is pushing the >> qcow2 complexity to users. We don't have to worry about refcounts >> now, but users have to worry whether they're the machine they're >> copying the image to supports qed or not. >> >> The performance problems with qcow2 are solvable. If we preallocate >> clusters, the performance characteristics become essentially the same >> as qed. > > By creating two code paths within qcow2. It's not just the reference > counts, it's the lack of guaranteed alignment, compression, and some of > the other poor decisions in the format. I'm not aware of any unaligned data in qcow2. Compression can leave some sectors sparse, but that's something the FS has to deal with, not qcow2. > If you have two code paths in qcow2, you have non-deterministic > performance because users that do reasonable things with their images > will end up getting catastrophically bad performance. Compression and encryption lead to bad performance, yes. These are very clear criteria and something very easy to understand for users. 
I've never heard any user complain about this "non-deterministic" behaviour. > A new format doesn't introduce much additional complexity. We provide > image conversion tool and we can almost certainly provide an in-place > conversion tool that makes the process very fast. I'm not convinced that in-place conversion is worth the trouble. Kevin
On 09/08/2010 08:20 AM, Kevin Wolf wrote: > Am 08.09.2010 14:48, schrieb Anthony Liguori: > >> On 09/08/2010 03:23 AM, Avi Kivity wrote: >> >>> On 09/08/2010 01:27 AM, Anthony Liguori wrote: >>> >>>>> FWIW, L2s are 256K at the moment and with a two level table, it can >>>>> support 5PB of data. >>>>> >>>> >>>> I clearly suck at basic math today. The image supports 64TB today. >>>> Dropping to 128K tables would reduce it to 16TB and 64k tables would >>>> be 4TB. >>>> >>> Maybe we should do three levels then. Some users are bound to >>> complain about 64TB. >>> >> That's just the default size. The table size and cluster sizes are >> configurable. Without changing the cluster size, the image can support >> up to 1PB. >> >> >>>> BTW, I don't think your checksumming idea is sound. If you store a >>>> 64-bit checksum along side each point, it becomes necessary to update >>>> the parent pointer every time the table changes. This introduces an >>>> ordering requirement which means you need to sync() the file every >>>> time you update and L2 entry. >>>> >>> Even worse, if the crash happens between an L2 update and an L1 >>> checksum update, the entire cluster goes away. You really want >>> allocate-on-write for this. >>> >>> >>>> Today, we only need to sync() when we first allocate an L2 entry >>>> (because their locations never change). From a performance >>>> perspective, it's the difference between an fsync() every 64k vs. >>>> every 2GB. >>>> >>> Yup. From a correctness perspective, it's the difference between a >>> corrupted filesystem on almost every crash and a corrupted filesystem >>> in some very rare cases. >>> >> I'm not sure I understand you're corruption comment. Are you claiming >> that without checksumming, you'll often get corruption or are you >> claiming that without checksums, if you don't sync metadata updates >> you'll get corruption? 
>> >> qed is very careful about ensuring that we don't need to do syncs and we >> don't get corruption because of data loss. I don't necessarily buy your >> checksumming argument. >> >> >>>> Plus, doesn't btrfs do block level checksumming? IOW, if you run a >>>> workload where you care about this level of data integrity >>>> validation, if you did btrfs + qed, you would be fine. >>>> >>> Or just btrfs by itself (use btrfs for snapshots and base images, use >>> qemu-img convert for shipping). >>> >>> >>>> Since the majority of file systems don't do metadata checksumming, >>>> it's not obvious to me that we should be. >>>> >>> The logic is that as data sizes increase, the probablity of error >>> increases. >>> >>> >>>> I think one of the critical flaws in qcow2 was trying to invent a >>>> better filesystem within qemu instead of just sticking to a very >>>> simple and obviously correct format and letting the FS folks do the >>>> really fancy stuff. >>>> >>> Well, if we introduce a minimal format, we need to make sure it isn't >>> too minimal. >>> >>> I'm still not sold on the idea. What we're doing now is pushing the >>> qcow2 complexity to users. We don't have to worry about refcounts >>> now, but users have to worry whether they're the machine they're >>> copying the image to supports qed or not. >>> >>> The performance problems with qcow2 are solvable. If we preallocate >>> clusters, the performance characteristics become essentially the same >>> as qed. >>> >> By creating two code paths within qcow2. It's not just the reference >> counts, it's the lack of guaranteed alignment, compression, and some of >> the other poor decisions in the format. >> > I'm not aware of any unaligned data in qcow2. Compression can leave some > sectors sparse, but that's something the FS has to deal with, not qcow2. 
> If my memory serves, you changed qcow2 some time ago to make sure that metadata is aligned but historically, we didn't always do that and the qcow2 doesn't enforce that metadata is aligned. This means that if you did try to make a version of qcow2 that was totally async or really just was fast, you'd have to make sure you dealt with unaligned accesses and bounced buffers accordingly. >> If you have two code paths in qcow2, you have non-deterministic >> performance because users that do reasonable things with their images >> will end up getting catastrophically bad performance. >> > Compression and encryption lead to bad performance, yes. These are very > clear criteria and something very easy to understand for users. I've > never heard any user complain about this "non-deterministic" behaviour. > That's because qcow2 has always been limited in its performance so it's quite deterministic :-) Don't get me wrong, you and others have done amazing things making qcow2 better than it was and it's pretty reasonable when dealing with IDE and a single backing spindle, but when dealing with virtio and a large storage array, it simply doesn't even come close to raw. FWIW, we'll have numbers later this week with a detailed comparison. Regards, Anthony Liguori >> A new format doesn't introduce much additional complexity. We provide >> image conversion tool and we can almost certainly provide an in-place >> conversion tool that makes the process very fast. >> > I'm not convinced that in-place conversion is worth the trouble. > > Kevin >
Am 08.09.2010 15:26, schrieb Anthony Liguori: > On 09/08/2010 08:20 AM, Kevin Wolf wrote: >> Am 08.09.2010 14:48, schrieb Anthony Liguori: >>>>> I think one of the critical flaws in qcow2 was trying to invent a >>>>> better filesystem within qemu instead of just sticking to a very >>>>> simple and obviously correct format and letting the FS folks do the >>>>> really fancy stuff. >>>>> >>>> Well, if we introduce a minimal format, we need to make sure it isn't >>>> too minimal. >>>> >>>> I'm still not sold on the idea. What we're doing now is pushing the >>>> qcow2 complexity to users. We don't have to worry about refcounts >>>> now, but users have to worry whether they're the machine they're >>>> copying the image to supports qed or not. >>>> >>>> The performance problems with qcow2 are solvable. If we preallocate >>>> clusters, the performance characteristics become essentially the same >>>> as qed. >>>> >>> By creating two code paths within qcow2. It's not just the reference >>> counts, it's the lack of guaranteed alignment, compression, and some of >>> the other poor decisions in the format. >>> >> I'm not aware of any unaligned data in qcow2. Compression can leave some >> sectors sparse, but that's something the FS has to deal with, not qcow2. >> > > If my memory serves, you changed qcow2 some time ago to make sure that > metadata is aligned but historically, we didn't always do that and the > qcow2 doesn't enforce that metadata is aligned. I can't remember any such change, but the problem might as well be on my side. In any case, if it was like you say, we would still have to accept unaligned data or we would have broken compatibility. Maybe you mean that historically the qcow2 driver was accessing single table entries instead of the whole table, and that was an unaligned access? That was only a bad implementation, though. 
> This means that if you did try to make a version of qcow2 that was > totally async or really just was fast, you'd have to make sure you dealt > with unaligned accesses and bounced buffers accordingly. Right. Though even if some obscure data was unaligned, what really matters are L1/L2 tables and refcount tables/blocks. And these are definitely cluster aligned. >>> If you have two code paths in qcow2, you have non-deterministic >>> performance because users that do reasonable things with their images >>> will end up getting catastrophically bad performance. >>> >> Compression and encryption lead to bad performance, yes. These are very >> clear criteria and something very easy to understand for users. I've >> never heard any user complain about this "non-deterministic" behaviour. > > That's because qcow2 has always been limited in it's performance so it's > quite deterministic :-) Run an installation on an encrypted qcow2 and one on a "normal" qcow2 image. Last time I tried there was a bit of a difference... Kevin
On Tue, Sep 7, 2010 at 3:51 PM, Avi Kivity <avi@redhat.com> wrote: >  On 09/06/2010 04:06 PM, Anthony Liguori wrote: >> >> Another point worth mentioning is that our intention is to have a formal >> specification of the format before merging.  A start of that is located at >> http://wiki.qemu.org/Features/QED >> > >> =Specification= >> >> The file format looks like this: >> >>  +---------+---------+---------+-----+ >>  | extent0 | extent1 | extent1 | ... | >>  +---------+---------+---------+-----+ >> >> The first extent contains a header.  The header contains information about >> the first data extent.  A data extent may be a data cluster, an L2, or an L1 >> table.  L1 and L2 tables are composed of one or more contiguous extents. >> >> ==Header== >>  Header { >>   uint32_t magic;        /* QED\0 */ > > Endianness? Little-endian for all metadata. Updated on wiki page. >> >>   uint32_t cluster_size;     /* in bytes */ > > Does cluster == extent?  If so, use the same terminology.  If not, explain. > > Usually extent is a variable size structure. QED does not use extents. It uses fixed size clusters, 64 KB by default but configurable at image creation time. The wiki page has been fleshed out more to describe the cluster-based layout. >>   uint32_t table_size;      /* table size, in clusters */ > > Presumably L1 table size?  Or any table size? > > Hm.  It would be nicer not to require contiguous sectors anywhere.  How > about a variable- or fixed-height tree? Both extents and fancier trees don't fit the philosophy, which is to keep things straightforward and fast by doing less. With extents and trees you've got something that looks much more like a full-blown filesystem. Is there an essential feature or characteristic that QED cannot provide in its current design? >>   uint32_t first_cluster;    /* in clusters */ > > First cluster of what? 
> >> >>   uint64_t features;       /* format feature bits */ >>   uint64_t compat_features;   /* compat feature bits */ >>   uint64_t l1_table_offset;   /* L1 table offset, in clusters */ >>   uint64_t image_size;      /* total image size, in clusters */ > > Logical, yes? Yes. Wiki updated. > Is the physical image size always derived from the host file metadata?  Is > this always safe? In my email summarizing crash scenarios and recovery we cover the bases and I think it is safe to rely on file size as physical image size. The drawback is that you need a host filesystem and cannot directly use a bare block device. I think that is acceptable for a sparse format, otherwise we'd be using raw. > >>   /* if (features & QED_F_BACKING_FILE) */ >>   uint32_t backing_file_offset; /* in bytes from start of header */ >>   uint32_t backing_file_size;  /* in bytes */ > > It's really the filename size, not the file size.  Also, make a note that it > is not zero terminated. Fixed both on wiki. > >> >>   /* if (compat_features & QED_CF_BACKING_FORMAT) */ >>   uint32_t backing_fmt_offset;  /* in bytes from start of header */ >>   uint32_t backing_fmt_size;   /* in bytes */ > > Why not make it mandatory? > > >>  } > > Need a checksum for the header. > >> >> ==Extent table== >> >>  #define TABLE_NOFFSETS (table_size * cluster_size / sizeof(uint64_t)) >> >>  Table { >>   uint64_t offsets[TABLE_NOFFSETS]; >>  } > > It's fashionable to put checksums here. > > Do we want a real extent-based format like modern filesystems?  So after > defragmentation a full image has O(1) metadata? > >> >> The extent tables are organized as follows: >> >>           +----------+ >>           | L1 table | >>           +----------+ >>        ,------'  |  '------. >>      +----------+  |   +----------+ >>      | L2 table |  ...  | L2 table | >>      +----------+     +----------+ >>    ,------'  |  '------. >>  +----------+  |   +----------+ >>  |  Data  |  ...  
|  Data  | >>  +----------+     +----------+ >> >> The table_size field allows tables to be multiples of the cluster size. >>  For example, cluster_size=64 KB and table_size=4 results in 256 KB tables. >> >> =Operations= >> >> ==Read== >> # If L2 table is not present in L1, read from backing image. >> # If data cluster is not present in L2, read from backing image. >> # Otherwise read data from cluster. > > If not in backing image, provide zeros Wiki updated. > >> >> ==Write== >> # If L2 table is not present in L1, allocate new cluster and L2.  Perform >> L2 and L1 link after writing data. >> # If data cluster is not present in L2, allocate new cluster.  Perform L1 >> link after writing data. >> # Otherwise overwrite data cluster. > > Detail copy-on-write from backing image. > > On a partial write without a backing file, do we recommend zero-filling the > cluster (to avoid intra-cluster fragmentation)? Currently zeroes are written and with 64 KB cluster size hopefully isn't too painful. > >> >> The L2 link '''should''' be made after the data is in place on storage. >>  However, when no ordering is enforced the worst case scenario is an L2 link >> to an unwritten cluster. > > Or it may cause corruption if the physical file size is not committed, and > L2 now points at a free cluster. > >> >> The L1 link '''must''' be made after the L2 cluster is in place on >> storage.  If the order is reversed then the L1 table may point to a bogus L2 >> table.  (Is this a problem since clusters are allocated at the end of the >> file?) >> >> ==Grow== >> # If table_size * TABLE_NOFFSETS < new_image_size, fail -EOVERFLOW.  The >> L1 table is not big enough. > > With a variable-height tree, we allocate a new root, link its first entry to > the old root, and write the new header with updated root and height. > >> # Write new image_size header field. >> >> =Data integrity= >> ==Write== >> Writes that complete before a flush must be stable when the flush >> completes. 
>> >> If storage is interrupted (e.g. power outage) then writes in progress may >> be lost, stable, or partially completed.  The storage must not be otherwise >> corrupted or inaccessible after it is restarted. > > We can remove this requirement by copying-on-write any metadata write, and > keeping two copies of the header (with version numbers and checksums). >  Enterprise storage will not corrupt on writes, but commodity storage may. > > -- > error compiling committee.c: too many arguments to function > > >
On Tue, Sep 7, 2010 at 8:25 PM, Blue Swirl <blauwirbel@gmail.com> wrote: > On Mon, Sep 6, 2010 at 10:04 AM, Stefan Hajnoczi > <stefanha@linux.vnet.ibm.com> wrote: >> QEMU Enhanced Disk format is a disk image format that forgoes features >> found in qcow2 in favor of better levels of performance and data >> integrity.  Due to its simpler on-disk layout, it is possible to safely >> perform metadata updates more efficiently. >> >> Installations, suspend-to-disk, and other allocation-heavy I/O workloads >> will see increased performance due to fewer I/Os and syncs.  Workloads >> that do not cause new clusters to be allocated will perform similar to >> raw images due to in-memory metadata caching. >> >> The format supports sparse disk images.  It does not rely on the host >> filesystem holes feature, making it a good choice for sparse disk images >> that need to be transferred over channels where holes are not supported. >> >> Backing files are supported so only deltas against a base image can be >> stored. >> >> The file format is extensible so that additional features can be added >> later with graceful compatibility handling. >> >> Internal snapshots are not supported.  This eliminates the need for >> additional metadata to track copy-on-write clusters. > > It would be nice to support external snapshots, so another file > besides the disk images can store the snapshots. Then snapshotting > would be available even with raw or QED disk images. This is of course > not QED specific. > >> + * >> + * +--------+----------+----------+----------+-----+ >> + * | header | L1 table | cluster0 | cluster1 | ... | >> + * +--------+----------+----------+----------+-----+ >> + * >> + * There is a 2-level pagetable for cluster allocation: >> + * >> + *           +----------+ >> + *           | L1 table | >> + *           +----------+ >> + *         ,------'  |  '------. >> + *      +----------+  |   +----------+ >> + *      | L2 table |  ...  
| L2 table | >> + *      +----------+     +----------+ >> + *    ,------'  |  '------. >> + *  +----------+  |   +----------+ >> + *  |  Data  |  ...  |  Data  | >> + *  +----------+     +----------+ >> + * >> + * The L1 table is fixed size and always present.  L2 tables are allocated on >> + * demand.  The L1 table size determines the maximum possible image size; it >> + * can be influenced using the cluster_size and table_size values. > > The formula for calculating the maximum size would be nice. Is the > image_size the limit? How many clusters can there be? What happens if > the image_size is not equal to multiple of cluster size? Wouldn't > image_size be redundant if cluster_size and table_size determine the > image size? image_size is the logical image size, whereas TABLE_NELEMS * TABLE_NELEMS * cluster_size is the maximum logical image size (TABLE_NELEMS depends on table_size and cluster_size). I have updated the wiki page with the constraint. I don't think the specification needs to mention error behavior, that would depend on the implementation. But the specification needs to mention alignment constraints so I have added them. > >> + * >> + * All fields are little-endian on disk. >> + */ >> + >> +typedef struct { >> +   uint32_t magic;         /* QED */ >> + >> +   uint32_t cluster_size;      /* in bytes */ > > Doesn't cluster_size need to be a power of two? > >> +   uint32_t table_size;       /* table size, in clusters */ >> +   uint32_t first_cluster;     /* first usable cluster */ > > This introduces some limits to the location of first cluster, with 4k > clusters it must reside within the first 16TB. I guess it doesn't > matter. It shouldn't matter since any header that is >=16 TB means something mutated, escaped the lab, and is terrorizing the world as a qed monster image. 
> >> + >> +   uint64_t features;        /* format feature bits */ >> +   uint64_t compat_features;    /* compatible feature bits */ >> +   uint64_t l1_table_offset;    /* L1 table offset, in bytes */ >> +   uint64_t image_size;       /* total image size, in bytes */ >> + >> +   uint32_t backing_file_offset;  /* in bytes from start of header */ >> +   uint32_t backing_file_size;   /* in bytes */ >> +   uint32_t backing_fmt_offset;   /* in bytes from start of header */ >> +   uint32_t backing_fmt_size;    /* in bytes */ >> +} QEDHeader; >> + >> +typedef struct { >> +   uint64_t offsets[0];       /* in bytes */ >> +} QEDTable; > > Is this for both L1 and L2 tables? Yes, they both have the same size. Stefan
On Wed, Sep 08, 2010 at 12:15:13PM +0100, Stefan Hajnoczi wrote: > In-place writes overwrite old data in the image file. They do not > allocate new clusters or update any metadata. This is why write > performance is comparable to raw in the long run. Only if qed doesn't cause additional fragmentation. Which it currently does.
On 09/08/2010 10:38 AM, Christoph Hellwig wrote: > On Wed, Sep 08, 2010 at 12:15:13PM +0100, Stefan Hajnoczi wrote: > >> In-place writes overwrite old data in the image file. They do not >> allocate new clusters or update any metadata. This is why write >> performance is comparable to raw in the long run. >> > Only if qed doesn't cause additional fragmentation. Which it currently > does. > http://wiki.qemu.org/Features/QED/OnlineDefrag Is a spec for a very simple approach to online defrag that I hope we can implement in the near future. I think that once we have the mechanisms to freeze clusters and to swap clusters, implementing much more sophisticated defragmentation algorithms will become easy. Regards, Anthony Liguori
On Wed, Sep 8, 2010 at 3:37 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > On Tue, Sep 7, 2010 at 8:25 PM, Blue Swirl <blauwirbel@gmail.com> wrote: >> On Mon, Sep 6, 2010 at 10:04 AM, Stefan Hajnoczi >> <stefanha@linux.vnet.ibm.com> wrote: >>> QEMU Enhanced Disk format is a disk image format that forgoes features >>> found in qcow2 in favor of better levels of performance and data >>> integrity.  Due to its simpler on-disk layout, it is possible to safely >>> perform metadata updates more efficiently. >>> >>> Installations, suspend-to-disk, and other allocation-heavy I/O workloads >>> will see increased performance due to fewer I/Os and syncs.  Workloads >>> that do not cause new clusters to be allocated will perform similar to >>> raw images due to in-memory metadata caching. >>> >>> The format supports sparse disk images.  It does not rely on the host >>> filesystem holes feature, making it a good choice for sparse disk images >>> that need to be transferred over channels where holes are not supported. >>> >>> Backing files are supported so only deltas against a base image can be >>> stored. >>> >>> The file format is extensible so that additional features can be added >>> later with graceful compatibility handling. >>> >>> Internal snapshots are not supported.  This eliminates the need for >>> additional metadata to track copy-on-write clusters. >> >> It would be nice to support external snapshots, so another file >> besides the disk images can store the snapshots. Then snapshotting >> would be available even with raw or QED disk images. This is of course >> not QED specific. >> >>> + * >>> + * +--------+----------+----------+----------+-----+ >>> + * | header | L1 table | cluster0 | cluster1 | ... 
| >>> + * +--------+----------+----------+----------+-----+ >>> + * >>> + * There is a 2-level pagetable for cluster allocation: >>> + * >>> + *           +----------+ >>> + *           | L1 table | >>> + *           +----------+ >>> + *         ,------'  |  '------. >>> + *      +----------+  |   +----------+ >>> + *      | L2 table |  ...  | L2 table | >>> + *      +----------+     +----------+ >>> + *    ,------'  |  '------. >>> + *  +----------+  |   +----------+ >>> + *  |  Data  |  ...  |  Data  | >>> + *  +----------+     +----------+ >>> + * >>> + * The L1 table is fixed size and always present.  L2 tables are allocated on >>> + * demand.  The L1 table size determines the maximum possible image size; it >>> + * can be influenced using the cluster_size and table_size values. >> >> The formula for calculating the maximum size would be nice. Is the >> image_size the limit? How many clusters can there be? What happens if >> the image_size is not equal to multiple of cluster size? Wouldn't >> image_size be redundant if cluster_size and table_size determine the >> image size? > > image_size is the logical image size, whereas TABLE_NELEMS * > TABLE_NELEMS * cluster_size is the maximum logical image size > (TABLE_NELEMS depends on table_size and cluster_size).  I have updated > the wiki page with the constraint. Based on these: #define TABLE_NOFFSETS (table_size * cluster_size / sizeof(uint64_t)) header.image_size <= TABLE_NOFFSETS * TABLE_NOFFSETS * header.cluster_size, the maximum image size equals to table_size^2 * cluster_size^3 / sizeof(uint64_t)^2. Is the squaring and cubing of the terms beneficial? I mean, the size scales up fast to unusable numbers, whereas with a more linear equation (for example, allow different L1 and L2 sizes), more values could be actually usable. Again, I'm not sure if this matters at all. I think the minimum size should be table_size = 1, cluster_size = 4 bytes, 1^2 * 4^3 / 8^2 = 2 bytes, or is the minimum bigger? 
What's the minimum for cluster_size? > > I don't think the specification needs to mention error behavior, that > would depend on the implementation.  But the specification needs to > mention alignment constraints so I have added them. > >> >>> + * >>> + * All fields are little-endian on disk. >>> + */ >>> + >>> +typedef struct { >>> +   uint32_t magic;         /* QED */ >>> + >>> +   uint32_t cluster_size;      /* in bytes */ >> >> Doesn't cluster_size need to be a power of two? >> >>> +   uint32_t table_size;       /* table size, in clusters */ >>> +   uint32_t first_cluster;     /* first usable cluster */ >> >> This introduces some limits to the location of first cluster, with 4k >> clusters it must reside within the first 16TB. I guess it doesn't >> matter. > > It shouldn't matter since any header that is >=16 TB means something > mutated, escaped the lab, and is terrorizing the world as a qed > monster image. In the Wiki version this has changed to header_size in clusters. With 2GB clusters, there will be some wasted bits. By the way, perhaps cluster_size of 0 should mean 4GB? Or maybe all sizes should be expressed as an exponent to 2, then 16 bits would allow cluster sizes up to 2^64? >> >>> + >>> +   uint64_t features;        /* format feature bits */ >>> +   uint64_t compat_features;    /* compatible feature bits */ >>> +   uint64_t l1_table_offset;    /* L1 table offset, in bytes */ >>> +   uint64_t image_size;       /* total image size, in bytes */ >>> + >>> +   uint32_t backing_file_offset;  /* in bytes from start of header */ >>> +   uint32_t backing_file_size;   /* in bytes */ >>> +   uint32_t backing_fmt_offset;   /* in bytes from start of header */ >>> +   uint32_t backing_fmt_size;    /* in bytes */ >>> +} QEDHeader; >>> + >>> +typedef struct { >>> +   uint64_t offsets[0];       /* in bytes */ >>> +} QEDTable; >> >> Is this for both L1 and L2 tables? > > Yes, they both have the same size. > > Stefan >
On 09/08/2010 01:24 PM, Blue Swirl wrote: > Based on these: > #define TABLE_NOFFSETS (table_size * cluster_size / sizeof(uint64_t)) > header.image_size<= TABLE_NOFFSETS * TABLE_NOFFSETS * header.cluster_size, > the maximum image size equals to table_size^2 * cluster_size^3 / > sizeof(uint64_t)^2. Is the squaring and cubing of the terms > beneficial? I mean, the size scales up fast to unusable numbers, > whereas with a more linear equation (for example, allow different L1 > and L2 sizes), more values could be actually usable. Again, I'm not > sure if this matters at all. > > I think the minimum size should be table_size = 1, cluster_size = 4 > bytes, 1^2 * 4^3 / 8^2 = 2 bytes, or is the minimum bigger? What's > the minimum for cluster_size? > 4k. The smallest image size is 1GB. There is no upper limit on image size because clusters can be arbitrarily large. >> It shouldn't matter since any header that is>=16 TB means something >> mutated, escaped the lab, and is terrorizing the world as a qed >> monster image. >> > In the Wiki version this has changed to header_size in clusters. With > 2GB clusters, there will be some wasted bits. > 2GB clusters would waste an awful lot of space regardless. I don't think it's useful to have clusters that large. > By the way, perhaps cluster_size of 0 should mean 4GB? Or maybe all > sizes should be expressed as an exponent to 2, then 16 bits would > allow cluster sizes up to 2^64? > I don't think cluster sizes much greater than 64k actually make sense. We don't need an image format that supports > 1PB disks. Regards, Anthony Liguori
On Wed, Sep 8, 2010 at 6:35 PM, Anthony Liguori <anthony@codemonkey.ws> wrote: > On 09/08/2010 01:24 PM, Blue Swirl wrote: >> >> Based on these: >> #define TABLE_NOFFSETS (table_size * cluster_size / sizeof(uint64_t)) >> header.image_size<= TABLE_NOFFSETS * TABLE_NOFFSETS * header.cluster_size, >> the maximum image size equals to table_size^2 * cluster_size^3 / >> sizeof(uint64_t)^2. Is the squaring and cubing of the terms >> beneficial? I mean, the size scales up fast to unusable numbers, >> whereas with a more linear equation (for example, allow different L1 >> and L2 sizes), more values could be actually usable. Again, I'm not >> sure if this matters at all. >> >> I think the minimum size should be table_size = 1, cluster_size = 4 >> bytes, 1^2 * 4^3 / 8^2 = 2 bytes, or is the minimum bigger? What's >> the minimum for cluster_size? >> > > 4k. > > The smallest image size is 1GB. There is no upper limit on image size > because clusters can be arbitrarily large. That's a bit big, for example CD images are only 640M and there were smaller disks. But I guess you mean the smallest maximum size limited by the cluster_size etc, so the actual images may be even smaller. > > >>> It shouldn't matter since any header that is>=16 TB means something >>> mutated, escaped the lab, and is terrorizing the world as a qed >>> monster image. >>> >> >> In the Wiki version this has changed to header_size in clusters. With >> 2GB clusters, there will be some wasted bits. >> > > 2GB clusters would waste an awful lot of space regardless. I don't think > it's useful to have clusters that large. > >> By the way, perhaps cluster_size of 0 should mean 4GB? Or maybe all >> sizes should be expressed as an exponent to 2, then 16 bits would >> allow cluster sizes up to 2^64? >> > > I don't think cluster sizes much greater than 64k actually make sense. We > don't need an image format that supports > 1PB disks. File system developers could want to try images in exabyte ranges. 
Isn't the purpose of an image format that you can create a virtual disk that can appear to be bigger than the disk space needed?
On 09/08/2010 01:56 PM, Blue Swirl wrote: > That's a bit big, for example CD images are only 640M and there were > smaller disks. But I guess you mean the smallest maximum size limited > by the cluster_size etc, so the actual images may be even smaller. > Yes. The smallest image is one cluster. The smallest cluster is 4k so the smallest image is 4k. >> I don't think cluster sizes much greater than 64k actually make sense. We >> don't need an image format that supports> 1PB disks. >> > File system developers could want to try images in exabyte ranges. > Isn't the purpose of an image format that you can create a virtual > disk that can appear to be bigger than the disk space needed? > $ qemu-img create -f qed -o table_size=16,cluster_size=1M exabyte.qed $((1024*1024))T Formatting 'exabyte.qed', fmt=qed size=1152921504606846976 cluster_size=1048576 table_size=16 copy_on_read=off I still contend it's insane to do, but it does work and only requires a 1M cluster size. Generally speaking, max image size is: (cluster_size * table_size / 8) * (cluster_size * table_size / 8) * cluster_size Or: (2^x * 2^y / 2^3) * (2^x * 2^y / 2^3) * 2^x valid values for y are [0...4]. Valid values for x are [12...31] Solve for each range and you have 2^30...2^107 but you can't have an image > ~2^64. There's an awful lot of flexibility with just something as simple as a two level table. Regards, Anthony Liguori
On Wed, Sep 08, 2010 at 11:30:10AM -0500, Anthony Liguori wrote: > http://wiki.qemu.org/Features/QED/OnlineDefrag > > Is a spec for a very simple approach to online defrag that I hope we can > implement in the near future. I think that once we have the mechanisms > to freeze clusters and to swap clusters, implementing much more > sophisticated defragmentation algorithms will become easy. This image defragmentation might in fact cause even more fragmentation at the filesystem layer.
On 09/08/2010 03:23 PM, Christoph Hellwig wrote: > On Wed, Sep 08, 2010 at 11:30:10AM -0500, Anthony Liguori wrote: > >> http://wiki.qemu.org/Features/QED/OnlineDefrag >> >> Is a spec for a very simple approach to online defrag that I hope we can >> implement in the near future. I think that once we have the mechanisms >> to freeze clusters and to swap clusters, implementing much more >> sophisticated defragmentation algorithms will become easy. >> > This image defragmentation might in fact cause even more fragmentation > at the filesystem layer. > That's a good point. Is there a reasonable way to do this cooperatively with the underlying filesystem? BTW, the same problem would occur for sparse file system images, no? Regards, Anthony Liguori
On Wed, Sep 08, 2010 at 03:28:50PM -0500, Anthony Liguori wrote: > That's a good point. Is there a reasonable way to do this cooperatively > with the underlying filesystem? The only thing we can do easily is to try to use as large as possible extents in the allocation. Once we're at a cuple Megabytes the fragmentation doesn't matter too much. > BTW, the same problem would occur for sparse file system images, no? Sparse filesystems images are relatively probe to fragmentation, too. Some filesystems like ext4 have heuristics that try to related physical locality to logical locality, but that only helps if the filesystem is relatively empty. On XFS you can set a minimum extent size which forces the filesystem to allocate more data than nessecary and thus reduce fragmentation. That's equivalent to suggestion a above to use larger extents in the image format.
On 09/09/2010 05:35 AM, Christoph Hellwig wrote: > On Wed, Sep 08, 2010 at 03:28:50PM -0500, Anthony Liguori wrote: >> That's a good point. Is there a reasonable way to do this cooperatively >> with the underlying filesystem? > The only thing we can do easily is to try to use as large as possible > extents in the allocation. Once we're at a cuple Megabytes the > fragmentation doesn't matter too much. That only works if the initial write writes the entire extent (zero-filling a shorter write). But that both slows down that write, and quickly grows the image to its full logical size. The other thing we can do is defragment the logical image, then defragment the underlying file (if the filesystem supports it, issue the appropriate ioctl, otherwise defragment to a new file which you write linearly).
On 09/08/2010 03:55 PM, Anthony Liguori wrote: (3 levels) >> Dunno, just seems more regular to me. Image resize doesn't need to >> relocate the L2 table in case it overflows. >> >> The overhead from three levels is an extra table, which is negligible. > > > It means an extra I/O request in the degenerate case For small images, it means a single extra read per boot (and a single extra write write for the lifetime of the image). Larger images increase this, but it will always be a constant number of extra reads per boot and extra writes per image lifetime, proportional to logical image size. > whereas increasing the table size only impacts the size of the metadata. Larger L2 tables mean reduced L2 cache efficiency and longer delays while it is loaded. At 100 MB/s, a 256KB L2 takes 2.5ms compared to 0.6 ms for 64KB, perhaps not so traumatic. > > A 10GB image currently has 1.2MB of metadata in QED today. A 1TB > image uses 128MB of metadata. The ratio of metadata is about 0.01%. > > A three level table adds an additional I/O request in order to reduce > metadata. But the metadata is small enough today that I don't see the > point. The point is to allow really large images.
On 09/08/2010 03:48 PM, Anthony Liguori wrote: > On 09/08/2010 03:23 AM, Avi Kivity wrote: >> On 09/08/2010 01:27 AM, Anthony Liguori wrote: >>>> FWIW, L2s are 256K at the moment and with a two level table, it can >>>> support 5PB of data. >>> >>> >>> I clearly suck at basic math today. The image supports 64TB today. >>> Dropping to 128K tables would reduce it to 16TB and 64k tables would >>> be 4TB. >> >> Maybe we should do three levels then. Some users are bound to >> complain about 64TB. > > That's just the default size. The table size and cluster sizes are > configurable. Without changing the cluster size, the image can > support up to 1PB. Loading very large L2 tables on demand will result in very long latencies. Increasing cluster size will result in very long first write latencies. Adding an extra level results in an extra random write every 4TB. >> >>> Today, we only need to sync() when we first allocate an L2 entry >>> (because their locations never change). From a performance >>> perspective, it's the difference between an fsync() every 64k vs. >>> every 2GB. >> >> Yup. From a correctness perspective, it's the difference between a >> corrupted filesystem on almost every crash and a corrupted filesystem >> in some very rare cases. > > > I'm not sure I understand you're corruption comment. Are you claiming > that without checksumming, you'll often get corruption or are you > claiming that without checksums, if you don't sync metadata updates > you'll get corruption? No, I'm claiming that with checksums but without allocate-on-write you will have frequent (detected) data loss after power failures. Checksums need to go hand-in-hand with allocate-on-write (which happens to be the principle underlying zfs and btrfs). > > qed is very careful about ensuring that we don't need to do syncs and > we don't get corruption because of data loss. I don't necessarily buy > your checksumming argument. The requirement for checksumming comes from a different place. 
For decades we've enjoyed very low undetected bit error rates. However the actual amount of data is increasing to the point that it makes an undetectable bit error likely, just by throwing a huge amount of bits at storage. Write ordering doesn't address this issue. Virtualization is one of the uses where you have a huge number of bits. btrfs addresses this, but if you have (working) btrfs you don't need qed. Another problem is nfs; TCP and UDP checksums are incredibly weak and it is easy for a failure to bypass them. Ethernet CRCs are better, but they only work if the error is introduced after the CRC is taken and before it is verified. >> >> Well, if we introduce a minimal format, we need to make sure it isn't >> too minimal. >> >> I'm still not sold on the idea. What we're doing now is pushing the >> qcow2 complexity to users. We don't have to worry about refcounts >> now, but users have to worry whether they're the machine they're >> copying the image to supports qed or not. >> >> The performance problems with qcow2 are solvable. If we preallocate >> clusters, the performance characteristics become essentially the same >> as qed. > > By creating two code paths within qcow2. You're creating two code paths for users. > It's not just the reference counts, it's the lack of guaranteed > alignment, compression, and some of the other poor decisions in the > format. > > If you have two code paths in qcow2, you have non-deterministic > performance because users that do reasonable things with their images > will end up getting catastrophically bad performance. We can address that in the tools. "By enabling compression, you may reduce performance for multithreaded workloads. Abort/Retry/Ignore?" > > A new format doesn't introduce much additional complexity. We provide > image conversion tool and we can almost certainly provide an in-place > conversion tool that makes the process very fast. It requires users to make a decision. 
By the time qed is ready for mass deployment, 1-2 years will have passed. How many qcow2 images will be in the wild then? How much scheduled downtime will be needed? How much user confusion will be caused? Virtualization is about compatibility. In-guest compatibility first, but keeping the external environment stable is also important. We really need to exhaust the possibilities with qcow2 before giving up on it.
On 09/09/2010 09:45 AM, Avi Kivity wrote: >> >> A new format doesn't introduce much additional complexity. We >> provide image conversion tool and we can almost certainly provide an >> in-place conversion tool that makes the process very fast. > > It requires users to make a decision. By the time qed is ready for > mass deployment, 1-2 years will have passed. How many qcow2 images > will be in the wild then? How much scheduled downtime will be > needed? How much user confusion will be caused? > > Virtualization is about compatibility. In-guest compatibility first, > but keeping the external environment stable is also important. We > really need to exhaust the possibilities with qcow2 before giving up > on it. > btw, if we were starting from scratch, I'd definitely pick qed over qcow2. But we aren't starting from scratch (if we did, we wouldn't be doing x86 either).
On 09/08/2010 02:15 PM, Stefan Hajnoczi wrote: > 3. Metadata update reaches disk but data does not. The interesting > case! The L2 table now points to a cluster which is beyond the last > cluster in the image file. Remember that file size is rounded down by > cluster size, so partial data writes are discarded and this case > applies. > > Now we're in trouble. The image cannot be accessed without some > sanity checking because not only do table entries point to invalid > clusters, but new allocating writes might make previously invalid > cluster offsets valid again (then there would be two or more table > entries pointing to the same cluster)! > > Anthony's suggestion is to use a "mounted" or "dirty" bit in the qed > header to detect a crashed image when opening the image file. If no > crash has occurred, then the mounted bit is unset and normal operation > is safe. If the mounted bit is set, then an check of the L1/L2 tables > must be performed and any invalid cluster offsets must be cleared to > zero. When an invalid cluster is cleared to zero, we arrive back at > case 1 above: neither data write nor metadata update reached the disk, > and we are in a safe state. While fsck has a lovely ext2 retro feel, there's a reason it's shunned - it can take quite a while to run. A fully loaded L1 with 32K entries will require 32K random I/Os, which can take over 5 minutes on a disk that provides 100 IOPS. On a large shared disk, you'll have a lot more IOPS, but likely much fewer IOPS per guest, so if you have a power loss, fsck time per guest will likely be longer (irrespective of guest size). Preallocation, on the other hand, is amortized, or you can piggy-back its fsync on a guest flush. Note its equally applicable to qcow2 and qed.
On 09/08/2010 06:07 PM, Stefan Hajnoczi wrote: >>> uint32_t table_size; /* table size, in clusters */ >> Presumably L1 table size? Or any table size? >> >> Hm. It would be nicer not to require contiguous sectors anywhere. How >> about a variable- or fixed-height tree? > Both extents and fancier trees don't fit the philosophy, which is to > keep things straightforward and fast by doing less. With extents and > trees you've got something that looks much more like a full-blown > filesystem. Is there an essential feature or characteristic that QED > cannot provide in its current design? > Not using extents mean that random workloads on very large disks will continuously need to page in L2s (which are quite large, 256KB is large enough that you need to account for read time, not just seek time). Keeping it to two levels means that the image size is limited, not very good for an image format designed in 2010. >> Is the physical image size always derived from the host file metadata? Is >> this always safe? > In my email summarizing crash scenarios and recovery we cover the > bases and I think it is safe to rely on file size as physical image > size. The drawback is that you need a host filesystem and cannot > directly use a bare block device. I think that is acceptable for a > sparse format, otherwise we'd be using raw. Hm, we do have a use case for qcow2-over-lvm. I can't say it's something I like, but a point to consider.
On 09/09/2010 01:45 AM, Avi Kivity wrote: > Loading very large L2 tables on demand will result in very long > latencies. Increasing cluster size will result in very long first > write latencies. Adding an extra level results in an extra random > write every 4TB. It would be trivially easy to add another level of tables as a feature bit so let's delay the decision. >> >> qed is very careful about ensuring that we don't need to do syncs and >> we don't get corruption because of data loss. I don't necessarily >> buy your checksumming argument. > > The requirement for checksumming comes from a different place. For > decades we've enjoyed very low undetected bit error rates. However > the actual amount of data is increasing to the point that it makes an > undetectable bit error likely, just by throwing a huge amount of bits > at storage. Write ordering doesn't address this issue. I don't think we should optimize an image format for cheap disks and an old file system. We should optimize for the future. That means a btrfs file system and/or enterprise storage. The point of an image format is not to recreate btrfs in software. It's to provide a mechanism to allow users to move images around reasonable but once an image is present on a reasonable filesystem, we should more or less get the heck out of the way. > >> By creating two code paths within qcow2. > > You're creating two code paths for users. No, I'm creating a single path: QED. There are already two code paths: raw and qcow2. qcow2 has had such a bad history that for a lot of users, it's not even a choice. Today, users have to choose between performance and reliability or features. QED offers an opportunity to be able to tell users to just always use QED as an image format and forget about raw/qcow2/everything else. 
You can say, let's just make qcow2 better, but we've been trying that for years and we have an existence proof that we can do it in a straightforward fashion with QED. A new format doesn't introduce much additional complexity. We provide an image conversion tool and we can almost certainly provide an in-place conversion tool that makes the process very fast. > > It requires users to make a decision. By the time qed is ready for > mass deployment, 1-2 years will have passed. How many qcow2 images > will be in the wild then? How much scheduled downtime will be needed? Zero if we're smart. You can do QED stream + live migration to do a live conversion from raw to QED. > How much user confusion will be caused? User confusion is reduced if we can make strong, clear statements: all users should use QED even if they care about performance. Today, there's mass confusion because of the poor state of qcow2. > Virtualization is about compatibility. In-guest compatibility first, > but keeping the external environment stable is also important. We > really need to exhaust the possibilities with qcow2 before giving up > on it. IMHO, we're long past exhausting the possibilities with qcow2. We still haven't decided what we're going to do for 0.13.0. Are we going to ship qcow2 with awful performance (a 15 minute operation taking hours) or with compromised data integrity? It's been this way for every release since qcow2 existed. Let's not let sunk cost cloud our judgement here. qcow2 is not a properly designed image format. It was a weekend hacking session from Fabrice that he dropped in the code base and never really finished doing what he originally intended. The improvements that have been made to it are almost at the heroic level but we're only hurting our users by not moving on to something better. Regards, Anthony Liguori
On 09/09/2010 02:49 PM, Anthony Liguori wrote: > We should optimize for the future. That means a btrfs file system and/or > enterprise storage. So we should just implement a copy-on-read wrapper that generates a sparse raw image and uses FIEMAP (or whatever it is called these days) to test for the presence of extents. Then you let btrfs handle everything else... Paolo
On 09/09/2010 11:48 AM, Paolo Bonzini wrote: > On 09/09/2010 02:49 PM, Anthony Liguori wrote: >> We should optimize for the future. That means a btrfs file system and/or >> enterprise storage. > > So we should just implement a copy-on-read wrapper that generates a > sparse raw image and uses FIEMAP (or whatever it is called these days) > to test for the presence of extents. Then you let btrfs handle > everything else... My position is that we'll need a sparse image format well into the future because while btrfs may be ubiquitous as a file system, IRL, people transfer images around all of the time through dumb transports like HTTP and fat-formatted USB keys. A 100GB image with 1GB allocated cannot explode to 100GB just because HTTP is a dumb transport. Where we should do copy-on-read is a different topic. Really, I should have waited to share that feature to avoid confusing the current discussion. Regards, Anthony Liguori > > Paolo
On 09/09/2010 01:59 AM, Avi Kivity wrote: > On 09/08/2010 06:07 PM, Stefan Hajnoczi wrote: >>>> uint32_t table_size; /* table size, in clusters */ >>> Presumably L1 table size? Or any table size? >>> >>> Hm. It would be nicer not to require contiguous sectors anywhere. How >>> about a variable- or fixed-height tree? >> Both extents and fancier trees don't fit the philosophy, which is to >> keep things straightforward and fast by doing less. With extents and >> trees you've got something that looks much more like a full-blown >> filesystem. Is there an essential feature or characteristic that QED >> cannot provide in its current design? >> > > Not using extents mean that random workloads on very large disks will > continuously need to page in L2s (which are quite large, 256KB is > large enough that you need to account for read time, not just seek > time). Keeping it to two levels means that the image size is limited, > not very good for an image format designed in 2010. Define "very large disks". My target for VM images is 100GB-1TB. Practically speaking, that at least covers us for the next 5 years. Since QED has rich support for features, we can continue to evolve the format over time in a backwards compatible way. I'd rather delay supporting massively huge disks for the future when we better understand true nature of the problem. >>> Is the physical image size always derived from the host file >>> metadata? Is >>> this always safe? >> In my email summarizing crash scenarios and recovery we cover the >> bases and I think it is safe to rely on file size as physical image >> size. The drawback is that you need a host filesystem and cannot >> directly use a bare block device. I think that is acceptable for a >> sparse format, otherwise we'd be using raw. > > Hm, we do have a use case for qcow2-over-lvm. I can't say it's > something I like, but a point to consider. We specifically are not supporting that use-case in QED today. There's a good reason for it. 
For cluster allocation, we achieve good performance because for L2 cluster updates, we can avoid synchronous metadata updates (except for L1 updates). We achieve synchronous metadata updates by leveraging the underlying filesystem's metadata. The underlying filesystems are much smarter about their metadata updates. They'll keep a journal to delay synchronous updates and other fancy things. If we tried to represent the disk size in the header, we would have to do an fsync() on every cluster allocation. I can only imagine the use case for qcow2-over-lvm is performance. But the performance of QED on a file system is so much better than qcow2 that you can safely just use a file system and avoid the complexity of qcow2 over lvm. Regards, Anthony Liguori
On Thu, Sep 09, 2010 at 12:43:28PM -0500, Anthony Liguori wrote: > Define "very large disks". > > My target for VM images is 100GB-1TB. Practically speaking, that at > least covers us for the next 5 years. We have 2TB SATA disks shipping already, and people tend to produce more and more "data". I don't think adding such a limit these days is a good idea at all. It's fine to limit the (tested) implementation to around 100TB for now, but designing a new image format that doesn't reach into the petabyte range today is extremely short-sighted. > I can only imagine the use case for qcow2-over-lvm is performance. But > the performance of QED on a file system is so much better than qcow2 > that you can safely just use a file system and avoid the complexity of > qcow2 over lvm. A volume manager has many advantages over an image format. For one it allows much larger extent allocation sizes, giving you much less fragmentation. There's also lots of infrastructure for dealing with it. Last but not least using clustered lvm is much simpler than a clustered filesystem.
On Thu, Sep 09, 2010 at 12:02:26PM -0500, Anthony Liguori wrote: > My position is that we'll need a sparse image format well into the > future because while btrfs may be ubiquitous as a file system, IRL, > people transfer images around all of the time through dumb transports > like HTTP and fat-formatted USB keys. A 100GB image with 1GB allocated > cannot explode to 100GB just because HTTP is a dump transport. > > Where we should do copy-on-read is a different topic. Really, I should > have waited to share that feature to avoid confusing the current discussion. Yes, we will need an image format forever. However I'd be a much happier camper if typical production setups wouldn't use them. Either way the qed image format is something that to me looks much better than qcow2, primarily due to the simplicity. I haven't managed to fully review it yet, so I might change my opinion again.
On Thu, Sep 09, 2010 at 09:24:26AM +0300, Avi Kivity wrote: > The other thing we can do is defragment the logical image, then > defragment the underlying file (if the filesystem supports it, issue the > appropriate ioctl, otherwise defragment to a new file which you write > linearly). That's what the defragmentation code does in a slightly optimized fashion anyway - so if you want to do it from qemu just do it that way. Don't even bother calling the filesystem ioctls directly given that they just implement low-level helpers and the actual logic is in the userspace side of the defragmentation tools.
On 09/09/2010 08:02 PM, Anthony Liguori wrote: > On 09/09/2010 11:48 AM, Paolo Bonzini wrote: >> On 09/09/2010 02:49 PM, Anthony Liguori wrote: >>> We should optimize for the future. That means a btrfs file system >>> and/or >>> enterprise storage. >> >> So we should just implement a copy-on-read wrapper that generates a >> sparse raw image and uses FIEMAP (or whatever it is called these >> days) to test for the presence of extents. Then you let btrfs handle >> everything else... > > My position is that we'll need a sparse image format well into the > future because while btrfs may be ubiquitous as a file system, IRL, > people transfer images around all of the time through dumb transports > like HTTP and fat-formatted USB keys. A 100GB image with 1GB > allocated cannot explode to 100GB just because HTTP is a dump transport. > 'Export' and 'Upload' buttons would do the job. For command line users, compressing the image will remove the unallocated extents, as will 'qemu-img convert -O qcow2'. It's not as nice as having a sparse format, but on the other hand, performance and data integrity will be better, as well as the excellent snapshot support.
On 09/09/2010 03:49 PM, Anthony Liguori wrote: > On 09/09/2010 01:45 AM, Avi Kivity wrote: >> Loading very large L2 tables on demand will result in very long >> latencies. Increasing cluster size will result in very long first >> write latencies. Adding an extra level results in an extra random >> write every 4TB. > > It would be trivially easy to add another level of tables as a feature > bit so let's delay the decision. It means that you'll need to upgrade qemu to read certain images, but okay. >>> >>> qed is very careful about ensuring that we don't need to do syncs >>> and we don't get corruption because of data loss. I don't >>> necessarily buy your checksumming argument. >> >> The requirement for checksumming comes from a different place. For >> decades we've enjoyed very low undetected bit error rates. However >> the actual amount of data is increasing to the point that it makes an >> undetectable bit error likely, just by throwing a huge amount of bits >> at storage. Write ordering doesn't address this issue. > > I don't think we should optimize an image format for cheap disks and > an old file system. > > We should optimize for the future. That means a btrfs file system I wouldn't use an image format at all with btrfs. > and/or enterprise storage. That doesn't eliminate undiscovered errors (they can still come from the transport). > > The point of an image format is not to recreate btrfs in software. > It's to provide a mechanism to allow users to move images around > reasonable but once an image is present on a reasonable filesystem, we > should more or less get the heck out of the way. You can achieve exactly the same thing with qcow2. Yes, it's more work, but it's also less disruptive to users. >> >>> By creating two code paths within qcow2. >> >> You're creating two code paths for users. > > No, I'm creating a single path: QED. > > There are already two code paths: raw and qcow2. 
qcow2 has had such a > bad history that for a lot of users, it's not even a choice. qcow2 exists, people use it, and by the time qed is offered on distros (even more on enterprise distros), there will be a lot more qcow2 images. Not everyone runs qemu.git HEAD. What will you tell those people? Upgrade your image? They may still want to share it with older installations. What if they use features not present in qed? Bad luck? qcow2 is going to live forever no matter what we do. > > Today, users have to choose between performance and reliability or > features. QED offers an opportunity to be able to tell users to just > always use QED as an image format and forget about > raw/qcow2/everything else. raw will always be needed for direct volume access and shared storage. qcow2 will always be needed for old images. > > You can say, let's just make qcow2 better, but we've been trying that > for years and we have an existence proof that we can do it in a > straight forward fashion with QED. When you don't use the extra qcow2 features, it has the same performance characteristics as qed. You need to batch allocation and freeing, but that's fairly straightforward. Yes, qcow2 has a long and tortured history and qed is perfect. Starting from scratch is always easier and more fun. Except for the users. > A new format doesn't introduce much additional complexity. We provide > image conversion tool and we can almost certainly provide an in-place > conversion tool that makes the process very fast. It introduces a lot of complexity for the users who aren't qed experts. They need to make a decision. What's the impact of the change? Are the features that we lose important to us? Do we know what they are? Is there any risk? Can we make the change online or do we have to schedule downtime? Do all our hosts support qed? Improving qcow2 will be very complicated for Kevin who already looks older beyond his years [1] but very simple for users. >> >> It requires users to make a decision. 
By the time qed is ready for >> mass deployment, 1-2 years will have passed. How many qcow2 images >> will be in the wild then? How much scheduled downtime will be needed? > > Zero if we're smart. You can do QED stream + live migration to do a > live conversion from raw to QED. > Not all installations use live migration (say, desktop users). >> How much user confusion will be caused? > > User confusion is reduced if we can make strong, clear statements: all > users should use QED even if they care about performance. Today, > there's mass confusion because of the poor state of qcow2. If we improve qcow2 and make the same strong, clear statement we'll have the same results. > >> Virtualization is about compatibility. In-guest compatibility first, >> but keeping the external environment stable is also important. We >> really need to exhaust the possibilities with qcow2 before giving up >> on it. > > IMHO, we're long past exhausting the possibilities with qcow2. We > still haven't decided what we're going to do for 0.13.0. Sorry, I disagree 100%. How can you say that, when no one has yet tried, for example, batching allocations and frees? Or properly threaded it? What we've done is make qcow2 safe and a more parallel than it was. But "exhaust all possibilities"? not even close. > Are we going to ship qcow2 with awful performance (a 15 minute > operation taking hours) or with compromised data integrity? We're going to fix it. > > It's been this way for every release since qcow2 existed. Let's not > let sunk cost cloud our judgement here. Yes, new and shiny is always better. > > qcow2 is not a properly designed image format. It was a weekend > hacking session from Fabrice that he dropped in the code base and > never really finished doing what he originally intended. The > improvements that have been made to it are almost at the heroic level > but we're only hurting our users by not moving on to something better. > I don't like qcow2 either. 
But from a performance perspective, it can be made equivalent to qed with some effort. It is worthwhile to expend that effort rather than push the burden to users. > Regards, > > Anthony Liguori > > [1] okay, maybe not.
On 09/10/2010 12:01 AM, Christoph Hellwig wrote: > On Thu, Sep 09, 2010 at 09:24:26AM +0300, Avi Kivity wrote: >> The other thing we can do is defragment the logical image, then >> defragment the underlying file (if the filesystem supports it, issue the >> appropriate ioctl, otherwise defragment to a new file which you write >> linearly). > What's what the defragmentation code does in a slightly optimized > fashion anyway - so if you want to do it from qemu just do it that > way. Don't even bother calling the filesystem ioctls directly given > that they just implementa low-level helpers and the actual logic is > in the userspace side of the defragmentation tools. Well, if we ask the kernel do to it, we gain any future optimizations as well. For example, if parts of the file are already defragmented, the kernel can avoid moving that data.
On 09/09/2010 08:43 PM, Anthony Liguori wrote: >> Hm, we do have a use case for qcow2-over-lvm. I can't say it's >> something I like, but a point to consider. > > > We specifically are not supporting that use-case in QED today. > There's a good reason for it. For cluster allocation, we achieve good > performance because for L2 cluster updates, we can avoid synchronous > metadata updates (except for L1 updates). > As I've mentioned several times, if you preallocate, then you amortize that cost of keeping track of the physical image size. > We achieve synchronous metadata updates by leveraging the underlying > filesystem's metadata. The underlying filesystems are much smarter > about their metadata updates. They'll keep a journal to delay > synchronous updates and other fancy things. They only guarantee that the filesystem is consistent. A write() that extends a file may be reordered with the L2 write() that references the new cluster. Requiring fsck on unclean shutdown is very backwards for a 2010 format. > > If we tried to represent the disk size in the header, we would have to > do an fsync() on every cluster allocation. On every N cluster allocations. > > I can only imagine the use case for qcow2-over-lvm is performance. > But the performance of QED on a file system is so much better than > qcow2 that you can safely just use a file system and avoid the > complexity of qcow2 over lvm. > qcow2 over lvm is typically used on clusters.
On 09/10/2010 02:14 PM, Avi Kivity wrote: > >> >> qcow2 is not a properly designed image format. It was a weekend >> hacking session from Fabrice that he dropped in the code base and >> never really finished doing what he originally intended. The >> improvements that have been made to it are almost at the heroic level >> but we're only hurting our users by not moving on to something better. >> > > I don't like qcow2 either. But from a performance perspective, it can > be made equivalent to qed with some effort. It is worthwhile to > expend that effort rather than push the burden to users. btw, despite being not properly designed, qcow2 is able to support TRIM. qed isn't able to, except by leaking clusters on shutdown. TRIM support is required unless you're okay with the image growing until it is no longer sparse (the lack of TRIM support in guests make sparse image formats somewhat of a joke, but nobody seems to notice).
On Fri, Sep 10, 2010 at 12:22 PM, Avi Kivity <avi@redhat.com> wrote: >  On 09/09/2010 08:43 PM, Anthony Liguori wrote: >>> >>> Hm, we do have a use case for qcow2-over-lvm.  I can't say it's something >>> I like, but a point to consider. >> >> >> We specifically are not supporting that use-case in QED today.  There's a >> good reason for it.  For cluster allocation, we achieve good performance >> because for L2 cluster updates, we can avoid synchronous metadata updates >> (except for L1 updates). >> > > As I've mentioned several times, if you preallocate, then you amortize that > cost of keeping track of the physical image size. > >> We achieve synchronous metadata updates by leveraging the underlying >> filesystem's metadata.  The underlying filesystems are much smarter about >> their metadata updates.  They'll keep a journal to delay synchronous updates >> and other fancy things. > > They only guarantee that the filesystem is consistent.  A write() that > extends a file may be reordered with the L2 write() that references the new > cluster.  Requiring fsck on  unclean shutdown is very backwards for a 2010 > format. I'm interested in understanding how preallocation will work in a way that does not introduce extra flushes in the common case or require fsck. It seems to me that you can either preallocate and then rely on an fsck on startup to figure out which clusters are now really in use, or you can keep an exact max_cluster but this requires an extra write operation for each allocating write (and perhaps a flush?). Can you go into more detail in how preallocation should work? > >> >> If we tried to represent the disk size in the header, we would have to do >> an fsync() on every cluster allocation. > > On every N cluster allocations. > >> >> I can only imagine the use case for qcow2-over-lvm is performance.  
But >> the performance of QED on a file system is so much better than qcow2 that >> you can safely just use a file system and avoid the complexity of qcow2 over >> lvm. >> > > qcow2 over lvm is typically used on clusters. > > -- > I have a truly marvellous patch that fixes the bug which this > signature is too narrow to contain. > >
On Fri, Sep 10, 2010 at 12:25 PM, Avi Kivity <avi@redhat.com> wrote: > On 09/10/2010 02:14 PM, Avi Kivity wrote: >> >>> >>> qcow2 is not a properly designed image format. It was a weekend hacking >>> session from Fabrice that he dropped in the code base and never really >>> finished doing what he originally intended. The improvements that have been >>> made to it are almost at the heroic level but we're only hurting our users >>> by not moving on to something better. >>> >> >> I don't like qcow2 either. But from a performance perspective, it can be >> made equivalent to qed with some effort. It is worthwhile to expend that >> effort rather than push the burden to users. > > btw, despite being not properly designed, qcow2 is able to support TRIM. > qed isn't able to, except by leaking clusters on shutdown. TRIM support is > required unless you're okay with the image growing until it is no longer > sparse (the lack of TRIM support in guests make sparse image formats > somewhat of a joke, but nobody seems to notice). Anthony has started writing up notes on trim for qed: http://wiki.qemu.org/Features/QED/Trim I need to look at the actual ATA and SCSI specs for how this will work. The issue I am concerned with is sub-cluster trim operations. If the trim region is less than a cluster, then both qed and qcow2 don't really have a way to handle it. Perhaps we could punch a hole in the file, given a userspace interface to do this, but that isn't ideal because we're losing sparseness again. Stefan
On 09/10/2010 02:29 PM, Stefan Hajnoczi wrote: > >> They only guarantee that the filesystem is consistent. A write() that >> extends a file may be reordered with the L2 write() that references the new >> cluster. Requiring fsck on unclean shutdown is very backwards for a 2010 >> format. > I'm interested in understanding how preallocation will work in a way > that does not introduce extra flushes in the common case or require > fsck. > > It seems to me that you can either preallocate and then rely on an > fsck on startup to figure out which clusters are now really in use, or > you can keep an exact max_cluster but this requires an extra write > operation for each allocating write (and perhaps a flush?). > > Can you go into more detail in how preallocation should work? You simply leak the preallocated clusters. That's not as bad as it sounds - if you never write() the clusters they don't occupy any space on disk, so you only leak address space, not actual storage. If you copy the image then you actually do lost storage. If you really wanted to recover the lost storage you could start a thread in the background that looks for unallocated blocks. Unlike fsck, you don't have to wait for it since data integrity does not depend on it. I don't think it's worthwhile, though. Other games you can play with preallocation is varying the preallocation window with workload: start with no preallocation, as the guest starts to allocate you increase the window. When the guest starts to idle again you can return the storage to the operating system and reduce the window back to zero.
On 09/10/2010 02:33 PM, Stefan Hajnoczi wrote: > >> btw, despite being not properly designed, qcow2 is able to support TRIM. >> qed isn't able to, except by leaking clusters on shutdown. TRIM support is >> required unless you're okay with the image growing until it is no longer >> sparse (the lack of TRIM support in guests make sparse image formats >> somewhat of a joke, but nobody seems to notice). > Anthony has started writing up notes on trim for qed: > http://wiki.qemu.org/Features/QED/Trim > Looks like it depends on fsck, which is not a good idea for large images. > I need to look at the actual ATA and SCSI specs for how this will > work. The issue I am concerned with is sub-cluster trim operations. > If the trim region is less than a cluster, then both qed and qcow2 > don't really have a way to handle it. Perhaps we could punch a hole > in the file, given a userspace interface to do this, but that isn't > ideal because we're losing sparseness again. To deal with a sub-cluster TRIM, look at the surrounding sectors. If they're zero, free the cluster. If not, write zeros or use sys_punch() to the range specified by TRIM.
On Fri, Sep 10, 2010 at 12:14 PM, Avi Kivity <avi@redhat.com> wrote: > On 09/09/2010 03:49 PM, Anthony Liguori wrote: >> >> On 09/09/2010 01:45 AM, Avi Kivity wrote: >>> >>> Loading very large L2 tables on demand will result in very long >>> latencies. Increasing cluster size will result in very long first write >>> latencies. Adding an extra level results in an extra random write every >>> 4TB. >> >> It would be trivially easy to add another level of tables as a feature bit >> so let's delay the decision. > > It means that you'll need to upgrade qemu to read certain images, but okay. > >>>> >>>> qed is very careful about ensuring that we don't need to do syncs and we >>>> don't get corruption because of data loss. I don't necessarily buy your >>>> checksumming argument. >>> >>> The requirement for checksumming comes from a different place. For >>> decades we've enjoyed very low undetected bit error rates. However the >>> actual amount of data is increasing to the point that it makes an >>> undetectable bit error likely, just by throwing a huge amount of bits at >>> storage. Write ordering doesn't address this issue. >> >> I don't think we should optimize an image format for cheap disks and an >> old file system. >> >> We should optimize for the future. That means a btrfs file system > > I wouldn't use an image format at all with btrfs. > >> and/or enterprise storage. > > That doesn't eliminate undiscovered errors (they can still come from the > transport). Eliminating silent data corruption is currently not a goal for any disk image format I know of. For filesystems, I know that ZFS and btrfs will try to detect corruption using data checksumming. The guest filesystem, the disk image format, or the host filesystem could do checksumming. The hypervisor should keep out of the way in the interest of performance and emulation fidelity. Why does checksumming need to be done in the image format? 
Isn't the choice between host and guest filesystem checksumming already enough? >> >> The point of an image format is not to recreate btrfs in software. It's >> to provide a mechanism to allow users to move images around reasonable but >> once an image is present on a reasonable filesystem, we should more or less >> get the heck out of the way. > > You can achieve exactly the same thing with qcow2. Yes, it's more work, but > it's also less disruptive to users. > >>> >>>> By creating two code paths within qcow2. >>> >>> You're creating two code paths for users. >> >> No, I'm creating a single path: QED. >> >> There are already two code paths: raw and qcow2. qcow2 has had such a bad >> history that for a lot of users, it's not even a choice. > > qcow2 exists, people use it, and by the time qed is offered on distros (even > more on enterprise distros), there will be a lot more qcow2 images. Not > everyone runs qemu.git HEAD. > > What will you tell those people? Upgrade your image? They may still want > to share it with older installations. What if they use features not present > in qed? Bad luck? > > qcow2 is going to live forever no matter what we do. It should be possible to do (live) upgrades for supported images. > >> >> Today, users have to choose between performance and reliability or >> features. QED offers an opportunity to be able to tell users to just always >> use QED as an image format and forget about raw/qcow2/everything else. > > raw will always be needed for direct volume access and shared storage. > qcow2 will always be needed for old images. > >> >> You can say, let's just make qcow2 better, but we've been trying that for >> years and we have an existence proof that we can do it in a straight forward >> fashion with QED. > > When you don't use the extra qcow2 features, it has the same performance > characteristics as qed. You need to batch allocation and freeing, but > that's fairly straightforward. 
> > Yes, qcow2 has a long and tortured history and qed is perfect. Starting > from scratch is always easier and more fun. Except for the users. > >> A new format doesn't introduce much additional complexity. We provide >> image conversion tool and we can almost certainly provide an in-place >> conversion tool that makes the process very fast. > > It introduces a lot of complexity for the users who aren't qed experts. > They need to make a decision. What's the impact of the change? Are the > features that we lose important to us? Do we know what they are? Is there > any risk? Can we make the change online or do we have to schedule downtime? > Do all our hosts support qed? > > Improving qcow2 will be very complicated for Kevin who already looks older > beyond his years [1] but very simple for users. > >>> >>> It requires users to make a decision. By the time qed is ready for mass >>> deployment, 1-2 years will have passed. How many qcow2 images will be in >>> the wild then? How much scheduled downtime will be needed? >> >> Zero if we're smart. You can do QED stream + live migration to do a live >> conversion from raw to QED. >> > > Not all installations use live migration (say, desktop users). > >>> How much user confusion will be caused? >> >> User confusion is reduced if we can make strong, clear statements: all >> users should use QED even if they care about performance. Today, there's >> mass confusion because of the poor state of qcow2. > > If we improve qcow2 and make the same strong, clear statement we'll have the > same results. > >> >>> Virtualization is about compatibility. In-guest compatibility first, but >>> keeping the external environment stable is also important. We really need >>> to exhaust the possibilities with qcow2 before giving up on it. >> >> IMHO, we're long past exhausting the possibilities with qcow2. We still >> haven't decided what we're going to do for 0.13.0. > > Sorry, I disagree 100%. 
How can you say that, when no one has yet tried, > for example, batching allocations and frees? Or properly threaded it? > > What we've done is make qcow2 safe and more parallel than it was. But > "exhaust all possibilities"? not even close. > > >> Are we going to ship qcow2 with awful performance (a 15 minute operation >> taking hours) or with compromised data integrity? > > We're going to fix it. > >> >> It's been this way for every release since qcow2 existed. Let's not let >> sunk cost cloud our judgement here. > > Yes, new and shiny is always better. > >> >> qcow2 is not a properly designed image format. It was a weekend hacking >> session from Fabrice that he dropped in the code base and never really >> finished doing what he originally intended. The improvements that have been >> made to it are almost at the heroic level but we're only hurting our users >> by not moving on to something better. >> > > I don't like qcow2 either. But from a performance perspective, it can be > made equivalent to qed with some effort. It is worthwhile to expend that > effort rather than push the burden to users. > >> Regards, >> >> Anthony Liguori >> >> > > [1] okay, maybe not. > > -- > I have a truly marvellous patch that fixes the bug which this > signature is too narrow to contain. > > >
On 09/10/2010 02:43 PM, Stefan Hajnoczi wrote: >> >>> and/or enterprise storage. >> That doesn't eliminate undiscovered errors (they can still come from the >> transport). > Eliminating silent data corruption is currently not a goal for any > disk image format I know of. For filesystems, I know that ZFS and > btrfs will try to detect corruption using data checksumming. > > The guest filesystem, the disk image format, or the host filesystem > could do checksumming. The hypervisor should keep out of the way in > the interest of performance and emulation fidelity. Why does > checksumming need to be done in the image format? Isn't the choice > between host and guest filesystem checksumming already enough? You're correct about the data. It's better to do it at the end-point in any case. The metadata is something else - an error in a cluster table is magnified so it is likely to cause the loss of an entire image, and there's nothing the guest can do about it. btrfs duplicates metadata to avoid this (but if we have btrfs underneath, we can just use raw). >> qcow2 exists, people use it, and by the time qed is offered on distros (even >> more on enterprise distros), there will be a lot more qcow2 images. Not >> everyone runs qemu.git HEAD. >> >> What will you tell those people? Upgrade your image? They may still want >> to share it with older installations. What if they use features not present >> in qed? Bad luck? >> >> qcow2 is going to live forever no matter what we do. > It should be possible to do (live) upgrades for supported images. > That only solves part of the problem. Please TRIM below the last line of your message.
Am 10.09.2010 13:43, schrieb Stefan Hajnoczi: >>>>> By creating two code paths within qcow2. >>>> >>>> You're creating two code paths for users. >>> >>> No, I'm creating a single path: QED. >>> >>> There are already two code paths: raw and qcow2. qcow2 has had such a bad >>> history that for a lot of users, it's not even a choice. >> >> qcow2 exists, people use it, and by the time qed is offered on distros (even >> more on enterprise distros), there will be a lot more qcow2 images. Not >> everyone runs qemu.git HEAD. >> >> What will you tell those people? Upgrade your image? They may still want >> to share it with older installations. What if they use features not present >> in qed? Bad luck? >> >> qcow2 is going to live forever no matter what we do. > > It should be possible to do (live) upgrades for supported images. That still leaves those qcow2 images that use features not supported by qed. Just a few features missing in qed are internal snapshots, qcow2 on block devices, compression, encryption. So qed can't be a complete replacement for qcow2 (and that was the whole point of doing qed). If anything, it can exist besides qcow2. Kevin
On Fri, Sep 10, 2010 at 1:12 PM, Kevin Wolf <kwolf@redhat.com> wrote: > Am 10.09.2010 13:43, schrieb Stefan Hajnoczi: >>>>>> By creating two code paths within qcow2. >>>>> >>>>> You're creating two code paths for users. >>>> >>>> No, I'm creating a single path: QED. >>>> >>>> There are already two code paths: raw and qcow2. qcow2 has had such a bad >>>> history that for a lot of users, it's not even a choice. >>> >>> qcow2 exists, people use it, and by the time qed is offered on distros (even >>> more on enterprise distros), there will be a lot more qcow2 images. Not >>> everyone runs qemu.git HEAD. >>> >>> What will you tell those people? Upgrade your image? They may still want >>> to share it with older installations. What if they use features not present >>> in qed? Bad luck? >>> >>> qcow2 is going to live forever no matter what we do. >> >> It should be possible to do (live) upgrades for supported images. > > That still leaves those qcow2 images that use features not supported by > qed. Just a few features missing in qed are internal snapshots, qcow2 on > block devices, compression, encryption. So qed can't be a complete > replacement for qcow2 (and that was the whole point of doing qed). If > anything, it can exist besides qcow2. qcow2 is a feature-driven format. It sacrifices some of the core qualities of an image format in exchange for advanced features. I like to use qcow2 myself for desktop virtualization. qed applies the 80/20 rule to disk image formats. Let's perfect the basics for most users at a fraction of the {development,performance} cost. Then, with a clean base that takes on board the lessons of existing formats it is much easier to innovate. Look at the image streaming, defragmentation, and trim ideas that are playing out right now. I think the reason we haven't seen them before is because the effort and the baggage of doing them is too great. 
Sure, we maintain existing formats but I don't see active development pushing virtualized storage happening. Do you think qcow2 is the right format for the future? The flagship disk image format for KVM? Stefan
On 09/10/2010 03:35 PM, Stefan Hajnoczi wrote: > >> That still leaves those qcow2 images that use features not supported by >> qed. Just a few features missing in qed are internal snapshots, qcow2 on >> block devices, compression, encryption. So qed can't be a complete >> replacement for qcow2 (and that was the whole point of doing qed). If >> anything, it can exist besides qcow2. > qcow2 is a feature-driven format. It sacrifices some of the core > qualities of an image format in exchange for advanced features. I > like to use qcow2 myself for desktop virtualization. > > qed applies the 80/20 rule to disk image formats. Let's perfect the > basics for most users at a fraction of the {development,performance} > cost. > > Then, with a clean base that takes on board the lessons of existing > formats it is much easier to innovate. Look at the image streaming, > defragmentation, and trim ideas that are playing out right now. I > think the reason we haven't seen them before is because the effort and > the baggage of doing them is too great. Sure, we maintain existing > formats but I don't see active development pushing virtualized storage > happening. The same could be said about much of qemu. It is an old code base that wasn't designed for virtualization. Yet we maintain it and develop it because compatibility is king. (as an aside, qcow2 is better positioned for TRIM support than qed is) > Do you think qcow2 is the right format for the future? The flagship > disk image format for KVM? If we were starting from scratch, no. But we aren't starting from scratch.
On Fri, Sep 10, 2010 at 1:47 PM, Avi Kivity <avi@redhat.com> wrote: > On 09/10/2010 03:35 PM, Stefan Hajnoczi wrote: >> >>> That still leaves those qcow2 images that use features not supported by >>> qed. Just a few features missing in qed are internal snapshots, qcow2 on >>> block devices, compression, encryption. So qed can't be a complete >>> replacement for qcow2 (and that was the whole point of doing qed). If >>> anything, it can exist besides qcow2. >> >> qcow2 is a feature-driven format. It sacrifices some of the core >> qualities of an image format in exchange for advanced features. I >> like to use qcow2 myself for desktop virtualization. >> >> qed applies the 80/20 rule to disk image formats. Let's perfect the >> basics for most users at a fraction of the {development,performance} >> cost. >> >> Then, with a clean base that takes on board the lessons of existing >> formats it is much easier to innovate. Look at the image streaming, >> defragmentation, and trim ideas that are playing out right now. I >> think the reason we haven't seen them before is because the effort and >> the baggage of doing them is too great. Sure, we maintain existing >> formats but I don't see active development pushing virtualized storage >> happening. > > The same could be said about much of qemu. It is an old code base that > wasn't designed for virtualization. Yet we maintain it and develop it > because compatibility is king. For compatibility? I figured the amount of effort to implement all the device emulation and BIOS was not deemed worth starting from scratch. Stefan
On 09/10/2010 06:14 AM, Avi Kivity wrote: >> >> The point of an image format is not to recreate btrfs in software. >> It's to provide a mechanism to allow users to move images around >> reasonable but once an image is present on a reasonable filesystem, >> we should more or less get the heck out of the way. > > You can achieve exactly the same thing with qcow2. Yes, it's more > work, but it's also less disruptive to users. This is turning dangerously close into a vbus vs. virtio discussion :-) Let me review the motivation for QED and why we've decided incremental improvements to qcow2 were not viable. 1) qcow2 has awful performance characteristics 2) qcow2 has historically had data integrity issues. It's unclear anyone is willing to say that they're 100% confident that there are still data integrity issues in the format. 3) The users I care most about are absolutely uncompromising about data integrity. There is no room for uncertainty or trade offs when you're building an enterprise product. 4) We have looked at trying to fix qcow2. It appears to be a monumental amount of work that starts with a rewrite where it's unclear if we can even keep supporting all of the special features. IOW, there is likely to be a need for users to experience some type of image conversion or optimization process. 5) A correct version of qcow2 has terrible performance. You need to do a bunch of fancy tricks to recover that performance. Every fancy trick needs to be carefully evaluated with respect to correctness. There's a large surface area for potential data corruptors. We're still collecting performance data, but here's an example of what we're talking about. FFSB Random Writes MB/s (Block Size=8KB) Native Raw QCow2 QED 1 Thread 30.2 24.4 22.7 23.4 8 Threads 145.1 119.9 10.6 112.9 16 Threads 177.1 139.0 10.1 120.9 The performance difference is an order of magnitude. 
qcow2 bounces all requests, needs to issue synchronous metadata updates, and only supports a single outstanding request at a time. With good performance and high confidence in integrity, it's a no brainer as far as I'm concerned. We have a format that it easy to rationalize as correct, performs damn close to raw. On the other hand, we have a format that no one is confident that is correct that is even harder to rationalize as correct, and is an order of magnitude off raw in performance. It's really a no brainer. The impact to users is minimal. Upgrading images to a new format is not a big deal. This isn't guest visible and we're not talking about deleting qcow2 and removing support for it. >> Today, users have to choose between performance and reliability or >> features. QED offers an opportunity to be able to tell users to just >> always use QED as an image format and forget about >> raw/qcow2/everything else. > > raw will always be needed for direct volume access and shared > storage. qcow2 will always be needed for old images. My point is that for the future, the majority of people no longer have to think about "do I need performance more than I need sparse images?". If they have some special use case, fine, but for most people we simplify their choices. >> You can say, let's just make qcow2 better, but we've been trying that >> for years and we have an existence proof that we can do it in a >> straight forward fashion with QED. > > When you don't use the extra qcow2 features, it has the same > performance characteristics as qed. If you're willing to leak blocks on a scale that is still unknown. It's not at all clear that making qcow2 have the same characteristics as qed is an easy problem. qed is specifically designed to avoid synchronous metadata updates. qcow2 cannot achieve that. You can *potentially* batch metadata updates by preallocating clusters, but what's the right amount to preallocate and is it really okay to leak blocks at that scale? 
It's a weak story either way. There's a burden of proof still required to establish that this would, indeed, address the performance concerns. > You need to batch allocation and freeing, but that's fairly > straightforward. > > Yes, qcow2 has a long and tortured history and qed is perfect. > Starting from scratch is always easier and more fun. Except for the > users. The fact that you're basing your argument on "think of the users" is strange because you're advocating not doing something that is going to be hugely beneficial for our users. You're really arguing that we should continue only offering a format with weak data integrity and even weaker performance. >> A new format doesn't introduce much additional complexity. We >> provide image conversion tool and we can almost certainly provide an >> in-place conversion tool that makes the process very fast. > > It introduces a lot of complexity for the users who aren't qed > experts. They need to make a decision. What's the impact of the > change? Are the features that we lose important to us? Do we know > what they are? Is there any risk? Can we make the change online or > do we have to schedule downtime? Do all our hosts support qed? It's very simple. Use qed, convert all existing images. Image conversion is a part of virtualization. We have tools to do it. If they want to stick with qcow2 and are happy with it, fine, no one is advocating removing it. We can solve all possible problems and have images that users can move back to arbitrarily old versions of qemu with all of the same advantages of the newer versions. It's not realistic. > Improving qcow2 will be very complicated for Kevin who already looks > older beyond his years [1] but very simple for users. I think we're all better off if we move past sunk costs and focus on solving other problems. I'd rather we all focus on improving performance and correctness even further than trying to make qcow2 be as good as what every other hypervisor had 5 years ago. 
qcow2 has been a failure. Let's live up to it and move on. Making statements at each release that qcow2 has issues but we'll fix it soon just makes us look like we don't know what we're doing. >> User confusion is reduced if we can make strong, clear statements: >> all users should use QED even if they care about performance. Today, >> there's mass confusion because of the poor state of qcow2. > > If we improve qcow2 and make the same strong, clear statement we'll > have the same results. To be honest, the brand is tarnished. Once something gains a reputation for having poor integrity, it's very hard to overcome that. Even if you have Kevin spend the next 6 months rewriting qcow2 from scratch, I'm going to have a hard time convincing customers trust it. All someone has to do is look at change logs to see that it has a bad history. That's more than enough to make people very nervous. >>> Virtualization is about compatibility. In-guest compatibility >>> first, but keeping the external environment stable is also >>> important. We really need to exhaust the possibilities with qcow2 >>> before giving up on it. >> >> IMHO, we're long past exhausting the possibilities with qcow2. We >> still haven't decided what we're going to do for 0.13.0. > > Sorry, I disagree 100%. How can you say that, when no one has yet > tried, for example, batching allocations and frees? Or properly > threaded it? We've spent years trying to address problems in qcow2. And Stefan specifically has spent a good amount of time trying to fix qcow2. I know you've spent time trying to thread it too. I don't think you really grasp how difficult of a problem it is to fix qcow2. It's not just that the code is bad, the format makes something that should be simple more complicated than it needs to be. >> qcow2 is not a properly designed image format. It was a weekend >> hacking session from Fabrice that he dropped in the code base and >> never really finished doing what he originally intended. 
The >> improvements that have been made to it are almost at the heroic level >> but we're only hurting our users by not moving on to something better. >> > > > I don't like qcow2 either. But from a performance perspective, it can > be made equivalent to qed with some effort. It is worthwhile to > expend that effort rather than push the burden to users. The choices we have 1) provide our users a format that has high performance and good data integrity 2) continue to only offer a format that has poor performance and bad data integrity and promise that we'll eventually fix it. We've been doing (2) for too long now. We need to offer a solution to users today. It's not fair to our users to not offer them a good solution just because we don't want to admit to previous mistakes. If someone can fix qcow2 and make it competitive, by all means, please do. Regards, Anthony Liguori >> Regards, >> >> Anthony Liguori >> >> > > [1] okay, maybe not. >
On 09/10/2010 06:25 AM, Avi Kivity wrote: > On 09/10/2010 02:14 PM, Avi Kivity wrote: >> >>> >>> qcow2 is not a properly designed image format. It was a weekend >>> hacking session from Fabrice that he dropped in the code base and >>> never really finished doing what he originally intended. The >>> improvements that have been made to it are almost at the heroic >>> level but we're only hurting our users by not moving on to something >>> better. >>> >> >> I don't like qcow2 either. But from a performance perspective, it >> can be made equivalent to qed with some effort. It is worthwhile to >> expend that effort rather than push the burden to users. > > btw, despite being not properly designed, qcow2 is able to support > TRIM. qed isn't able to, except by leaking clusters on shutdown. > TRIM support is required unless you're okay with the image growing > until it is no longer sparse (the lack of TRIM support in guests make > sparse image formats somewhat of a joke, but nobody seems to notice). It's actually pretty easy in QED and it should perform very well. http://wiki.qemu.org/Features/QED/Trim Regards, Anthony Liguori
On 09/10/2010 04:10 PM, Stefan Hajnoczi wrote: > On Fri, Sep 10, 2010 at 1:47 PM, Avi Kivity<avi@redhat.com> wrote: >> On 09/10/2010 03:35 PM, Stefan Hajnoczi wrote: >>>> That still leaves those qcow2 images that use features not supported by >>>> qed. Just a few features missing in qed are internal snapshots, qcow2 on >>>> block devices, compression, encryption. So qed can't be a complete >>>> replacement for qcow2 (and that was the whole point of doing qed). If >>>> anything, it can exist besides qcow2. >>> qcow2 is a feature-driven format. It sacrifices some of the core >>> qualities of an image format in exchange for advanced features. I >>> like to use qcow2 myself for desktop virtualization. >>> >>> qed applies the 80/20 rule to disk image formats. Let's perfect the >>> basics for most users at a fraction of the {development,performance} >>> cost. >>> >>> Then, with a clean base that takes on board the lessons of existing >>> formats it is much easier to innovate. Look at the image streaming, >>> defragmentation, and trim ideas that are playing out right now. I >>> think the reason we haven't seen them before is because the effort and >>> the baggage of doing them is too great. Sure, we maintain existing >>> formats but I don't see active development pushing virtualized storage >>> happening. >> The same could be said about much of qemu. It is an old code base that >> wasn't designed for virtualization. Yet we maintain it and develop it >> because compatibility is king. > For compatibility? I figured the amount of effort to implement all > the device emulation and BIOS was not deemed worth starting from > scratch. You're right. Even if someone did suggest to implement it because it sucks, we'd cry foul because of the risk to compatibility. My chief complaint against vbus was compatibility, and while qed isn't in exactly the same position (we're a lot more flexible on the host than on the guest), it does put a burden on users. 
I don't see how qed has any inherent performance advantage, it is essentially the same as qcow2 minus refcounting, which is easily batched. It's a lot easier to work with, both because it's a new code base and because it's simpler, but both of these will erode in time.
On 09/10/2010 06:43 AM, Avi Kivity wrote: > On 09/10/2010 02:33 PM, Stefan Hajnoczi wrote: >> >>> btw, despite being not properly designed, qcow2 is able to support >>> TRIM. >>> qed isn't able to, except by leaking clusters on shutdown. TRIM >>> support is >>> required unless you're okay with the image growing until it is no >>> longer >>> sparse (the lack of TRIM support in guests make sparse image formats >>> somewhat of a joke, but nobody seems to notice). >> Anthony has started writing up notes on trim for qed: >> http://wiki.qemu.org/Features/QED/Trim >> > > Looks like it depends on fsck, which is not a good idea for large images. fsck will always be fast on qed because the metadata is small. For a 1PB image, there's 128MB worth of L2s if it's fully allocated (keeping in mind, that once you're fully allocated, you'll never fsck again). If you've got 1PB worth of storage, I'm fairly sure you're going to be able to do 128MB of reads in a short period of time. Even if it's a few seconds, it only occurs on power failure so it's pretty reasonable. >> I need to look at the actual ATA and SCSI specs for how this will >> work. The issue I am concerned with is sub-cluster trim operations. >> If the trim region is less than a cluster, then both qed and qcow2 >> don't really have a way to handle it. Perhaps we could punch a hole >> in the file, given a userspace interface to do this, but that isn't >> ideal because we're losing sparseness again. > > To deal with a sub-cluster TRIM, look at the surrounding sectors. If > they're zero, free the cluster. If not, write zeros or use > sys_punch() to the range specified by TRIM. Better yet, if you can't trim a full cluster, just write out zeros and have a separate background process that punches out zero clusters. That approach is a bit more generic and will help compact images independently of guest trims. Regards, Anthony Liguori
On 09/10/2010 07:06 AM, Avi Kivity wrote: > On 09/10/2010 02:43 PM, Stefan Hajnoczi wrote: >>> >>>> and/or enterprise storage. >>> That doesn't eliminate undiscovered errors (they can still come from >>> the >>> transport). >> Eliminating silent data corruption is currently not a goal for any >> disk image format I know of. For filesystems, I know that ZFS and >> btrfs will try to detect corruption using data checksumming. >> >> The guest filesystem, the disk image format, or the host filesystem >> could do checksumming. The hypervisor should keep out of the way in >> the interest of performance and emulation fidelity. Why does >> checksumming need to be done in the image format? Isn't the choice >> between host and guest filesystem checksumming already enough? > > You're correct about the data. It's better to do it at the end-point > in any case. > > The metadata is something else - an error in a cluster table is > magnified so it is likely to cause the loss of an entire image, and > there's nothing the guest can do about it. btrfs duplicates metadata > to avoid this (but if we have btrfs underneath, we can just use raw). What it really comes down to is that checksumming is a filesystem feature that requires a sophisticated way of handling metadata which puts it beyond the scope of what an image format should be. The point of an image format is to make it a filesystem from 10 years ago in terms of sophistication and leave the cutting edge file system research to file system developers. Regards, Anthony Liguori
On 09/10/2010 07:47 AM, Avi Kivity wrote: >> Then, with a clean base that takes on board the lessons of existing >> formats it is much easier to innovate. Look at the image streaming, >> defragmentation, and trim ideas that are playing out right now. I >> think the reason we haven't seen them before is because the effort and >> the baggage of doing them is too great. Sure, we maintain existing >> formats but I don't see active development pushing virtualized storage >> happening. > > > The same could be said about much of qemu. It is an old code base > that wasn't designed for virtualization. Yet we maintain it and > develop it because compatibility is king. > > (as an aside, qcow2 is better positioned for TRIM support than qed is) You're hand waving to a dangerous degree here :-) TRIM in qcow2 would require the following sequence: 1) remove cluster from L2 table 2) sync() 3) reduce cluster reference count 4) sync() TRIM needs to be fast so this is not going to be acceptable. How do you solve it? For QED, TRIM requires: 1) remove cluster from L2 table 2) sync() In both cases, I'm assuming we lazily write the free list and have a way to detect unclean mounts. Unclean mounts require an fsck() and both qcow2 and qed require it. You can drop the last sync() in both QEDand qcow2 by delaying the sync() until you reallocate the cluster. If you sync() for some other reason before then, you can avoid it completely. I don't think you can remove (2) from qcow2 TRIM. This is the key feature of qed. Because there's only one piece of metadata, you never have to worry about metadata ordering. You can amortize the cost of metadata ordering in qcow2 by batching certain operations but not all operations are easily batched. Maybe you could batch trim operations and attempt to do them all at once. But then you need to track future write requests in order to make sure you don't trim over a new write. 
When it comes to data integrity, increased complexity == increased chance of screwing up. Regards, Anthony Liguori
On 09/10/2010 04:14 PM, Anthony Liguori wrote: > On 09/10/2010 06:14 AM, Avi Kivity wrote: >>> >>> The point of an image format is not to recreate btrfs in software. >>> It's to provide a mechanism to allow users to move images around >>> reasonable but once an image is present on a reasonable filesystem, >>> we should more or less get the heck out of the way. >> >> You can achieve exactly the same thing with qcow2. Yes, it's more >> work, but it's also less disruptive to users. > > This is turning dangerously close into a vbus vs. virtio discussion :-) > > Let me review the motivation for QED and why we've decided incremental > improvements to qcow2 were not viable. > > 1) qcow2 has awful performance characteristics The current qcow2 implementation, yes. The qcow2 format, no. > 2) qcow2 has historically had data integrity issues. It's unclear > anyone is willing to say that they're 100% confident that there are > still data integrity issues in the format. Fast forward a few years, no one will be 100% confident there are no data integrity issues in qed. > 3) The users I care most about are absolutely uncompromising about > data integrity. There is no room for uncertainty or trade offs when > you're building an enterprise product. 100% in agreement here. > 4) We have looked at trying to fix qcow2. It appears to be a > monumental amount of work that starts with a rewrite where it's > unclear if we can even keep supporting all of the special features. > IOW, there is likely to be a need for users to experience some type of > image conversion or optimization process. I don't see why. > > 5) A correct version of qcow2 has terrible performance. Not inherently. > You need to do a bunch of fancy tricks to recover that performance. > Every fancy trick needs to be carefully evaluated with respect to > correctness. There's a large surface area for potential data corruptors. s/large/larger/. 
The only real difference is the refcount table, which I agree sucks, but happens to be nice for TRIM support. > > We're still collecting performance data, but here's an example of what > we're talking about. > > FFSB Random Writes MB/s (Block Size=8KB) > > Native Raw QCow2 QED > 1 Thread 30.2 24.4 22.7 23.4 > 8 Threads 145.1 119.9 10.6 112.9 > 16 Threads 177.1 139.0 10.1 120.9 > > The performance difference is an order of magnitude. qcow2 bounces > all requests, needs to issue synchronous metadata updates, and only > supports a single outstanding request at a time. Those are properties of the implementation, not the format. The format makes it harder to get it right but doesn't give us a free pass not to do it. > > With good performance and high confidence in integrity, it's a no > brainer as far as I'm concerned. We have a format that it easy to > rationalize as correct, performs damn close to raw. On the other > hand, we have a format that no one is confident that is correct that > is even harder to rationalize as correct, and is an order of magnitude > off raw in performance. > > It's really a no brainer. Sure, because you don't care about users. All of the complexity of changing image formats (and deciding whether to do that or not) is hidden away. > > The impact to users is minimal. Upgrading images to a new format is > not a big deal. This isn't guest visible and we're not talking about > deleting qcow2 and removing support for it. It's a big deal to them. Users are not experts in qemu image formats. They will have to learn how to do it, whether they can do it (need to upgrade all your qemus before you can do it, need to make sure you're not using qcow2 features, need to be sure you're not planning to use qcow2 features). Sure, we'll support qcow2, but will we give it the same attention? >>> Today, users have to choose between performance and reliability or >>> features. 
QED offers an opportunity to be able to tell users to >>> just always use QED as an image format and forget about >>> raw/qcow2/everything else. >> >> raw will always be needed for direct volume access and shared >> storage. qcow2 will always be needed for old images. > > My point is that for the future, the majority of people no longer have > to think about "do I need performance more than I need sparse images?". That can be satisfied with qcow2 + preallocation. > > If they have some special use case, fine, but for most people we > simplify their choices. > >>> You can say, let's just make qcow2 better, but we've been trying >>> that for years and we have an existence proof that we can do it in a >>> straight forward fashion with QED. >> >> When you don't use the extra qcow2 features, it has the same >> performance characteristics as qed. > > If you're willing to leak blocks on a scale that is still unknown. Who cares, those aren't real storage blocks. > It's not at all clear that making qcow2 have the same characteristics > as qed is an easy problem. qed is specifically designed to avoid > synchronous metadata updates. qcow2 cannot achieve that. qcow2 and qed are equivalent if you disregard the refcount table (which we address by preallocation). Exactly the same technique you use for sync-free metadata updates in qed can be used for qcow2. > You can *potentially* batch metadata updates by preallocating > clusters, but what's the right amount to preallocate You look at your write rate and adjust it dynamically so you never wait. > and is it really okay to leak blocks at that scale? Again, those aren't real blocks. And we're talking power loss anyway. It's certainly better than requiring fsck for correctness. > It's a weak story either way. There's a burden of proof still > required to establish that this would, indeed, address the performance > concerns. I don't see why you doubt it so much. 
Amortization is a well-known technique for reducing the cost of expensive operations. > >> You need to batch allocation and freeing, but that's fairly >> straightforward. >> >> Yes, qcow2 has a long and tortured history and qed is perfect. >> Starting from scratch is always easier and more fun. Except for the >> users. > > The fact that you're basing your argument on "think of the users" is > strange because you're advocating not doing something that is going to > be hugely beneficial for our users. You misunderstand me. I'm not advocating dropping qed and stopping qcow2 development. I'm advocating dropping qed and working on qcow2 to provide the benefits that qed brings. > > You're really arguing that we should continue only offering a format > with weak data integrity and even weaker performance. Those are not properties of the format, only of the implementation. > >>> A new format doesn't introduce much additional complexity. We >>> provide image conversion tool and we can almost certainly provide an >>> in-place conversion tool that makes the process very fast. >> >> It introduces a lot of complexity for the users who aren't qed >> experts. They need to make a decision. What's the impact of the >> change? Are the features that we lose important to us? Do we know >> what they are? Is there any risk? Can we make the change online or >> do we have to schedule downtime? Do all our hosts support qed? > > It's very simple. Use qed, convert all existing images. Image > conversion is a part of virtualization. We have tools to do it. If > they want to stick with qcow2 and are happy with it, fine, no one is > advocating removing it. This simple formula doesn't work if some of your hosts don't support qed yet. And it's still complicated for users because they have to understand all of that. "trust me, use qed" is not going to work. Image conversion is a part of virtualization, yes. A sucky part, we should try to avoid it. 
> > We can solve all possible problems and have images that users can move > back to arbitrarily old versions of qemu with all of the same > advantages of the newer versions. It's not realistic. True, but we can do better than replace the image format. > >> Improving qcow2 will be very complicated for Kevin who already looks >> older beyond his years [1] but very simple for users. > > I think we're all better off if we move past sunk costs and focus on > solving other problems. I'd rather we all focus on improving > performance and correctness even further than trying to make qcow2 be > as good as what every other hypervisor had 5 years ago. > > qcow2 has been a failure. Let's live up to it and move on. Making > statements at each release that qcow2 has issues but we'll fix it soon > just makes us look like we don't know what we're doing. > Switching file formats is a similar statement. >>> User confusion is reduced if we can make strong, clear statements: >>> all users should use QED even if they care about performance. >>> Today, there's mass confusion because of the poor state of qcow2. >> >> If we improve qcow2 and make the same strong, clear statement we'll >> have the same results. > > To be honest, the brand is tarnished. Once something gains a > reputation for having poor integrity, it's very hard to overcome that. > > Even if you have Kevin spend the next 6 months rewriting qcow2 from > scratch, I'm going to have a hard time convincing customers to trust it. > > All someone has to do is look at change logs to see that it has a bad > history. That's more than enough to make people very nervous. People will be nervous of something completely new (though I agree the simplicity is a very strong point of qed). >>> IMHO, we're long past exhausting the possibilities with qcow2. We >>> still haven't decided what we're going to do for 0.13.0. >> >> Sorry, I disagree 100%. How can you say that, when no one has yet >> tried, for example, batching allocations and frees? 
Or properly >> threaded it? > > We've spent years trying to address problems in qcow2. And Stefan > specifically has spent a good amount of time trying to fix qcow2. I > know you've spent time trying to thread it too. I don't think you > really grasp how difficult of a problem it is to fix qcow2. It's not > just that the code is bad, the format makes something that should be > simple more complicated than it needs to be. IMO, the real problem is the state machine implementation. Threading it would make it much simpler. I wish I had the time to go back to do that. What is specifically so bad about qcow2? The refcount table? It happens to be necessary for TRIM. Copy-on-write? It's needed for external snapshots. > >>> qcow2 is not a properly designed image format. It was a weekend >>> hacking session from Fabrice that he dropped in the code base and >>> never really finished doing what he originally intended. The >>> improvements that have been made to it are almost at the heroic >>> level but we're only hurting our users by not moving on to something >>> better. >>> >> >> >> I don't like qcow2 either. But from a performance perspective, it >> can be made equivalent to qed with some effort. It is worthwhile to >> expend that effort rather than push the burden to users. > > The choices we have 1) provide our users a format that has high > performance and good data integrity 2) continue to only offer a format > that has poor performance and bad data integrity and promise that > we'll eventually fix it. > > We've been doing (2) for too long now. We need to offer a solution to > users today. It's not fair to our users to not offer them a good > solution just because we don't want to admit to previous mistakes. > > If someone can fix qcow2 and make it competitive, by all means, please > do. We can have them side by side and choose later based on performance. Though I fear if qed is merged qcow2 will see no further work.
On Fri, Sep 10, 2010 at 12:33:09PM +0100, Stefan Hajnoczi wrote: > > btw, despite being not properly designed, qcow2 is able to support TRIM. > > qed isn't able to, except by leaking clusters on shutdown. TRIM support is > > required unless you're okay with the image growing until it is no longer > > sparse (the lack of TRIM support in guests make sparse image formats > > somewhat of a joke, but nobody seems to notice). > > Anthony has started writing up notes on trim for qed: > http://wiki.qemu.org/Features/QED/Trim > > I need to look at the actual ATA and SCSI specs for how this will > work. The issue I am concerned with is sub-cluster trim operations. > If the trim region is less than a cluster, then both qed and qcow2 > don't really have a way to handle it. Perhaps we could punch a hole > in the file, given a userspace interface to do this, but that isn't > ideal because we're losing sparseness again. ATA TRIM doesn't have a granularity, it's always sector sized. SCSI WRITE SAME with the unmap bit or UNMAP as well as my virtio_blk support for discarding blocks export topology information about the required minimum discard request size. I export it from qemu the same way as we export other topology information and at least Linux hosts can use it. Note that ATA allows simply ignoring TRIM requests that we can't handle, and if we don't set the bit that guarantees TRIMed regions to be zeroed we don't even have to zero out the regions.
Am 10.09.2010 14:35, schrieb Stefan Hajnoczi: > On Fri, Sep 10, 2010 at 1:12 PM, Kevin Wolf <kwolf@redhat.com> wrote: >> Am 10.09.2010 13:43, schrieb Stefan Hajnoczi: >>>>>>> By creating two code paths within qcow2. >>>>>> >>>>>> You're creating two code paths for users. >>>>> >>>>> No, I'm creating a single path: QED. >>>>> >>>>> There are already two code paths: raw and qcow2. qcow2 has had such a bad >>>>> history that for a lot of users, it's not even a choice. >>>> >>>> qcow2 exists, people use it, and by the time qed is offered on distros (even >>>> more on enterprise distros), there will be a lot more qcow2 images. Not >>>> everyone runs qemu.git HEAD. >>>> >>>> What will you tell those people? Upgrade your image? They may still want >>>> to share it with older installations. What if they use features not present >>>> in qed? Bad luck? >>>> >>>> qcow2 is going to live forever no matter what we do. >>> >>> It should be possible to do (live) upgrades for supported images. >> >> That still leaves those qcow2 images that use features not supported by >> qed. Just a few features missing in qed are internal snapshots, qcow2 on >> block devices, compression, encryption. So qed can't be a complete >> replacement for qcow2 (and that was the whole point of doing qed). If >> anything, it can exist besides qcow2. > > qcow2 is a feature-driven format. It sacrifices some of the core > qualities of an image format in exchange for advanced features. I > like to use qcow2 myself for desktop virtualization. > > qed applies the 80/20 rule to disk image formats. Let's perfect the > basics for most users at a fraction of the {development,performance} > cost. So let's translate this into an answer to the question we're discussing here: Yes, Avi is right, qcow2 is going to live forever. > Then, with a clean base that takes on board the lessons of existing > formats it is much easier to innovate. 
Look at the image streaming, > defragmentation, and trim ideas that are playing out right now. All of these are possible with qcow2 as well or even better than with qed. For example trim feels like a really hacky thing in qed whereas freeing a cluster is something just natural in qcow2. Kevin
On Fri, Sep 10, 2010 at 08:22:14AM -0500, Anthony Liguori wrote: > fsck will always be fast on qed because the metadata is small. For a > 1PB image, there's 128MB worth of L2s if it's fully allocated (keeping > in mind, that once you're fully allocated, you'll never fsck again). If > you've got 1PB worth of storage, I'm fairly sure you're going to be able > to do 128MB of reads in a short period of time. Even if it's a few > seconds, it only occurs on power failure so it's pretty reasonable. I don't think it is. Even if the metadata is small it can still be spread all over the disks and seek latencies might kill you. I think if we want to make qed future proof it needs to provide transactional integrity for metadata updates, just like a journaling filesystem. Given that small amount of metadata and less different kinds it will still be a lot simpler than a full filesystem of course.
On Fri, Sep 10, 2010 at 08:39:21AM -0500, Anthony Liguori wrote: > You're hand waving to a dangerous degree here :-) > > TRIM in qcow2 would require the following sequence: > > 1) remove cluster from L2 table > 2) sync() > 3) reduce cluster reference count > 4) sync() > > TRIM needs to be fast so this is not going to be acceptable. How do you > solve it? It's utterly slow in any real life SSD. > For QED, TRIM requires: > > 1) remove cluster from L2 table > 2) sync() > > In both cases, I'm assuming we lazily write the free list and have a way > to detect unclean mounts. Unclean mounts require an fsck() and both > qcow2 and qed require it. If you do proper transactional metadata updates you can completely drop the sync. TRIM / SCSI unmap are optimizations that can just be noops without compromising data integrity.
On 09/10/2010 04:39 PM, Anthony Liguori wrote: > On 09/10/2010 07:47 AM, Avi Kivity wrote: >>> Then, with a clean base that takes on board the lessons of existing >>> formats it is much easier to innovate. Look at the image streaming, >>> defragmentation, and trim ideas that are playing out right now. I >>> think the reason we haven't seen them before is because the effort and >>> the baggage of doing them is too great. Sure, we maintain existing >>> formats but I don't see active development pushing virtualized storage >>> happening. >> >> >> The same could be said about much of qemu. It is an old code base >> that wasn't designed for virtualization. Yet we maintain it and >> develop it because compatibility is king. >> >> (as an aside, qcow2 is better positioned for TRIM support than qed is) > > You're hand waving to a dangerous degree here :-) > > TRIM in qcow2 would require the following sequence: > > 1) remove cluster from L2 table > 2) sync() > 3) reduce cluster reference count > 4) sync() > > TRIM needs to be fast so this is not going to be acceptable. How do > you solve it? > Batching. Of course, you don't reuse the cluster until you've synced. Note the whole thing can happen in the background. You issue the sync, but the waiting isn't exposed to the guest. Freeing and allocation are both easy to batch since they're not guest visible operation. > For QED, TRIM requires: > > 1) remove cluster from L2 table > 2) sync() > > In both cases, I'm assuming we lazily write the free list and have a > way to detect unclean mounts. You don't have a free list in qed. > Unclean mounts require an fsck() and both qcow2 and qed require it. qcow2 does not require an fsck (and neither does qed if it properly preallocates). > You can drop the last sync() in both QEDand qcow2 by delaying the > sync() until you reallocate the cluster. If you sync() for some other > reason before then, you can avoid it completely. > > I don't think you can remove (2) from qcow2 TRIM. Why not? 
If the guest writes to the same logical sector, you reallocate that cluster and update L2. All you need to make sure is that the refcount table is not updated and synced until L2 has been synced (directly or as a side effect of a guest sync). > This is the key feature of qed. Because there's only one piece of > metadata, you never have to worry about metadata ordering. You can > amortize the cost of metadata ordering in qcow2 by batching certain > operations but not all operations are easily batched. Unless you introduce a freelist, in which case you have exactly the same problems as qcow2 (perhaps with a better on-disk data structure). If you don't introduce a freelist, you have unbounded leakage on power failure. With a freelist you can always limit the amount of leakage. > > Maybe you could batch trim operations and attempt to do them all at > once. But then you need to track future write requests in order to > make sure you don't trim over a new write. Yes. > > When it comes to data integrity, increased complexity == increased > chance of screwing up. True.
On 09/10/2010 04:22 PM, Anthony Liguori wrote: >> Looks like it depends on fsck, which is not a good idea for large >> images. > > > fsck will always be fast on qed because the metadata is small. For a > 1PB image, there's 128MB worth of L2s if it's fully allocated It's 32,000 seeks. > (keeping in mind, that once you're fully allocated, you'll never fsck > again). Why? Fully populated L1 (so all L2s are allocated) doesn't mean a fully allocated image. You're still allocating and linking into L2s. > If you've got 1PB worth of storage, I'm fairly sure you're going to > be able to do 128MB of reads in a short period of time. Even if it's > a few seconds, it only occurs on power failure so it's pretty reasonable. Consider a cloud recovering from power loss, even if you're fscking thousands of 100GB images you'll create a horrible seek storm on your storage (to be followed by a seek storm from all the guests booting). No, fsck is not a good idea. > >>> I need to look at the actual ATA and SCSI specs for how this will >>> work. The issue I am concerned with is sub-cluster trim operations. >>> If the trim region is less than a cluster, then both qed and qcow2 >>> don't really have a way to handle it. Perhaps we could punch a hole >>> in the file, given a userspace interface to do this, but that isn't >>> ideal because we're losing sparseness again. >> >> To deal with a sub-cluster TRIM, look at the surrounding sectors. If >> they're zero, free the cluster. If not, write zeros or use >> sys_punch() to the range specified by TRIM. > > Better yet, if you can't trim a full cluster, just write out zeros and > have a separate background process that punches out zero clusters. > That can work as well, or a combination perhaps. > That approach is a bit more generic and will help compact images > independently of guest trims. You still need a freelist.
On 09/10/2010 04:47 PM, Christoph Hellwig wrote: > On Fri, Sep 10, 2010 at 12:33:09PM +0100, Stefan Hajnoczi wrote: >>> btw, despite being not properly designed, qcow2 is able to support TRIM. >>> ?qed isn't able to, except by leaking clusters on shutdown. ?TRIM support is >>> required unless you're okay with the image growing until it is no longer >>> sparse (the lack of TRIM support in guests make sparse image formats >>> somewhat of a joke, but nobody seems to notice). >> Anthony has started writing up notes on trim for qed: >> http://wiki.qemu.org/Features/QED/Trim >> >> I need to look at the actual ATA and SCSI specs for how this will >> work. The issue I am concerned with is sub-cluster trim operations. >> If the trim region is less than a cluster, then both qed and qcow2 >> don't really have a way to handle it. Perhaps we could punch a hole >> in the file, given a userspace interface to do this, but that isn't >> ideal because we're losing sparseness again. > ATA TRIM doesn't have a granularity, it's always sector sized. SCSI > WRITE SAME with the unmap bit or UNMAP as well as my virtio_blk support > for discarding blocks export topoligy information about the required > minimum discard request size. I export it from qemu the same way as > we export other topology information and at least Linux hosts can use > it. Ok, thanks for the correction. > Note that ATA allows simply ignoring TRIM requests that we can't handle, > and if we don't set the bit that guarantees TRIMed regions to be zeroed > we don't even have to zero out the regions. It would be nice to support it. TRIM is important to recover space, otherwise images grow and grow and there's no point in using a sparse format in the first place.
On 09/10/2010 04:16 PM, Anthony Liguori wrote: >> btw, despite being not properly designed, qcow2 is able to support >> TRIM. qed isn't able to, except by leaking clusters on shutdown. >> TRIM support is required unless you're okay with the image growing >> until it is no longer sparse (the lack of TRIM support in guests make >> sparse image formats somewhat of a joke, but nobody seems to notice). > > > It's actually pretty easy in QED and it should perform very well. > > http://wiki.qemu.org/Features/QED/Trim > If you don't add a free list, this is a pretty bad implementation. If you do, you're back to qcow2's problems.
On Fri, Sep 10, 2010 at 05:05:16PM +0300, Avi Kivity wrote: > >Note that ATA allows simply ignoring TRIM requests that we can't handle, > >and if we don't set the bit that guarantees TRIMed regions to be zeroed > >we don't even have to zero out the regions. > > It would be nice to support it. TRIM is important to recover space, > otherwise images grow and grow and there's no point in using a sparse > format in the first place. Sure. But supporting tiny TRIM requests doesn't make sense. That is the same behaviour we see from real life SSDs, btw. If the request is smaller than their erase block size or whatever internal structure they use to track allocations it will not actually free space. On some of the lesser quality consumer SSDs the sectors won't even be zeroed even if they claim so in the ATA IDENTIFY response.
On 09/10/2010 05:12 PM, Christoph Hellwig wrote: > On Fri, Sep 10, 2010 at 05:05:16PM +0300, Avi Kivity wrote: >>> Note that ATA allows simply ignoring TRIM requests that we can't handle, >>> and if we don't set the bit that guarantees TRIMed regions to be zeroed >>> we don't even have to zero out the regions. >> It would be nice to support it. TRIM is important to recover space, >> otherwise images grow and grow and there's no point in using a sparse >> format in the first place. > Sure. But supporting to tiny TRIM requests doesn't make sense. That > is the same behaviour we see from real life SSDs, btw. If the request > is smaller than their erase block size or whatever internal structure > they use to track allocations it will not actually free space. On some > of the lesser quality consumer SSDs the sectors won't even be zeroed > even if they claim so in the ATA IDENTIFY response. Okay. Let's concentrate on those UNMAP requests that seem better designed.
On 09/10/2010 08:47 AM, Avi Kivity wrote: > The current qcow2 implementation, yes. The qcow2 format, no. The qcow2 format has more writes because it maintains more meta data. More writes == worse performance. You claim that you can effectively batch those writes such that the worse performance will be in the noise. That claim needs to be proven though because it's purely conjecture right now. There is a trade off to batching too as you leak address space. If you have to preallocate 2GB worth of address space to get good performance, then I'm very sceptical that qcow2 achieves the goals of a sparse file format. If I do a qemu-img create -f qcow2 foo.img 10GB, and then do a naive copy of the image file and end up with a 2GB image when there's nothing in it, that's badness. And what do you do when you shutdown and start up? You're setting a reference count on blocks and keeping metadata in memory that those blocks are really free. Do you need an atexit hook to decrement the reference counts? Do you need to create a free list structure that gets written out on close? Just saying "we can do batching" is not solving the problem. If you want to claim that the formats are equivalent, then at the very least, you have to give a very exact description of how this would work because it's not entirely straight forward. >> 2) qcow2 has historically had data integrity issues. It's unclear >> anyone is willing to say that they're 100% confident that there are >> no data integrity issues in the format. > > Fast forward a few years, no one will be 100% confident there are no > data integrity issues in qed. I don't think you have any grounds to make such a statement. >> 3) The users I care most about are absolutely uncompromising about >> data integrity. There is no room for uncertainty or trade offs when >> you're building an enterprise product. > > 100% in agreement here. > >> 4) We have looked at trying to fix qcow2. 
It appears to be a >> monumental amount of work that starts with a rewrite where it's >> unclear if we can even keep supporting all of the special features. >> IOW, there is likely to be a need for users to experience some type >> of image conversion or optimization process. > > I don't see why. Because you're oversimplifying what it takes to make qcow2 perform well. >> >> 5) A correct version of qcow2 has terrible performance. > > Not inherently. A "naive" correct version of qcow2 does. Look at the above example. If you introduce a free list, you change the format which means that you couldn't support moving an image to an older version. So just for your batching example, the only compatible approach is to reduce the reference count on shutdown. But there's definitely a trade off because a few unclean shut downs could result in a huge image. >> You need to do a bunch of fancy tricks to recover that performance. >> Every fancy trick needs to be carefully evaluated with respect to >> correctness. There's a large surface area for potential data >> corruptors. > > s/large/larger/. The only real difference is the refcount table, > which I agree sucks, but happens to be nice for TRIM support. I don't see the advantage at all. >> >> We're still collecting performance data, but here's an example of >> what we're talking about. >> >> FFSB Random Writes MB/s (Block Size=8KB) >> >> Native Raw QCow2 QED >> 1 Thread 30.2 24.4 22.7 23.4 >> 8 Threads 145.1 119.9 10.6 112.9 >> 16 Threads 177.1 139.0 10.1 120.9 >> >> The performance difference is an order of magnitude. qcow2 bounces >> all requests, needs to issue synchronous metadata updates, and only >> supports a single outstanding request at a time. > > Those are properties of the implementation, not the format. The > format makes it harder to get it right but doesn't give us a free pass > not to do it. If the complexity doesn't buy us anything, then why pay the cost of it? Let's review the purported downsides of QED. 
1) It's a new image format. If users create QED images, they can't use them with older QEMU's. However, if we add a new feature to qcow2, we have the same problem. 2) If a user has an existing image qcow2 and wants to get the performance/correctness advantages of QED, they have to convert their images. That said, in place conversion can tremendously simplify this. 3) Another format adds choice, choice adds complexity. From my perspective, QED can reduce choice long term because we can tell users that unless they have a strong reason otherwise, use QED. We cannot do that with qcow2 today. That may be an implementation detail of qcow2, but it doesn't change the fact that there's complexity in choosing an image format today. >> >> With good performance and high confidence in integrity, it's a no >> brainer as far as I'm concerned. We have a format that it easy to >> rationalize as correct, performs damn close to raw. On the other >> hand, we have a format that no one is confident that is correct that >> is even harder to rationalize as correct, and is an order of >> magnitude off raw in performance. >> >> It's really a no brainer. > > Sure, because you don't care about users. All of the complexity of > changing image formats (and deciding whether to do that or not) is > hidden away. Let's not turn this into a "I care more about users than you do" argument. Changing image formats consists of running a single command. The command is pretty slow today but we can make it pretty darn fast. It seems like a relatively small price to pay for a relatively large gain. >> >> The impact to users is minimal. Upgrading images to a new format is >> not a big deal. This isn't guest visible and we're not talking about >> deleting qcow2 and removing support for it. > > It's a big deal to them. Users are not experts in qemu image > formats. 
They will have to learn how to do it, whether they can do it > (need to upgrade all your qemus before you can do it, need to make > sure you're not using qcow2 features, need to be sure you're not > planning to use qcow2 features). But we can't realistically support users that are using those extra features today anyway. It's those "features" that are the fundamental problem. > Sure, we'll support qcow2, but will we give it the same attention? We have a lot of block formats in QEMU today but only one block format that actually performs well and has good data integrity. We're not giving qcow2 the attention it would need today to promote it to a Useful Format so I'm not sure that it really matters. >> If you're willing to leak blocks on a scale that is still unknown. > > Who cares, those aren't real storage blocks. They are once you move the image from one place to another. If that doesn't concern you, it really should. >> It's not at all clear that making qcow2 have the same characteristics >> as qed is an easy problem. qed is specifically designed to avoid >> synchronous metadata updates. qcow2 cannot achieve that. > > qcow2 and qed are equivalent if you disregard the refcount table > (which we address by preallocation). Exactly the same technique you > use for sync-free metadata updates in qed can be used for qcow2. You cannot ignore the refcount table, that's the point of the discussion. >> You can *potentially* batch metadata updates by preallocating >> clusters, but what's the right amount to preallocate > > You look at your write rate and adjust it dynamically so you never wait. It's never that simple. How long do you look at the write rate? Do you lower the amount dynamically, if so, after how long? Predicting the future is never easy. >> and is it really okay to leak blocks at that scale? > > Again, those aren't real blocks. And we're talking power loss > anyway. It's certainly better than requiring fsck for correctness. They are once you copy the image. 
And power loss is the same thing as unexpected exit because you're not simply talking about delaying a sync, you're talking staging future I/O operations purely within QEMU. >> It's a weak story either way. There's a burden of proof still >> required to establish that this would, indeed, address the >> performance concerns. > > I don't see why you doubt it so much. Amortization is a well-known > technique for reducing the cost of expensive operations. Because there are always limits, otherwise, all expensive operations would be cheap, and that's not reality. > You misunderstand me. I'm not advocating dropping qed and stopping > qcow2 development. I'm advocating dropping qed and working on qcow2 > to provide the benefits that qed brings. If you think qcow2 is fixable, then either 1) fix qcow2 and prove me wrong 2) detail in great length how you would fix qcow2, and prove me wrong. Either way, the burden of proof is on establishing that qcow2 is fixable. So far, the proposed fixes are not specific and/or have unacceptable trade offs. Having a leaking image is not acceptable IMHO because it potentially becomes something that is guest exploitable. If a guest finds a SEGV that is not exploitable in any meaningful way except crashing QEMU, by leaking data in each crash, a guest can now grow an image's virtual size indefinitely. This does have real costs in disk space as the underlying file system does need to deal with metadata, but it's not unrealistic for management tools to copy images around for various reasons (maybe offline backup). A reasonable management tool might do planning based on maximum image size, but now the tools have to cope with (virtually) infinitely large images. >>>> A new format doesn't introduce much additional complexity. We >>>> provide image conversion tool and we can almost certainly provide >>>> an in-place conversion tool that makes the process very fast. >>> >>> It introduces a lot of complexity for the users who aren't qed >>> experts. 
They need to make a decision. What's the impact of the >>> change? Are the features that we lose important to us? Do we know >>> what they are? Is there any risk? Can we make the change online or >>> do we have to schedule downtime? Do all our hosts support qed? >> >> It's very simple. Use qed, convert all existing images. Image >> conversion is a part of virtualization. We have tools to do it. If >> they want to stick with qcow2 and are happy with it, fine, no one is >> advocating removing it. > > This simple formula doesn't work if some of your hosts don't support > qed yet. And it's still complicated for users because they have to > understand all of that. "trust me, use qed" is not going to work. Versus what? "Trust me, this time, we've finally fixed qcow2's data integrity issues" is going to work? That's an uphill battle no matter what. >> >>> Improving qcow2 will be very complicated for Kevin who already looks >>> older beyond his years [1] but very simple for users. >> >> I think we're all better off if we move past sunk costs and focus on >> solving other problems. I'd rather we all focus on improving >> performance and correctness even further than trying to make qcow2 be >> as good as what every other hypervisor had 5 years ago. >> >> qcow2 has been a failure. Let's live up to it and move on. Making >> statements at each release that qcow2 has issues but we'll fix it >> soon just makes us look like we don't know what we're doing. >> > > Switching file formats is a similar statement. It's not an easy thing to do, I'll be the first to admit it. But we have to do difficult things in the name of progress. This discussion is an important one to have because we should not do things of this significance lightly. But that doesn't mean we should be afraid to make significant changes. The lack of a useful image format in QEMU today is unacceptable. We cannot remain satisfied with the status quo. If you think we can fix qcow2, then fix qcow2. 
But it's not obvious to me that it's fixable so if you think it is, you'll need to guide the way. It's not enough to just wave your hands and say "amortize the expensive operations". It's not that easy to solve or else we would have solved it ages ago. > IMO, the real problem is the state machine implementation. Threading > it would make it much simpler. I wish I had the time to go back to do > that. The hard parts of supporting multiple requests in qed had nothing to do with threading vs. state machine. It was ensuring that all requests had independent state that didn't depend on a global context. Since the meta data cache has to be shared content, you have to be very careful about thinking through the semantics of evicting entries from the cache and bringing entries into the cache. The concurrency model really doesn't matter. > What is specifically so bad about qcow2? The refcount table? It > happens to be necessary for TRIM. Copy-on-write? It's needed for > external snapshots. The refcount table is not necessary for trim. For trim, all you need is one bit of information, whether a block is allocated or not. With one bit of information, the refcount table is redundant because you have that same information in the L2 tables. It's harder to obtain but the fact that it's obtainable means you can have weak semantics with maintaining a refcount table (IOW, a free list) because it's only an optimization. >> The choices we have 1) provide our users a format that has high >> performance and good data integrity 2) continue to only offer a >> format that has poor performance and bad data integrity and promise >> that we'll eventually fix it. >> >> We've been doing (2) for too long now. We need to offer a solution >> to users today. It's not fair to our users to not offer them a good >> solution just because we don't want to admit to previous mistakes. >> >> If someone can fix qcow2 and make it competitive, by all means, >> please do. 
> > We can have them side by side and choose later based on performance. > Though I fear if qed is merged qcow2 will see no further work. I think that's a weak argument not to merge qed and it's a bad way to grow a community. We shouldn't prevent useful code from being merged because there was a previous half-baked implementation. Evolution is sometimes destructive and that's not a bad thing. Otherwise, I'd still be working on Xen :-) We certainly should do our best to ease transition for users. For guest facing things, we absolutely need to provide full compatibility and avoid changing guests at all costs. But upgrading on the host is a part of life. It's the same reason that every few years, we go from ext2 -> ext3, ext3 -> ext4, ext4 -> btrfs. It's never pretty but the earth still continues to orbit the sun and we all seem to get by. Regards, Anthony Liguori
On 09/10/2010 08:48 AM, Christoph Hellwig wrote: > On Fri, Sep 10, 2010 at 08:22:14AM -0500, Anthony Liguori wrote: > >> fsck will always be fast on qed because the metadata is small. For a >> 1PB image, there's 128MB worth of L2s if it's fully allocated (keeping >> in mind, that once you're fully allocated, you'll never fsck again). If >> you've got 1PB worth of storage, I'm fairly sure you're going to be able >> to do 128MB of reads in a short period of time. Even if it's a few >> seconds, it only occurs on power failure so it's pretty reasonable. >> > I don't think it is. Even if the metadata is small it can still be > spread all over the disks and seek latencies might kill you. I think > if we want to make qed future proof it needs to provide transactional > integrity for metadata updates, just like a journaling filesystem. > I think the biggest challenge with an image format is finding the balance between host FS features and image format features and deciding where to solve problems. Down the road, fsync() might not actually suck on file systems and recovery in the face of failure might be trivial because we can just fsync() after every metadata write. So going to great lengths to deal with meta data transactions may be a lot of work for little gain. What makes us future proof is having a good feature support. qcow2 doesn't have this. We have a good way at making purely informational changes and also making changes that break the format. Those features are independent so they can be backported in a compatible way too. Regards, Anthony Liguori > Given that small amount of metadata and less different kinds it will > still be a lot simpler than a full filesystem of course. > >
Am 10.09.2010 17:02, schrieb Anthony Liguori: > What makes us future proof is having a good feature support. qcow2 > doesn't have this. We have a good way at making purely informational > changes and also making changes that break the format. Those features > are independent so they can be backported in a compatible way too. I might have agreed that it's useful to be able to backport them independently if we had had lots of such features added in the past. But we haven't. The qcow2 mechanism for compatible changes is header extensions (used exactly once, for the backing file format) and for incompatible changes increasing the version number (never used so far, if you consider qcow1 and qcow2 completely independent formats, which I think they are). Kevin
On 09/10/2010 05:56 PM, Anthony Liguori wrote: > On 09/10/2010 08:47 AM, Avi Kivity wrote: >> The current qcow2 implementation, yes. The qcow2 format, no. > > The qcow2 format has more writes because it maintains more meta data. > More writes == worse performance. > > You claim that you can effectively batch those writes such that the > worse performance will be in the noise. That claim needs to be proven > though because it's purely conjecture right now. It's based on experience. Why do you think batching allocations will not improve performance? In the common case (growing the physical file) allocating involves writing a '(int64_t)1' to a refcount table. Allocating multiple contiguous clusters means writing multiple such entries. That's trivial to batch. > > There is a trade off to batching too as you leak address space. If > you have to preallocate 2GB worth of address space to get good > performance, then I'm very sceptical that qcow2 achieves the goals of > a sparse file format. 2GB is 20 seconds worth of writes at 100 MB/s. It's way beyond what's needed. At a guess I'd say 100ms worth, and of course, only if actively writing. > If I do a qemu-img create -f qcow2 foo.img 10GB, and then do a naive > copy of the image file and end up with a 2GB image when there's > nothing in it, that's badness. Only if you crash in the middle. If not, you free the preallocation during shutdown (or when running a guest, when it isn't actively writing at 100 MB/s). > > And what do you do when you shutdown and start up? You're setting a > reference count on blocks and keeping metadata in memory that those > blocks are really free. Do you need an atexit hook to decrement the > reference counts? Not atexit, just when we close the image. > Do you need to create a free list structure that gets written out on > close? Yes, the same freelist that we allocate from. It's an "allocated but not yet referenced" list. > Just saying "we can do batching" is not solving the problem. 
If you > want to claim that the formats are equal, then at the very least, > you have to give a very exact description of how this would work > because it's not entirely straight forward. I thought I did, but I realize it is spread over multiple email messages. If you like, I can try to summarize it. It will be equally useful for qed once you add a freelist for UNMAP support. At least one filesystem I'm aware of does preallocation in this manner. > >>> 2) qcow2 has historically had data integrity issues. It's unclear >>> anyone is willing to say that they're 100% confident that there are >>> still data integrity issues in the format. >> >> Fast forward a few years, no one will be 100% confident there are no >> data integrity issues in qed. > > I don't think you have any grounds to make such a statement. No, it's a forward-looking statement. But you're already looking at adding a freelist for UNMAP support and three levels for larger images. So it's safe to say that qed will not remain as nice and simple as it is now. >> >>> 4) We have looked at trying to fix qcow2. It appears to be a >>> monumental amount of work that starts with a rewrite where it's >>> unclear if we can even keep supporting all of the special features. >>> IOW, there is likely to be a need for users to experience some type >>> of image conversion or optimization process. >> >> I don't see why. > > Because you're oversimplifying what it takes to make qcow2 perform well. Maybe. With all its complexity, it's nowhere near as close to the simplest filesystem. The biggest burden is the state machine design. > >>> >>> 5) A correct version of qcow2 has terrible performance. >> >> Not inherently. > > A "naive" correct version of qcow2 does. Look at the above example. > If you introduce a free list, you change the format which means that > you couldn't support moving an image to an older version. qcow2 already has a free list, it's the refcount table. 
> > So just for your batching example, the only compatible approach is to > reduce the reference count on shutdown. But there's definitely a > trade off because a few unclean shut downs could result in a huge image. Not just on shutdown, also on guest quiesce. And yes, many unclean shutdowns will bloat the image size. Definitely a downside. The qed solution is to not support UNMAP or qed-on-lvm, and to require fsck instead. Or to introduce an on-disk freelist, at which point you get the qcow2 problems back. > >>> You need to do a bunch of fancy tricks to recover that performance. >>> Every fancy trick needs to be carefully evaluated with respect to >>> correctness. There's a large surface area for potential data >>> corruptors. >> >> s/large/larger/. The only real difference is the refcount table, >> which I agree sucks, but happens to be nice for TRIM support. > > I don't see the advantage at all. I can't parse this. You don't see the advantage of TRIM (now UNMAP)? You don't see the advantage of refcount tables? There isn't any, except when compared to a format with no freelist which therefore can't support UNMAP. >> Those are properties of the implementation, not the format. The >> format makes it harder to get it right but doesn't give us a free >> pass not to do it. > > > If the complexity doesn't buy us anything, then why pay the cost of it? Because of compatibility. Starting from scratch, I'd pick qed, with three levels and some way to support UNMAP. > > Let's review the purported downsides of QED. > > 1) It's a new image format. If users create QED images, they can't > use them with older QEMU's. However, if we add a new feature to > qcow2, we have the same problem. Depends. Some features don't need format changes (UNMAP). On the other hand, qcow2 doesn't have a feature bitmap, which complicates things. > > 2) If a user has an existing image qcow2 and wants to get the > performance/correctness advantages of QED, they have to convert their > images. 
That said, in place conversion can tremendously simplify this. Live conversion would be even better. It's still a user-visible hassle. > > 3) Another format adds choice, choice adds complexity. From my > perspective, QED can reduce choice long term because we can tell users > that unless they have a strong reason otherwise, use QED. We cannot > do that with qcow2 today. That may be an implementation detail of > qcow2, but it doesn't change the fact that there's complexity in > choosing an image format today. True. 4) Requires fsck on unclean shutdown 5) No support for qed-on-lvm 6) limited image resize 7) No support for UNMAP All are fixable, the latter with considerable changes to the format (allocating from an on-disk freelist requires an intermediate sync step; if the freelist is not on-disk, you can lose unbounded on-disk storage on clean shutdown). >> Sure, because you don't care about users. All of the complexity of >> changing image formats (and deciding whether to do that or not) is >> hidden away. > > Let's not turn this into a "I care more about users than you do" > argument. Changing image formats consists of running a single > command. The command is pretty slow today but we can make it pretty > darn fast. It seems like a relatively small price to pay for a > relatively large gain. It's true for desktop users. It's not true for large installations. >>> >>> The impact to users is minimal. Upgrading images to a new format is >>> not a big deal. This isn't guest visible and we're not talking >>> about deleting qcow2 and removing support for it. >> >> It's a big deal to them. Users are not experts in qemu image >> formats. They will have to learn how to do it, whether they can do >> it (need to upgrade all your qemus before you can do it, need to make >> sure you're not using qcow2 features, need to be sure you're not >> planning to use qcow2 features). > > But we can't realistically support users that are using those extra > features today anyway. Why not? 
> It's those "features" that are the fundamental problem. I agree some of them (compression, in-image snapshots) are misfeatures. >> Sure, we'll support qcow2, but will we give it the same attention? > > We have a lot of block formats in QEMU today but only one block format > that actually performs well and has good data integrity. > > We're not giving qcow2 the attention it would need today to promote it > to a Useful Format so I'm not sure that it really matters. I don't think it's so useless. It's really only slow when allocating, yes? Once you've allocated it is fully async IIRC. So even today qcow2 is only slow at the start of the lifetime of the image. >>> If you're willing to leak blocks on a scale that is still unknown. >> >> Who cares, those aren't real storage blocks. > > They are once you move the image from one place to another. If that > doesn't concern you, it really should. I don't see it as a huge problem, certainly less than fsck. If you think fsck is a smaller hit, you can use it to recover the space. Hm, you could have an 'unclean shutdown' bit in qcow2 and run a scrubber in the background if you see it set and recover the space. > >>> It's not at all clear that making qcow2 have the same >>> characteristics as qed is an easy problem. qed is specifically >>> designed to avoid synchronous metadata updates. qcow2 cannot >>> achieve that. >> >> qcow2 and qed are equivalent if you disregard the refcount table >> (which we address by preallocation). Exactly the same technique you >> use for sync-free metadata updates in qed can be used for qcow2. > > You cannot ignore the refcount table, that's the point of the discussion. #include "I'm using preallocation to reduce its cost". > >>> You can *potentially* batch metadata updates by preallocating >>> clusters, but what's the right amount to preallocate >> >> You look at your write rate and adjust it dynamically so you never wait. > > It's never that simple. How long do you look at the write rate? 
Do > you lower the amount dynamically, if so, after how long? Predicting > the future is never easy. No, it's not easy. But you have to do it in qed as well, if you want to avoid fsck. >>> and is it really okay to leak blocks at that scale? >> >> Again, those aren't real blocks. And we're talking power loss >> anyway. It's certainly better than requiring fsck for correctness. > > They are once you copy the image. And power loss is the same thing as > unexpected exit because you're not simply talking about delaying a > sync, you're talking staging future I/O operations purely within QEMU. qed is susceptible to the same problem. If you have a 100MB write and qemu exits before it updates L2s, then those 100MB are leaked. You could alleviate the problem by writing L2 at intermediate points, but even then, a power loss can leak those 100MB. qed trades off the freelist for the file size (anything beyond the file size is free), it doesn't eliminate it completely. So you still have some of its problems, but you don't get its benefits. >>> It's a weak story either way. There's a burden of proof still >>> required to establish that this would, indeed, address the >>> performance concerns. >> >> I don't see why you doubt it so much. Amortization is an well known >> technique for reducing the cost of expensive operations. > > Because there are always limits, otherwise, all expensive operations > would be cheap, and that's not reality. Well, I guess we won't get anywhere with a theoretical discussion here. > >> You misunderstand me. I'm not advocating dropping qed and stopping >> qcow2 development. I'm advocating dropping qed and working on qcow2 >> to provide the benefits that qed brings. > > If you think qcow2 is fixable, than either 1) fix qcow2 and prove me > wrong 2) detail in great length how you would fix qcow2, and prove me > wrong. Either way, the burden of proof is on establishing that qcow2 > is fixable. 
I agree the burden of proof is on me (I'm just going to bounce it off to Kevin). Mere words shouldn't be used to block off new work. > > So far, the proposed fixes are not specific and/or have unacceptable > trade offs. I thought they were quite specific. I'll try to summarize them in one place so at least they're not lost. > Having a leaking image is not acceptable IMHO because it potentially > becomes something that is guest exploitable. > > If a guest finds a SEGV that is not exploitable in any meaningful way > accept crashing QEMU, by leaking data in each crash, a guest can now > grow an image's virtual size indefinitely. > > This does have real costs in disk space as the underlying file system > does need to deal with metadata, but it's not unrealistic for > management tools to copy images around for various reasons (maybe > offline backup). A reasonable management tool might do planning based > on maximum image size, but now the tools have to cope with (virtually) > infinitely large images. The qed solution is fsck, which is a lot worse IMO. >> This simple formula doesn't work if some of your hosts don't support >> qed yet. And it's still complicated for users because they have to >> understand all of that. "trust me, use qed" is not going to work. > > Verses what? "Trust me, this time, we've finally fixed qcow2's data > integrity issues" is going to work? That's an uphill battle no matter > what. We have to fix qcow2 anyway, since we can't ensure users do upgrade to qed. >>> >>> qcow2 has been a failure. Let's live up to it and move on. Making >>> statements at each release that qcow2 has issues but we'll fix it >>> soon just makes us look like we don't know what we're doing. >>> >> >> Switching file formats is a similar statement. > > It's not an easy thing to do, I'll be the first to admit it. But we > have to do difficult things in the name of progress. 
> > This discussion is an important one to have because we should not do > things of this significance lightly. > > But that doesn't mean we should be afraid to make significant > changes. The lack of a useful image format in QEMU today in > unacceptable. We cannot remain satisfied with the status quo. > > If you think we can fix qcow2, then fix qcow2. But it's not obvious > to me that it's fixable so if you think it is, you'll need to guide > the way. I'm willing to list the things I think should be done. But someone else will have to actually do them and someone else will have to allocate the time for this work, which is not going to be insignificant. > It's not enough to just wave your hands and say "ammortize the > expensive operations". It's not that easy to solve or else we would > have solved it ages ago. We were rightly focusing on data integrity first. >> IMO, the real problem is the state machine implementation. Threading >> it would make it much simpler. I wish I had the time to go back to >> do that. > > The hard parts of support multiple requests in qed had nothing to do > with threading vs. state machine. It was ensuring that all requests > had independent state that didn't depend on a global context. Since > the meta data cache has to be shared content, you have to be very > careful about thinking through the semantics of evicting entries from > the cache and bringing entries into the cache. > > The concurrency model really doesn't matter. I disagree. When you want to order dependent operations with threads, you stick a mutex in the data structure that needs serialization. The same problem with a state machine means collecting all the state in the call stack, sticking it in a dependency chain, and scheduling a restart when the first operation completes. It's a lot more code. >> What is specifically so bad about qcow2? The refcount table? It >> happens to be necessary for TRIM. Copy-on-write? It's needed for >> external snapshots. 
> > The refcount table is not necessary for trim. For trim, all you need > is one bit of information, whether a block is allocated or not. > > With one bit of information, the refcount table is redundant because > you have that same information in the L2 tables. It's harder to > obtain but the fact that it's obtainable means you can have weak > semantics with maintaining a refcount table (IOW, a free list) because > it's only an optimization. Well, the refcount table is also redundant wrt qcow2's L2 tables. You can always reconstruct it with an fsck. You store 64 bits vs 1 bit (or less if you use an extent based format, or only store allocated blocks) but essentially it has the same requirements. >> We can have them side by side and choose later based on performance. >> Though I fear if qed is merged qcow2 will see no further work. > > I think that's a weak argument not to merge qed and it's a bad way to > grow a community. Certainly, it's open source and we should encourage new ideas. But I'm worried that when qed grows for a while it will become gnarly, and we'll lost some of the benefit, while we'll create user confusion. > We shouldn't prevent useful code from being merged because there was a > previous half-baked implementation. Evolution is sometimes > destructive and that's not a bad thing. Otherwise, I'd still be > working on Xen :-) > > We certainly should do our best to ease transition for users. For > guest facing things, we absolutely need to provide full compatibility > and avoid changing guests at all costs. > > But upgrading on the host is a part of life. It's the same reason > that every few years, we go from ext2 -> ext3, ext3 -> ext4, ext4 -> > btrfs. It's never pretty but the earth still continues to orbit the > sun and we all seem to get by. ext[234] is more like qcow2 evolution. qcow2->qed is more similar to ext4->btrfs, but compare the huge feature set difference between ext4 and btrfs, and qcow2 and qed.
On 09/10/2010 10:18 AM, Kevin Wolf wrote: > Am 10.09.2010 17:02, schrieb Anthony Liguori: > >> What makes us future proof is having a good feature support. qcow2 >> doesn't have this. We have a good way at making purely informational >> changes and also making changes that break the format. Those features >> are independent so they can be backported in a compatible way too. >> > I might have agreed that it's useful to be able to backport them > independently if we had had lots of such features added in the past. But > we haven't. > I think part of why we haven't had them is that the mechanisms aren't very flexible. A good example of where feature support would be very nice is for changing the way snapshots metadata is recorded in qcow2. It would be nice to be able to represent snapshots with a uuid. If you added new metadata that had uuid based snapshots that were hierarchical and added a feature bit, it would have some nice properties. Since most images don't have snapshots, the common case would be a qcow2 that was fully backwards compatible. You would also get a graceful failure for using a new image with an old QEMU. You could argue that you can do the same by bumping the version number but that really only works when all changes are additive. If you had two features and you only backported one, badness ensues. On its own, I don't think feature support is enough to justify a new image format. However, it's a nice thing to have. Regards, Anthony Liguori > The qcow2 mechanism for compatible changes is header extensions (used > exactly once, for the backing file format) and for incompatible changes > increasing the version number (never used so far, if you consider qcow1 > and qcow2 completely independent formats, which I think they are). > > Kevin >
Am 10.09.2010 17:53, schrieb Anthony Liguori: > On 09/10/2010 10:18 AM, Kevin Wolf wrote: >> Am 10.09.2010 17:02, schrieb Anthony Liguori: >> >>> What makes us future proof is having a good feature support. qcow2 >>> doesn't have this. We have a good way at making purely informational >>> changes and also making changes that break the format. Those features >>> are independent so they can be backported in a compatible way too. >>> >> I might have agreed that it's useful to be able to backport them >> independently if we had had lots of such features added in the past. But >> we haven't. >> > > I think part of why we haven't had them is that the mechanisms aren't > very flexible. > > A good example of where feature support would be very nice is for > changing the way snapshots metadata is recorded in qcow2. > > It would be nice to be able to represent snapshots with a uuid. If you > added new metadata that had uuid based snapshots that were hierarchical > and added a feature bit, it would have some nice properties. > > Since most images don't have snapshots, the common case would be a qcow2 > that was fully backwards compatible. You would also get a graceful > failure for using a new image with an old QEMU. Well, snapshots have an ID today (which is different from their name). Nobody stops you from putting a UUID there. Fully backwards compatible, no feature flag needed. I think Miguel was planning to actually do this. Kevin
On 09/10/2010 10:49 AM, Avi Kivity wrote: >> If I do a qemu-img create -f qcow2 foo.img 10GB, and then do a >> naive copy of the image file and end up with a 2GB image when there's >> nothing in it, that's badness. > > Only if you crash in the middle. If not, you free the preallocation > during shutdown (or when running a guest, when it isn't actively > writing at 100 MB/s). Which is potentially guest exploitable. >> And what do you do when you shutdown and start up? You're setting a >> reference count on blocks and keeping metadata in memory that those >> blocks are really free. Do you need an atexit hook to decrement the >> reference counts? > > Not atexit, just when we close the image. Just a detail, but we need an atexit() handler to make sure block devices get closed because we have too many exit()s in the code today. >> Do you need to create a free list structure that gets written out on >> close? > > Yes, the same freelist that we allocate from. It's an "allocated but > not yet referenced" list. Does it get written to disk? >> Just saying "we can do batching" is not solving the problem. If you >> want to claim that the formats are equally, then in the very least, >> you have to give a very exact description of how this would work >> because it's not entirely straight forward. > > I thought I did, but I realize it is spread over multiple email > messages. If you like, I can try to summarize it. It will be equally > useful for qed once you add a freelist for UNMAP support. Yes, please consolidate so we can debate specifics. If there's a reasonable way to fix qcow2, I'm happy to walk away from qed. But we've studied the problem and do not believe there's a reasonable approach to fixing qcow2 whereas reasonable considers the amount of development effort, the time line required to get things right, and the confidence we would have in the final product compared against the one time cost of introducing a new format. 
>> >>>> 2) qcow2 has historically had data integrity issues. It's unclear >>>> anyone is willing to say that they're 100% confident that there are >>>> still data integrity issues in the format. >>> >>> Fast forward a few years, no one will be 100% confident there are no >>> data integrity issues in qed. >> >> I don't think you have any grounds to make such a statement. > > No, it's a forward-looking statement. But you're already looking at > adding a freelist for UNMAP support and three levels for larger > images. So it's safe to say that qed will not remain as nice and > simple as it is now. I have a lot of faith in starting from a strong base and avoiding making it weaker vs. starting from a weak base and trying to make it stronger. I realize it's somewhat subjective though. >>> >>>> 4) We have looked at trying to fix qcow2. It appears to be a >>>> monumental amount of work that starts with a rewrite where it's >>>> unclear if we can even keep supporting all of the special >>>> features. IOW, there is likely to be a need for users to >>>> experience some type of image conversion or optimization process. >>> >>> I don't see why. >> >> Because you're oversimplifying what it takes to make qcow2 perform well. > > Maybe. With all its complexity, it's nowhere near as close to the > simplest filesystem. The biggest burden is the state machine design. Maybe I'm broken with respect to how I think, but I find state machines very easy to rationalize. To me, the biggest burden in qcow2 is thinking through how you deal with shared resources. Because you can block for a long period of time during write operations, it's not enough to just carry a mutex during all metadata operations. You have to stage operations and commit them at very specific points in time. >> >>>> >>>> 5) A correct version of qcow2 has terrible performance. >>> >>> Not inherently. >> >> A "naive" correct version of qcow2 does. Look at the above example. 
>> If you introduce a free list, you change the format which means that >> you couldn't support moving an image to an older version. > > qcow2 already has a free list, it's the refcount table. Okay, qed already has a free list, it's the L1/L2 tables. Really, the ref count table in qcow2 is redundant. You can rebuild it if you needed to which means you could relax the integrity associated with it if you were willing to add an fsck process. But with internal snapshots, you can have a lot more metadata than without them so fsck can be very, very expensive. It's difficult to determine how to solve this problem. It's far easier to just avoid internal snapshots altogether and this is exactly the thought process that led to QED. Once you drop support for internal snapshots, you can dramatically simplify. >> >> So just for your batching example, the only compatible approach is to >> reduce the reference count on shutdown. But there's definitely a >> trade off because a few unclean shut downs could result in a huge image. > > Not just on shutdown, also on guest quiesce. And yes, many unclean > shutdowns will bloat the image size. Definitely a downside. > > The qed solution is to not support UNMAP or qed-on-lvm, and to require > fsck instead. We can support UNMAP. Not sure why you're suggesting we can't. Not doing qed-on-lvm is definitely a limitation. The one use case I've heard is qcow2 on top of clustered LVM as clustered LVM is simpler than a clustered filesystem. I don't know the space well enough so I need to think more about it. >> I don't see the advantage at all. > > I can't parse this. You don't see the advantage of TRIM (now UNMAP)? > You don't see the advantage of refcount tables? There isn't any, > except when compared to a format with no freelist which therefore > can't support UNMAP. Refcount table. See above discussion for my thoughts on refcount table. 
>> >> 2) If a user has an existing image qcow2 and wants to get the >> performance/correctness advantages of QED, they have to convert their >> images. That said, in place conversion can tremendously simplify this. > > Live conversion would be even better. It's still a user-visible hassle. Yeah, but you need a user to initiate it. Otherwise, it's doable. >> 3) Another format adds choice, choice adds complexity. From my >> perspective, QED can reduce choice long term because we can tell >> users that unless they have a strong reason otherwise, use QED. We >> cannot do that with qcow2 today. That may be an implementation >> detail of qcow2, but it doesn't change the fact that there's >> complexity in choosing an image format today. > > True. > > 4) Requires fsck on unclean shutdown I know it's uncool to do this in 2010, but I honestly believe it's a reasonable approach considering the relative simplicity of our FS compared to a normal FS. We're close to having fsck support so we can publish some performance data from doing it on a reasonable large disk (like 1TB). Let's see what that looks like before we draw too many conclusions. > 5) No support for qed-on-lvm > > 6) limited image resize Not anymore than qcow2 FWIW. Again, with the default create parameters, we can resize up to 64TB without rewriting metadata. I wouldn't call that limited image resize. > 7) No support for UNMAP > > All are fixable, the latter with considerable changes to the format > (allocating from an on-disk freelist requires an intermediate sync > step; if the freelist is not on-disk, you can lose unbounded on-disk > storage on clean shutdown). If you treat the on-disk free list as advisory, then you can be very loose with writing the free list to disk. You only have to rebuild the free list on unclean shutdown when you have to do an fsck anyway. If you're doing an fsck, you can rebuild the free list for free. So really, support for UNMAP is free if you're okay with fsck. 
And let's debate fsck some more when we have some proper performance data. > It's true for desktop users. It's not true for large installations. > >>>> >>>> The impact to users is minimal. Upgrading images to a new format >>>> is not a big deal. This isn't guest visible and we're not talking >>>> about deleting qcow2 and removing support for it. >>> >>> It's a big deal to them. Users are not experts in qemu image >>> formats. They will have to learn how to do it, whether they can do >>> it (need to upgrade all your qemus before you can do it, need to >>> make sure you're not using qcow2 features, need to be sure you're >>> not planning to use qcow2 features). >> >> But we can't realistically support users that are using those extra >> features today anyway. > > Why not? When I say, "support users", I mean make sure that they get very good performance and data integrity. So far, we've only talked about how to get good performance when there have never been snapshots but I think we also need to consider how to deal with making sure that no matter what feature a user is using, they get consistent results. >>> Sure, we'll support qcow2, but will we give it the same attention? >> >> We have a lot of block formats in QEMU today but only one block >> format that actually performs well and has good data integrity. >> >> We're not giving qcow2 the attention it would need today to promote >> it to a Useful Format so I'm not sure that it really matters. > > I don't think it's so useless. It's really only slow when allocating, > yes? Once you've allocated it is fully async IIRC. It bounces all buffers still and I still think it's synchronous (although Kevin would know better). >>>> If you're willing to leak blocks on a scale that is still unknown. >>> >>> Who cares, those aren't real storage blocks. >> >> They are once you move the image from one place to another. If that >> doesn't concern you, it really should. > > I don't see it as a huge problem, certainly less than fsck. 
If you > think fsck is a smaller hit, you can use it to recover the space. > > Hm, you could have an 'unclean shutdown' bit in qcow2 and run a > scrubber in the background if you see it set and recover the space. Yes, you'll want to have that regardless. But adding new things to qcow2 has all the problems of introducing a new image format. >> >>>> You can *potentially* batch metadata updates by preallocating >>>> clusters, but what's the right amount to preallocate >>> >>> You look at your write rate and adjust it dynamically so you never >>> wait. >> >> It's never that simple. How long do you look at the write rate? Do >> you lower the amount dynamically, if so, after how long? Predicting >> the future is never easy. > > No, it's not easy. But you have to do it in qed as well, if you want > to avoid fsck. I don't want to avoid fsck, but we need to provide data about cost of fsck in order to really make that case. > >>>> and is it really okay to leak blocks at that scale? >>> >>> Again, those aren't real blocks. And we're talking power loss >>> anyway. It's certainly better than requiring fsck for correctness. >> >> They are once you copy the image. And power loss is the same thing >> as unexpected exit because you're not simply talking about delaying a >> sync, you're talking staging future I/O operations purely within QEMU. > > qed is susceptible to the same problem. If you have a 100MB write and > qemu exits before it updates L2s, then those 100MB are leaked. You > could alleviate the problem by writing L2 at intermediate points, but > even then, a power loss can leak those 100MB. > > qed trades off the freelist for the file size (anything beyond the > file size is free), it doesn't eliminate it completely. So you still > have some of its problems, but you don't get its benefits. I think you've just established that qcow2 and qed both require an fsck. I don't disagree :-) >> It's not an easy thing to do, I'll be the first to admit it. 
But we >> have to do difficult things in the name of progress. >> >> This discussion is an important one to have because we should not do >> things of this significance lightly. >> >> But that doesn't mean we should be afraid to make significant >> changes. The lack of a useful image format in QEMU today is >> unacceptable. We cannot remain satisfied with the status quo. >> >> If you think we can fix qcow2, then fix qcow2. But it's not obvious >> to me that it's fixable so if you think it is, you'll need to guide >> the way. > > I'm willing to list the things I think should be done. But someone > else will have to actually do them and someone else will have to > allocate the time for this work, which is not going to be insignificant. Understood. >>> IMO, the real problem is the state machine implementation. >>> Threading it would make it much simpler. I wish I had the time to >>> go back to do that. >> >> The hard parts of supporting multiple requests in qed had nothing to do >> with threading vs. state machine. It was ensuring that all requests >> had independent state that didn't depend on a global context. Since >> the meta data cache has to be shared content, you have to be very >> careful about thinking through the semantics of evicting entries from >> the cache and bringing entries into the cache. >> >> The concurrency model really doesn't matter. > > I disagree. When you want to order dependent operations with threads, > you stick a mutex in the data structure that needs serialization. The > same problem with a state machine means collecting all the state in > the call stack, sticking it in a dependency chain, and scheduling a > restart when the first operation completes. It's a lot more code. Yeah, but I'm saying that you can't just carry a lock, you have to make sure you don't carry locks over write()s or read()s which means you end up having to stage certain operations with an explicit commit. If you think async is harder, that's fine. 
To me, that's a simple part. >>> What is specifically so bad about qcow2? The refcount table? It >>> happens to be necessary for TRIM. Copy-on-write? It's needed for >>> external snapshots. >> >> The refcount table is not necessary for trim. For trim, all you need >> is one bit of information, whether a block is allocated or not. >> >> With one bit of information, the refcount table is redundant because >> you have that same information in the L2 tables. It's harder to >> obtain but the fact that it's obtainable means you can have weak >> semantics with maintaining a refcount table (IOW, a free list) >> because it's only an optimization. > > Well, the refcount table is also redundant wrt qcow2's L2 tables. You > can always reconstruct it with an fsck. > > You store 64 bits vs 1 bit (or less if you use an extent based format, > or only store allocated blocks) but essentially it has the same > requirements. Precisely. >>> We can have them side by side and choose later based on >>> performance. Though I fear if qed is merged qcow2 will see no >>> further work. >> >> I think that's a weak argument not to merge qed and it's a bad way to >> grow a community. > > Certainly, it's open source and we should encourage new ideas. But > I'm worried that when qed grows for a while it will become gnarly, and > we'll lose some of the benefit, while we'll create user confusion. But that would be regressions and we need to be good about rejecting things that cause regressions. >> We shouldn't prevent useful code from being merged because there was >> a previous half-baked implementation. Evolution is sometimes >> destructive and that's not a bad thing. Otherwise, I'd still be >> working on Xen :-) >> >> We certainly should do our best to ease transition for users. For >> guest facing things, we absolutely need to provide full compatibility >> and avoid changing guests at all costs. >> >> But upgrading on the host is a part of life. 
It's the same reason >> that every few years, we go from ext2 -> ext3, ext3 -> ext4, ext4 -> >> btrfs. It's never pretty but the earth still continues to orbit the >> sun and we all seem to get by. > > ext[234] is more like qcow2 evolution. qcow2->qed is more similar to > ext4->btrfs, but compare the huge feature set difference between ext4 > and btrfs, and qcow2 and qed. To me, performance and correctness are huge features. Regards, Anthony Liguori
On 09/10/2010 11:05 AM, Kevin Wolf wrote: > Am 10.09.2010 17:53, schrieb Anthony Liguori: > >> On 09/10/2010 10:18 AM, Kevin Wolf wrote: >> >>> Am 10.09.2010 17:02, schrieb Anthony Liguori: >>> >>> >>>> What makes us future proof is having a good feature support. qcow2 >>>> doesn't have this. We have a good way at making purely informational >>>> changes and also making changes that break the format. Those features >>>> are independent so they can be backported in a compatible way too. >>>> >>>> >>> I might have agreed that it's useful to be able to backport them >>> independently if we had had lots of such features added in the past. But >>> we haven't. >>> >>> >> I think part of why we haven't had them is that the mechanisms aren't >> very flexible. >> >> A good example of where feature support would be very nice is for >> changing the way snapshots metadata is recorded in qcow2. >> >> It would be nice to be able to represent snapshots with a uuid. If you >> added new metadata that had uuid based snapshots that were hierarchical >> and added a feature bit, it would have some nice properties. >> >> Since most images don't have snapshots, the common case would be a qcow2 >> that was fully backwards compatible. You would also get a graceful >> failure for using a new image with an old QEMU. >> > Well, snapshots have an ID today (which is different from their name). > Nobody stops you from putting a UUID there. Fully backwards compatible, > no feature flag needed. I think Miguel was planning to actually do this. > The problem is that management tools have to make a decision about what to do with ID's that aren't UUIDs which means that in our management interface, we can't just expose UUIDs but instead we have to expose strings that may sometimes be UUIDs. I don't think it buys us a lot to get the backwards compatibility. Regards, Anthony Liguori > Kevin >
Am 10.09.2010 19:07, schrieb Anthony Liguori: >>>> Sure, we'll support qcow2, but will we give it the same attention? >>> >>> We have a lot of block formats in QEMU today but only one block >>> format that actually performs well and has good data integrity. >>> >>> We're not giving qcow2 the attention it would need today to promote >>> it to a Useful Format so I'm not sure that it really matters. >> >> I don't think it's so useless. It's really only slow when allocating, >> yes? Once you've allocated it is fully async IIRC. > > It bounces all buffers still and I still think it's synchronous > (although Kevin would know better). Yes, it does bounce the buffers, though I'm looking into this anyway because you raised concerns about unbounded allocations. (And it has been on my todo list for a while, but there were always more urgent things) What's synchronous in qcow2 is metadata access and COW. The guest requests themselves are handled asynchronously. Once the image has reached its full size, there are only metadata reads when you need to load a different L2 table. With 64k clusters we have an L2 cache that spans 8 GB of virtual disk space, so it shouldn't happen too often. >>>>> If you're willing to leak blocks on a scale that is still unknown. >>>> >>>> Who cares, those aren't real storage blocks. >>> >>> They are once you move the image from one place to another. If that >>> doesn't concern you, it really should. >> >> I don't see it as a huge problem, certainly less than fsck. If you >> think fsck is a smaller hit, you can use it to recover the space. >> >> Hm, you could have an 'unclean shutdown' bit in qcow2 and run a >> scrubber in the background if you see it set and recover the space. > > Yes, you'll want to have that regardless. But adding new things to > qcow2 has all the problems of introducing a new image format. Not exactly the same. 
The advantage of qed, namely not having the burden of supporting all qcow2 features, is at the same time its biggest disadvantage because it means that qcow2 can't be deprecated. Adding things to qcow2 means staying compatible to older versions (including all features) and still maintaining only one driver. Kevin
Am 10.09.2010 19:10, schrieb Anthony Liguori: > On 09/10/2010 11:05 AM, Kevin Wolf wrote: >> Am 10.09.2010 17:53, schrieb Anthony Liguori: >> >>> On 09/10/2010 10:18 AM, Kevin Wolf wrote: >>> >>>> Am 10.09.2010 17:02, schrieb Anthony Liguori: >>>> >>>> >>>>> What makes us future proof is having a good feature support. qcow2 >>>>> doesn't have this. We have a good way at making purely informational >>>>> changes and also making changes that break the format. Those features >>>>> are independent so they can be backported in a compatible way too. >>>>> >>>>> >>>> I might have agreed that it's useful to be able to backport them >>>> independently if we had had lots of such features added in the past. But >>>> we haven't. >>>> >>>> >>> I think part of why we haven't had them is that the mechanisms aren't >>> very flexible. >>> >>> A good example of where feature support would be very nice is for >>> changing the way snapshots metadata is recorded in qcow2. >>> >>> It would be nice to be able to represent snapshots with a uuid. If you >>> added new metadata that had uuid based snapshots that were hierarchical >>> and added a feature bit, it would have some nice properties. >>> >>> Since most images don't have snapshots, the common case would be a qcow2 >>> that was fully backwards compatible. You would also get a graceful >>> failure for using a new image with an old QEMU. >>> >> Well, snapshots have an ID today (which is different from their name). >> Nobody stops you from putting a UUID there. Fully backwards compatible, >> no feature flag needed. I think Miguel was planning to actually do this. >> > > The problem is that management tools have to make a decision about what > to do with ID's that aren't UUIDs which means that in our management > interface, we can't just expose UUIDs but instead we have to expose > strings that may sometimes be UUIDs. > > I don't think it buys us a lot to get the backwards compatibility. 
No matter how you store the UUID and no matter how many feature flags you're going to throw on it, you'll always have old images and new images. Management tools will have to cope with that (or break backwards compatibility, of course - they still have that choice). Kevin
On Fri, Sep 10, 2010 at 2:10 PM, Anthony Liguori <anthony@codemonkey.ws> wrote: >> >> Well, snapshots have an ID today (which is different from their name). >> Nobody stops you from putting a UUID there. Fully backwards compatible, >> no feature flag needed. I think Miguel was planning to actually do this. >> > > The problem is that management tools have to make a decision about what to > do with ID's that aren't UUIDs which means that in our management interface, > we can't just expose UUIDs but instead we have to expose strings that may > sometimes be UUIDs. > > I don't think it buys us a lot to get the backwards compatibility. > My main idea is to do not expose any ID/UUID information to the user, at least by default. Snapshots must have a name to be presented to the user, if he/she does not provide one we create it [1]. As you said, the ID field in qcow2 is just a string, so if we put an UUID there, no harm is done. The problem was to store parent information. qcow2 has an extra_data area that can store anything, so I used that space. This feature is an old wish from libvirt guys. I could not follow all the details discussed about qed so far, but would something like this work? Regards, Miguel [1] commit 7d631a116ad8fe07001e2cc4c559a06aac82745f
On 09/10/2010 12:42 PM, Kevin Wolf wrote: >> It bounces all buffers still and I still think it's synchronous >> (although Kevin would know better). >> > Yes, it does bounce the buffers, though I'm looking into this anyway > because you raised concerns about unbounded allocations. (And it has > been on my todo list for a while, but there were always more urgent things) > > What's synchronous in qcow2 is metadata access and COW. The guest > requests themselves are handled asynchronously. I think we should differentiate between serialized requests and synchronous requests because I got a bit sloppy in my nomenclature. Metadata reads and writes along with COW operations are synchronous in qcow2 IIUC. The data read/write is asynchronous. Metadata is cached and if a cache hit is taken, then a full request can be asynchronous. It's not clear to me if qcow2 can handle parallel requests though assuming that both requests don't require a cache miss for meta data. Does any meta data cache miss (read or write) cause a stall/flush of the request queue? In QED, all operations are asynchronous including metadata cache misses and COW operations. Multiple requests are handled in parallel with the exception of metadata writes. A metadata write will stall all future metadata cache miss fulfillment but they remain asynchronous. Requests that can be satisfied from the cache will be executed in parallel though. In the future, we'll do something more sophisticated and allow multiple simultaneous metadata writes provided they aren't to the same L2/L1 location. Regards, Anthony Liguori
Stefan Hajnoczi wrote: > Since there is no ordering imposed between the data write and metadata > update, the following scenarios may occur on crash: > 1. Neither data write nor metadata update reach the disk. This is > fine, qed metadata has not been corrupted. > 2. Data reaches disk but metadata update does not. We have leaked a > cluster but not corrupted metadata. Leaked clusters can be detected > with qemu-img check. > 3. Metadata update reaches disk but data does not. The interesting > case! The L2 table now points to a cluster which is beyond the last > cluster in the image file. Remember that file size is rounded down by > cluster size, so partial data writes are discarded and this case > applies. Better add: 4. File size is extended fully, but the data didn't all reach the disk. 5. Metadata is partially updated. 6. (Nasty) Metadata partial write has clobbered neighbouring metadata which wasn't meant to be changed. (This may happen up to a sector size on normal hard disks - data is hard to come by. This happens to a much larger file range on flash and RAIDs sometimes - I call it the "radius of destruction"). 6 can also happen when doing the L1 updated mentioned earlier, in which case you might lose a much larger part of the guest image. -- Jamie
On 09/06/2010 05:45 AM, Anthony Liguori wrote: >> >> Before inventing yet another image format, you certainly have checked >> the existing ones. > > Obviously, yes. > > Here are the issues: > > cow.c: it's cow of an otherwise sparse file. An important reason for > implementing a format is the ability to copy (or scp) an image without > special tools. > > qcow2.c: the refcount, cow cluster, and compression make an > implementation seeking integrity and performance challenging. > > vmdk.c: we feel it's important for qemu to have a block format with a > gpl friendly specification that we have a say in > > vhd/vpc.c: same as vmdk with the addition that the OSP is known to not > be gpl friendly > > vdi.c: uses a bitmap instead of a two level table. An advantage of a > two level table is that it allows image resize without much fuss. > > qcow.c: it lacks extensibility and compression means that there's no > guarantee that blocks are a fixed size. This makes it very difficult to > implement a high performance block format without having two separate > code paths. > Okay... how about, say, Linux LVM2 format? -hpa
On 09/10/2010 08:07 PM, Anthony Liguori wrote: > On 09/10/2010 10:49 AM, Avi Kivity wrote: >>> If I do a qemu-img create -f qcow2 foo.img 10GB, and then do a >>> naive copy of the image file and end up with a 2GB image when >>> there's nothing in it, that's badness. >> >> Only if you crash in the middle. If not, you free the preallocation >> during shutdown (or when running a guest, when it isn't actively >> writing at 100 MB/s). > > Which is potentially guest exploitable. If this worries you, run a scrubber in the background after an uncontrolled crash. Like qed fsck, this will recover the free list from L2. Unlike qed fsck, it will not delay starting of large guests. > >>> And what do you do when you shutdown and start up? You're setting a >>> reference count on blocks and keeping metadata in memory that those >>> blocks are really free. Do you need an atexit hook to decrement the >>> reference counts? >> >> Not atexit, just when we close the image. > > Just a detail, but we need an atexit() handler to make sure block > devices get closed because we have too many exit()s in the code today. Right. > >>> Do you need to create a free list structure that gets written out on >>> close? >> >> Yes, the same freelist that we allocate from. It's an "allocated but >> not yet referenced" list. > > Does it get written to disk? On exit or when there is no allocation activity. > >>> Just saying "we can do batching" is not solving the problem. If you >>> want to claim that the formats are equally, then in the very least, >>> you have to give a very exact description of how this would work >>> because it's not entirely straight forward. >> >> I thought I did, but I realize it is spread over multiple email >> messages. If you like, I can try to summarize it. It will be >> equally useful for qed once you add a freelist for UNMAP support. > > Yes, please consolidate so we can debate specifics. If there's a > reasonable way to fix qcow2, I'm happy to walk away from qed. 
But > we've studied the problem and do not believe there's a reasonable > approach to fixing qcow2 whereas reasonable considers the amount of > development effort, the time line required to get things right, and > the confidence we would have in the final product compared against the > one time cost of introducing a new format. I've started something and will post it soon. When considering development time, also consider the time it will take users to actually use qed (6 months for qemu release users, ~9 months on average for semiannual community distro releases, 12-18 months for enterprise distros. Consider also that we still have to support qcow2 since people do use the extra features, and since I don't see us forcing them to migrate. >>> I don't think you have any grounds to make such a statement. >> >> No, it's a forward-looking statement. But you're already looking at >> adding a freelist for UNMAP support and three levels for larger >> images. So it's safe to say that qed will not remain as nice and >> simple as it is now. > > I have a lot of faith in starting from a strong base and avoiding > making it weaker vs. starting from a weak base and trying to make it > stronger. This has led to many rewrites in the past. > > I realize it's somewhat subjective though. While qed looks like a good start, it has at least three flaws already (relying on physical image size, relying on fsck, and limited logical image size). Just fixing those will introduce complication. What about new features or newly discovered flaws? > >>>> >>>>> 4) We have looked at trying to fix qcow2. It appears to be a >>>>> monumental amount of work that starts with a rewrite where it's >>>>> unclear if we can even keep supporting all of the special >>>>> features. IOW, there is likely to be a need for users to >>>>> experience some type of image conversion or optimization process. >>>> >>>> I don't see why. >>> >>> Because you're oversimplifying what it takes to make qcow2 perform >>> well. 
>> >> Maybe. With all its complexity, it's nowhere near as close to the >> simplest filesystem. The biggest burden is the state machine design. > > Maybe I'm broken with respect to how I think, but I find state > machines very easy to rationalize. Your father's state machine. Not as clumsy or random as a thread; an elegant weapon for a more civilized age > To me, the biggest burden in qcow2 is thinking through how you deal > with shared resources. Because you can block for a long period of > time during write operations, it's not enough to just carry a mutex > during all metadata operations. You have to stage operations and > commit them at very specific points in time. The standard way of dealing with this is to have a hash table for metadata that contains a local mutex: l2cache = defaultdict(L2) def get_l2(pos): l2 = l2cache[pos] l2.mutex.lock() if not l2.valid: l2.pos = pos l2.read() l2.valid = True return l2 def put_l2(l2): if l2.dirty: l2.write() l2.dirty = False l2.mutex.unlock() Further tricks allow you to batch unrelated updates of a single L2 into one write. You can do all this with a state machine, except now you have to maintain dependency lists and manually call waiters. >>> A "naive" correct version of qcow2 does. Look at the above >>> example. If you introduce a free list, you change the format which >>> means that you couldn't support moving an image to an older version. >> >> qcow2 already has a free list, it's the refcount table. > > > Okay, qed already has a free list, it's the L1/L2 tables. > > Really, the ref count table in qcow2 is redundant. You can rebuild it > if you needed to which means you could relax the integrity associated > with it if you were willing to add an fsck process. > > But with internal snapshots, you can have a lot more metadata than > without them so fsck can be very, very expensive. It's difficult to > determine how to solve this problem. 
> > It's far easier to just avoid internal snapshots altogether and this > is exactly the thought process that led to QED. Once you drop support > for internal snapshots, you can dramatically simplify. The amount of metadata is O(nb_L2 * nb_snapshots). For qed, nb_snapshots = 1 but nb_L2 can be still quite large. If fsck is too long for one, it is too long for the other. I don't see the huge simplification. You simply iterate over all snapshots to build your free list. > >>> >>> So just for your batching example, the only compatible approach is >>> to reduce the reference count on shutdown. But there's definitely a >>> trade off because a few unclean shut downs could result in a huge >>> image. >> >> Not just on shutdown, also on guest quiesce. And yes, many unclean >> shutdowns will bloat the image size. Definitely a downside. >> >> The qed solution is to not support UNMAP or qed-on-lvm, and to >> require fsck instead. > > We can support UNMAP. Not sure why you're suggesting we can't. I meant, without doing an fsck to recover the space. It's hard for me to consider a scan of all metadata on start as something normal; with large enough disks it's simply way too slow from cold cache. Can you run an experiment? Populate a 1TB disk with fio running a random write workload over the whole range for a while. Reboot the host. How long does fsck take? > Not doing qed-on-lvm is definitely a limitation. The one use case > I've heard is qcow2 on top of clustered LVM as clustered LVM is > simpler than a clustered filesystem. I don't know the space well > enough so I need to think more about it. I don't either. If this use case survives, and if qed isn't changed to accomodate it, it means that that's another place where qed can't supplant qcow2. >>> I don't see the advantage at all. >> >> I can't parse this. You don't see the advantage of TRIM (now >> UNMAP)? You don't see the advantage of refcount tables? 
There isn't >> any, except when compared to a format with no freelist which >> therefore can't support UNMAP. > > Refcount table. See above discussion for my thoughts on refcount table. Ok. It boils down to "is fsck on startup acceptable". Without a freelist, you need fsck for both unclean shutdown and for UNMAP. > >>> 3) Another format adds choice, choice adds complexity. From my >>> perspective, QED can reduce choice long term because we can tell >>> users that unless they have a strong reason otherwise, use QED. We >>> cannot do that with qcow2 today. That may be an implementation >>> detail of qcow2, but it doesn't change the fact that there's >>> complexity in choosing an image format today. >> >> True. >> >> 4) Requires fsck on unclean shutdown > > I know it's uncool to do this in 2010, but I honestly believe it's a > reasonable approach considering the relative simplicity of our FS > compared to a normal FS. > > We're close to having fsck support so we can publish some performance > data from doing it on a reasonable large disk (like 1TB). Let's see > what that looks like before we draw too many conclusions. Great, you already have a test request queued above. > >> 5) No support for qed-on-lvm >> >> 6) limited image resize > > Not anymore than qcow2 FWIW. > > Again, with the default create parameters, we can resize up to 64TB > without rewriting metadata. I wouldn't call that limited image resize. I guess 64TB should last a bit. And if you relax the L1 size to be any number of clusters (or have three levels) you're unlimited. btw, having 256KB L2s is too large IMO. Reading them will slow down your random read throughput. Even 64K is a bit large, but there's no point making them smaller than a cluster. 
> >> 7) No support for UNMAP >> >> All are fixable, the latter with considerable changes to the format >> (allocating from an on-disk freelist requires an intermediate sync >> step; if the freelist is not on-disk, you can lose unbounded on-disk >> storage on clean shutdown). > > If you treat the on-disk free list as advisory, then you can be very > loose with writing the free list to disk. You only have to rebuild > the free list on unclean shutdown when you have to do an fsck anyway. > If you're doing an fsck, you can rebuild the free list for free. > > So really, support for UNMAP is free if you're okay with fsck. And > let's debate fsck some more when we have some proper performance data. You can decide to treat qcow2's on-disk free list as advisory if you like. No need for format change. Of course starting an unclean shutdown image from, new qemu on old qemu would cause corruption, so this has to be managed carefully. That gives you sync-free qcow2, no need for conversions. >>> But we can't realistically support users that are using those extra >>> features today anyway. >> >> Why not? > > When I say, "support users", I mean make sure that they get very good > performance and data integrity. So far, we've only talked about how > to get good performance when there have never been snapshots but I > think we also need to consider how to deal with making sure that no > matter what feature a user is using, they get consistent results. I don't think those features impact data integrity. Snapshots and encryption are just uses of read-modify-write which we already have. Not sure about compression, maybe that needs copy-on-write too. >> I don't think it's so useless. It's really only slow when >> allocating, yes? Once you've allocated it is fully async IIRC. > > It bounces all buffers still and I still think it's synchronous > (although Kevin would know better). 
(an aside: with cache!=none we're bouncing in the kernel as well; we really need to make it work for cache=none, perhaps use O_DIRECT for data and writeback for metadata and shared backing images). > >>>>> If you're willing to leak blocks on a scale that is still unknown. >>>> >>>> Who cares, those aren't real storage blocks. >>> >>> They are once you move the image from one place to another. If that >>> doesn't concern you, it really should. >> >> I don't see it as a huge problem, certainly less than fsck. If you >> think fsck is a smaller hit, you can use it to recover the space. >> >> Hm, you could have an 'unclean shutdown' bit in qcow2 and run a >> scrubber in the background if you see it set and recover the space. > > Yes, you'll want to have that regardless. But adding new things to > qcow2 has all the problems of introducing a new image format. Just some of them. On mount, rewrite the image format as qcow3. On clean shutdown, write it back to qcow2. So now there's no risk of data corruption (but there is reduced usability). >>> They are once you copy the image. And power loss is the same thing >>> as unexpected exit because you're not simply talking about delaying >>> a sync, you're talking staging future I/O operations purely within >>> QEMU. >> >> qed is susceptible to the same problem. If you have a 100MB write >> and qemu exits before it updates L2s, then those 100MB are leaked. >> You could alleviate the problem by writing L2 at intermediate points, >> but even then, a power loss can leak those 100MB. >> >> qed trades off the freelist for the file size (anything beyond the >> file size is free), it doesn't eliminate it completely. So you still >> have some of its problems, but you don't get its benefits. > > I think you've just established that qcow2 and qed both require an > fsck. I don't disagree :-) There's a difference between a background scrubber and a foreground fsck.
On 09/12/2010 08:24 AM, Avi Kivity wrote: >>> Not atexit, just when we close the image. >> >> Just a detail, but we need an atexit() handler to make sure block >> devices get closed because we have too many exit()s in the code today. > > > Right. So when you click the 'X' on the qemu window, we get to wait a few seconds for it to actually disappear because it's flushing metadata to disk.. > I've started something and will post it soon. Excellent, thank you. > When considering development time, also consider the time it will > take users to actually use qed (6 months for qemu release users, ~9 > months on average for semiannual community distro releases, 12-18 > months for enterprise distros. Consider also that we still have to > support qcow2 since people do use the extra features, and since I > don't see us forcing them to migrate. I'm of the opinion that qcow2 is unfit for production use for the type of production environments I care about. The amount of changes needed to make qcow2 fit for production use put it on at least the same timeline as you cite above. Yes, there are people today that qcow2 is appropriate but by the same respect, it will continue to be appropriate for them in the future. In my view, we don't have an image format fit for production use. You're arguing we should make qcow2 fit for production use whereas I am arguing we should start from scratch. My reasoning for starting from scratch is that it simplifies the problem. Your reasoning for improving qcow2 is simplifying the transition for non-production users of qcow2. We have an existence proof that we can achieve good data integrity and good performance by simplifying the problem. The burden still is establishing that it's possible to improve qcow2 in a reasonable amount of effort. NB, you could use qcow2 today if you had all of the data integrity fixes or didn't care about data integrity in the event of power failure or didn't care about performance. 
I don't have any customers that fit that bill so from my perspective, qcow2 isn't production fit. That doesn't mean that it's not fit for someone else's production use. >> >> I realize it's somewhat subjective though. > > While qed looks like a good start, it has at least three flaws already > (relying on physical image size, relying on fsck, and limited logical > image size). Just fixing those will introduce complication. What > about new features or newly discovered flaws? Let's quantify fsck. My suspicion is that if you've got the storage for 1TB disk images, it's fast enough that fsck can not be so bad. Keep in mind, we don't have to completely pause the guest while fsck'ing. We simply have to prevent cluster allocations. We can allow reads and we can allow writes to allocated clusters. Consequently, if you had a 1TB disk image, it's extremely likely that the vast majority of I/O is just to allocated clusters which means that fsck() is entirely a background task. The worst case scenario is actually a half-allocated disk. But since you have to boot before you can run any serious test, if it takes 5 seconds to do an fsck(), it's highly likely that it's not even noticeable. >> Maybe I'm broken with respect to how I think, but I find state >> machines very easy to rationalize. > > Your father's state machine. Not as clumsy or random as a thread; an > elegant weapon for a more civilized age I find your lack of faith in QED disturbing. >> To me, the biggest burden in qcow2 is thinking through how you deal >> with shared resources. Because you can block for a long period of >> time during write operations, it's not enough to just carry a mutex >> during all metadata operations. You have to stage operations and >> commit them at very specific points in time. 
> > The standard way of dealing with this is to have a hash table for > metadata that contains a local mutex: > > l2cache = defaultdict(L2) > > def get_l2(pos): > l2 = l2cache[pos] > l2.mutex.lock() > if not l2.valid: > l2.pos = pos > l2.read() > l2.valid = True > return l2 > > def put_l2(l2): > if l2.dirty: > l2.write() > l2.dirty = False > l2.mutex.unlock() You're missing how you create entries. That means you've got to do: def put_l2(l2): if l2.committed: if l2.dirty l2.write() l2.dirty = False l2.mutex.unlock() else: l2.mutex.lock() l2cache[l2.pos] = l2 l2.mutex.unlock() And this really illustrates my point. It's a harder problem than it seems. You also are keeping l2 reads from occurring when flushing a dirty l2 entry which is less parallel than what qed achieves today. This is part of why I prefer state machines. Acquiring a mutex is too easy and it makes it easy to not think through what all could be running. When you are more explicit about when you are allowing concurrency, I think it's easier to be more aggressive. It's a personal preference really. You can find just as many folks on the intertubes that claim Threads are Evil as claim State Machines are Evil. The only reason we're discussing this is you've claimed QEMU's state machine model is the biggest inhibitor and I think that's over simplifying things. It's like saying, QEMU's biggest problem is that too many of its developers use vi versus emacs. You may personally believe that vi is entirely superior to emacs but by the same token, you should be able to recognize that some people are able to be productive with emacs. If someone wants to rewrite qcow2 to be threaded, I'm all for it. I don't think it's really any simpler than making it a state machine. I find it hard to believe you think there's an order of magnitude difference in development work too. >> It's far easier to just avoid internal snapshots altogether and this >> is exactly the thought process that led to QED. 
Once you drop >> support for internal snapshots, you can dramatically simplify. > > The amount of metadata is O(nb_L2 * nb_snapshots). For qed, > nb_snapshots = 1 but nb_L2 can be still quite large. If fsck is too > long for one, it is too long for the other. nb_L2 is very small. It's exactly n / 2GB + 1 where n is image size. Since image size is typically < 100GB, practically speaking it's less than 50. OTOH, nb_snapshots in qcow2 can be very large. In fact, it's not unrealistic for nb_snapshots to be >> 50. What that means is that instead of metadata being O(n) as it is today, it's at least O(n^2). Doing internal snapshots right is far more complicated than qcow2 does things. > How long does fsck take? We'll find out soon. But remember, fsck() only blocks pending metadata writes so it's not entirely all up-front. >> Not doing qed-on-lvm is definitely a limitation. The one use case >> I've heard is qcow2 on top of clustered LVM as clustered LVM is >> simpler than a clustered filesystem. I don't know the space well >> enough so I need to think more about it. > > I don't either. If this use case survives, and if qed isn't changed > to accomodate it, it means that that's another place where qed can't > supplant qcow2. I'm okay with that. An image file should require a file system. If I was going to design an image file to be used on top of raw storage, I would take an entirely different approach. >> Refcount table. See above discussion for my thoughts on refcount >> table. > > Ok. It boils down to "is fsck on startup acceptable". Without a > freelist, you need fsck for both unclean shutdown and for UNMAP. To rebuild the free list on unclean shutdown. >>> 5) No support for qed-on-lvm >>> >>> 6) limited image resize >> >> Not anymore than qcow2 FWIW. >> >> Again, with the default create parameters, we can resize up to 64TB >> without rewriting metadata. I wouldn't call that limited image resize. > > I guess 64TB should last a bit. 
And if you relax the L1 size to be > any number of clusters (or have three levels) you're unlimited. > > btw, having 256KB L2s is too large IMO. Reading them will slow down > your random read throughput. Even 64K is a bit large, but there's no > point making them smaller than a cluster. This is just defaults and honestly, adding another level would be pretty trivial. > (an aside: with cache!=none we're bouncing in the kernel as well; we > really need to make it work for cache=none, perhaps use O_DIRECT for > data and writeback for metadata and shared backing images). QED achieves zero-copy with cache=none today. In fact, our performance testing that we'll publish RSN is exclusively with cache=none. >> Yes, you'll want to have that regardless. But adding new things to >> qcow2 has all the problems of introducing a new image format. > > Just some of them. On mount, rewrite the image format as qcow3. On > clean shutdown, write it back to qcow2. So now there's no risk of > data corruption (but there is reduced usability). It means on unclean shutdown, you can't move images to older versions. That means a management tool can't rely on the mobility of images which means it's a new format for all practical purposes. QED started it's life as qcow3. You start with qcow3, remove the features that are poorly thought out and make correctness hard, add some future proofing, and you're left with QED. We're fully backwards compatible with qcow2 (by virtue that qcow2 is still in tree) but new images require new versions of QEMU. That said, we have a conversion tool to convert new images to the old format if mobility is truly required. So it's the same story that you're telling above from an end-user perspective. >>>> They are once you copy the image. And power loss is the same thing >>>> as unexpected exit because you're not simply talking about delaying >>>> a sync, you're talking staging future I/O operations purely within >>>> QEMU. 
>>> >>> qed is susceptible to the same problem. If you have a 100MB write >>> and qemu exits before it updates L2s, then those 100MB are leaked. >>> You could alleviate the problem by writing L2 at intermediate >>> points, but even then, a power loss can leak those 100MB. >>> >>> qed trades off the freelist for the file size (anything beyond the >>> file size is free), it doesn't eliminate it completely. So you >>> still have some of its problems, but you don't get its benefits. >> >> I think you've just established that qcow2 and qed both require an >> fsck. I don't disagree :-) > > There's a difference between a background scrubber and a foreground fsck. The difference between qcow2 and qed is that qed relies on the file size and qcow2 uses a bitmap. The bitmap grows synchronously whereas in qed, we're not relying on synchronous file growth. If we did, there would be no need for an fsck. If you attempt to grow the refcount table in qcow2 without doing a sync(), then you're going to have to have an fsync to avoid corruption. qcow2 doesn't have an advantage, it's just not trying to be as sophisticated as qed is. Regards, Anthony Liguori
On 09/12/2010 05:13 PM, Anthony Liguori wrote: > On 09/12/2010 08:24 AM, Avi Kivity wrote: >>>> Not atexit, just when we close the image. >>> >>> Just a detail, but we need an atexit() handler to make sure block >>> devices get closed because we have too many exit()s in the code today. >> >> >> Right. > > So when you click the 'X' on the qemu window, we get to wait a few > seconds for it to actually disappear because it's flushing metadata to > disk.. If it was doing heavy write I/O, you'll need to wait a bit (a few seconds are a few hundreds of clusters worth of metadata). If it managed to flush while you were moving your mouse, no delay. > >> When considering development time, also consider the time it will >> take users to actually use qed (6 months for qemu release users, ~9 >> months on average for semiannual community distro releases, 12-18 >> months for enterprise distros. Consider also that we still have to >> support qcow2 since people do use the extra features, and since I >> don't see us forcing them to migrate. > > I'm of the opinion that qcow2 is unfit for production use for the type > of production environments I care about. The amount of changes needed > to make qcow2 fit for production use put it on at least the same > timeline as you cite above. If it's exactly the same time, we gain by having one less format. > > Yes, there are people today that qcow2 is appropriate but by the same > respect, it will continue to be appropriate for them in the future. > > In my view, we don't have an image format fit for production use. > You're arguing we should make qcow2 fit for production use whereas I > am arguing we should start from scratch. My reasoning for starting > from scratch is that it simplifies the problem. Your reasoning for > improving qcow2 is simplifying the transition for non-production users > of qcow2. > > We have an existence proof that we can achieve good data integrity and > good performance by simplifying the problem. 
The burden still is > establishing that it's possible to improve qcow2 in a reasonable > amount of effort. Agreed. >>> >>> I realize it's somewhat subjective though. >> >> While qed looks like a good start, it has at least three flaws >> already (relying on physical image size, relying on fsck, and limited >> logical image size). Just fixing those will introduce complication. >> What about new features or newly discovered flaws? > > Let's quantify fsck. My suspicion is that if you've got the storage > for 1TB disk images, it's fast enough that fsck can not be so bad. It doesn't follow. The storage is likely to be shared among many guests. The image size (or how full it is) don't really matter; startup time is the aggregate number of L2s over all images starting now, divided by the number of spindles, divided by the number of IOPS each spindle provides. Since an L2 spans a lot of logical address space, it is likely that many L2s will be allocated (in fact, it makes sense to preallocate them). > > Keep in mind, we don't have to completely pause the guest while > fsck'ing. We simply have to prevent cluster allocations. We can > allow reads and we can allow writes to allocated clusters. True. > > Consequently, if you had a 1TB disk image, it's extremely likely that > the vast majority of I/O is just to allocated clusters which means > that fsck() is entirely a background task. The worst case scenario is > actually a half-allocated disk. No, the worst case is 0.003% allocated disk, with the allocated clusters distributed uniformly. That means all your L2s are allocated, but almost none of your clusters are. > > But since you have to boot before you can run any serious test, if it > takes 5 seconds to do an fsck(), it's highly likely that it's not even > noticeable. What if it takes 300 seconds? > >>> Maybe I'm broken with respect to how I think, but I find state >>> machines very easy to rationalize. >> >> Your father's state machine. 
Not as clumsy or random as a thread; an >> elegant weapon for a more civilized age > > I find your lack of faith in QED disturbing. When 900 years old you reach, state machines you will not find so easy to understand. >>> To me, the biggest burden in qcow2 is thinking through how you deal >>> with shared resources. Because you can block for a long period of >>> time during write operations, it's not enough to just carry a mutex >>> during all metadata operations. You have to stage operations and >>> commit them at very specific points in time. >> >> The standard way of dealing with this is to have a hash table for >> metadata that contains a local mutex: >> >> l2cache = defaultdict(L2) >> >> def get_l2(pos): >> l2 = l2cache[pos] >> l2.mutex.lock() >> if not l2.valid: >> l2.pos = pos >> l2.read() >> l2.valid = True >> return l2 >> >> def put_l2(l2): >> if l2.dirty: >> l2.write() >> l2.dirty = False >> l2.mutex.unlock() > > You're missing how you create entries. That means you've got to do: > > def put_l2(l2): > if l2.committed: > if l2.dirty > l2.write() > l2.dirty = False > l2.mutex.unlock() > else: > l2.mutex.lock() > l2cache[l2.pos] = l2 > l2.mutex.unlock() The in-memory L2 is created by defaultdict(). I did omit linking L2 into L1, but that's a function call. With a state machine, it's a new string of states and calls. > > And this really illustrates my point. It's a harder problem that it > seems. You also are keeping l2 reads from occurring when flushing a > dirty l2 entry which is less parallel than what qed achieves today. There are standard threading primitives like shared/exclusive locks or barriers that can be used to increase concurrency. It's nowhere near as brittle as modifying a state machine. > > This is part of why I prefer state machines. Acquiring a mutex is too > easy and it makes it easy to not think through what all could be > running. 
When you are more explicit about when you are allowing > concurrency, I think it's easier to be more aggressive. > > It's a personal preference really. You can find just as many folks on > the intertubes that claim Threads are Evil as claim State Machines are > Evil. The dark side of the force is tempting. > The only reason we're discussing this is you've claimed QEMU's state > machine model is the biggest inhibitor and I think that's over > simplifying things. It's like saying, QEMU's biggest problem is that > too many of it's developers use vi verses emacs. You may personally > believe that vi is entirely superior to emacs but by the same token, > you should be able to recognize that some people are able to be > productive with emacs. > > If someone wants to rewrite qcow2 to be threaded, I'm all for it. I > don't think it's really any simpler than making it a state machine. I > find it hard to believe you think there's an order of magnitude > difference in development work too. Kevin is best positioned to comment on this. >>> It's far easier to just avoid internal snapshots altogether and this >>> is exactly the thought process that led to QED. Once you drop >>> support for internal snapshots, you can dramatically simplify. >> >> The amount of metadata is O(nb_L2 * nb_snapshots). For qed, >> nb_snapshots = 1 but nb_L2 can be still quite large. If fsck is too >> long for one, it is too long for the other. > > nb_L2 is very small. It's exactly n / 2GB + 1 where n is image size. > Since image size is typically < 100GB, practically speaking it's less > than 50. > > OTOH, nb_snapshots in qcow2 can be very large. In fact, it's not > unrealistic for nb_snapshots to be >> 50. What that means is that > instead of metadata being O(n) as it is today, it's at least O(n^2). Why is it n^2? It's still n*m. If your image is 4TB instead of > 100GB, the time increases by a factor of 40 for both. >>> Not doing qed-on-lvm is definitely a limitation. 
The one use case >>> I've heard is qcow2 on top of clustered LVM as clustered LVM is >>> simpler than a clustered filesystem. I don't know the space well >>> enough so I need to think more about it. >> >> I don't either. If this use case survives, and if qed isn't changed >> to accomodate it, it means that that's another place where qed can't >> supplant qcow2. > > I'm okay with that. An image file should require a file system. If I > was going to design an image file to be used on top of raw storage, I > would take an entirely different approach. That spreads our efforts further. >>> Refcount table. See above discussion for my thoughts on refcount >>> table. >> >> Ok. It boils down to "is fsck on startup acceptable". Without a >> freelist, you need fsck for both unclean shutdown and for UNMAP. > > To rebuild the free list on unclean shutdown. If you have an on-disk compact freelist, you don't need that fsck. If your freelist is the L2 table, then you need that fsck to find out if you have any holes in your image. On the other hand, allocating a cluster in qcow2 as it is now requires scanning the refcount table. Not very pretty. Kevin, how does that perform? >> (an aside: with cache!=none we're bouncing in the kernel as well; we >> really need to make it work for cache=none, perhaps use O_DIRECT for >> data and writeback for metadata and shared backing images). > > QED achieves zero-copy with cache=none today. In fact, our > performance testing that we'll publish RSN is exclusively with > cache=none. In this case, preallocation should really be cheap, since there isn't a ton of dirty data that needs to be flushed. You issue an extra flush once in a while so your truncate (or physical image size in the header) gets to disk, but that doesn't block new writes. It makes qed/lvm work, and it replaces the need to fsck for the next allocation with the need for a background scrubber to reclaim storage (you need that anyway for UNMAP). 
It makes the whole thing a lot more attractive IMO. > >>> Yes, you'll want to have that regardless. But adding new things to >>> qcow2 has all the problems of introducing a new image format. >> >> Just some of them. On mount, rewrite the image format as qcow3. On >> clean shutdown, write it back to qcow2. So now there's no risk of >> data corruption (but there is reduced usability). > > It means on unclean shutdown, you can't move images to older > versions. That means a management tool can't rely on the mobility of > images which means it's a new format for all practical purposes. > > QED started it's life as qcow3. You start with qcow3, remove the > features that are poorly thought out and make correctness hard, add > some future proofing, and you're left with QED. > > We're fully backwards compatible with qcow2 (by virtue that qcow2 is > still in tree) but new images require new versions of QEMU. That > said, we have a conversion tool to convert new images to the old > format if mobility is truly required. > > So it's the same story that you're telling above from an end-user > perspective. It's not exactly the same story (you can enable it selectively, or you can run fsck before moving) but I agree it isn't a good thing. > >>>>> They are once you copy the image. And power loss is the same >>>>> thing as unexpected exit because you're not simply talking about >>>>> delaying a sync, you're talking staging future I/O operations >>>>> purely within QEMU. >>>> >>>> qed is susceptible to the same problem. If you have a 100MB write >>>> and qemu exits before it updates L2s, then those 100MB are leaked. >>>> You could alleviate the problem by writing L2 at intermediate >>>> points, but even then, a power loss can leak those 100MB. >>>> >>>> qed trades off the freelist for the file size (anything beyond the >>>> file size is free), it doesn't eliminate it completely. So you >>>> still have some of its problems, but you don't get its benefits. 
>>> >>> I think you've just established that qcow2 and qed both require an >>> fsck. I don't disagree :-) >> >> There's a difference between a background scrubber and a foreground >> fsck. > > The difference between qcow2 and qed is that qed relies on the file > size and qcow2 uses a bitmap. > > The bitmap grows synchronously whereas in qed, we're not relying on > synchronous file growth. If we did, there would be no need for an fsck. > > If you attempt to grow the refcount table in qcow2 without doing a > sync(), then you're going to have to have an fsync to avoid corruption. > > qcow2 doesn't have an advantage, it's just not trying to be as > sophisticated as qed is. The difference is between preallocation and leaking, on one hand, and uncommitted allocation and later rebuilds, on the other. It isn't a difference between formats, but between implementations.
On 09/12/2010 10:56 AM, Avi Kivity wrote: > No, the worst case is 0.003% allocated disk, with the allocated > clusters distributed uniformly. That means all your L2s are > allocated, but almost none of your clusters are. But in this case, you're so sparse that your metadata is pretty much co-located which means seek performance won't matter much. >> >> But since you have to boot before you can run any serious test, if it >> takes 5 seconds to do an fsck(), it's highly likely that it's not >> even noticeable. > > What if it takes 300 seconds? That means for a 1TB disk you're taking 500ms per L2 entry, you're fully allocated and yet still doing an fsck. That seems awfully unlikely. >> if l2.committed: >> if l2.dirty >> l2.write() >> l2.dirty = False >> l2.mutex.unlock() >> else: >> l2.mutex.lock() >> l2cache[l2.pos] = l2 >> l2.mutex.unlock() > > The in-memory L2 is created by defaultdict(). I did omit linking L2 > into L1, by that's a function call. With a state machine, it's a new > string of states and calls. But you have to write the L2 to disk first before you link it so it's not purely in memory. >>>> It's far easier to just avoid internal snapshots altogether and >>>> this is exactly the thought process that led to QED. Once you drop >>>> support for internal snapshots, you can dramatically simplify. >>> >>> The amount of metadata is O(nb_L2 * nb_snapshots). For qed, >>> nb_snapshots = 1 but nb_L2 can be still quite large. If fsck is too >>> long for one, it is too long for the other. >> >> nb_L2 is very small. It's exactly n / 2GB + 1 where n is image >> size. Since image size is typically < 100GB, practically speaking >> it's less than 50. >> >> OTOH, nb_snapshots in qcow2 can be very large. In fact, it's not >> unrealistic for nb_snapshots to be >> 50. What that means is that >> instead of metadata being O(n) as it is today, it's at least O(n^2). > > Why is in n^2? It's still n*m. 
If your image is 4TB instead of > 100GB, the time increases by a factor of 40 for both. It's n*m but either n ~= m in which case it's n^2 or m << n, in which case, it's just n, or m >> n in which case, it's just O(m). This is where asymptotic complexity ends up not being terribly helpful :-) Let me put this another way though, if you support internal snapshots, what's a reasonable number of snapshots to expect reasonable performance with? 10? 100? 1000? 10000? >>>> Not doing qed-on-lvm is definitely a limitation. The one use case >>>> I've heard is qcow2 on top of clustered LVM as clustered LVM is >>>> simpler than a clustered filesystem. I don't know the space well >>>> enough so I need to think more about it. >>> >>> I don't either. If this use case survives, and if qed isn't changed >>> to accomodate it, it means that that's another place where qed can't >>> supplant qcow2. >> >> I'm okay with that. An image file should require a file system. If >> I was going to design an image file to be used on top of raw storage, >> I would take an entirely different approach. > > That spreads our efforts further. No. I don't think we should be in the business of designing on top of raw storage. Either assume fixed partitions, LVM, or a file system. We shouldn't reinvent the wheel at every opportunity (just the carefully chosen opportunities). >>>> Refcount table. See above discussion for my thoughts on refcount >>>> table. >>> >>> Ok. It boils down to "is fsck on startup acceptable". Without a >>> freelist, you need fsck for both unclean shutdown and for UNMAP. >> >> To rebuild the free list on unclean shutdown. > > If you have an on-disk compact freelist, you don't need that fsck. "If you have an on-disk compact [consistent] freelist, you don't need that fsck." Consistency is the key point. We go out of our way to avoid a consistent freelist in QED because it's the path to best performance. 
The key goal for a file format should be to have exactly as much consistency as required and not one bit more as consistency always means worse performance. > On the other hand, allocating a cluster in qcow2 as it is now requires > scanning the refcount table. Not very pretty. Kevin, how does that > perform? > >>> (an aside: with cache!=none we're bouncing in the kernel as well; we >>> really need to make it work for cache=none, perhaps use O_DIRECT for >>> data and writeback for metadata and shared backing images). >> >> QED achieves zero-copy with cache=none today. In fact, our >> performance testing that we'll publish RSN is exclusively with >> cache=none. > > In this case, preallocation should really be cheap, since there isn't > a ton of dirty data that needs to be flushed. You issue an extra > flush once in a while so your truncate (or physical image size in the > header) gets to disk, but that doesn't block new writes. > > It makes qed/lvm work, and it replaces the need to fsck for the next > allocation with the need for a background scrubber to reclaim storage > (you need that anyway for UNMAP). It makes the whole thing a lot more > attractive IMO. For a 1PB disk image with qcow2, the reference count table is 128GB. For a 1TB image, the reference count table is 128MB. For a 128GB image, the reference table is 16MB which is why we get away with it today. Anytime you grow the freelist with qcow2, you have to write a brand new freelist table and update the metadata synchronously to point to a new version of it. That means for a 1TB image, you're potentially writing out 128MB of data just to allocate a new cluster. s/freelist/refcount table/ to translate to current qcow2 nomenclature. This is certainly not fast. You can add a bunch of free blocks each time to mitigate the growth but I can't think of many circumstances where a 128MB write isn't going to be noticeable. And it only gets worse as time moves on because 1TB disk images are already in use today. 
NB, with a 64-bit refcount table, the size of the refcount table is almost exactly the same size as the L1/L2 table in QED. IOW, the cost of traversing the refcount table to allocate a cluster is exactly the cost of traversing all of the L1/L2 metadata to build a freelist. IOW, you're doing the equivalent of an fsck every time you open a qcow2 file today. It's very easy to neglect the details in something like qcow2. We've been talking like the refcount table is basically free to read and write but it's absolutely not. With large disk images, you're caching an awful lot of metadata to read the refcount table in fully. If you reduce the reference count table to exactly two bits, you can store that within the L1/L2 metadata since we have an extra 12 bits worth of storage space. Since you need the L1/L2 metadata anyway, we might as well just use that space as the authoritative source of the free list information. The only difference between qcow2 and qed is that since we use an on-demand table for L1/L2, our free list may be non-contiguous. Since we store virtual -> physical instead of physical->virtual, you have to do a full traversal with QED whereas with qcow2 you may get lucky. However, the fact that the reference count table is contiguous in qcow2 is a design flaw IMHO because it makes growth extremely painful with large images to the point where I'll claim that qcow2 is probably unusable by design with > 1TB disk images. We can optimize qed by having a contiguous freelist mapping physical->virtual (that's just a bitmap, and therefore considerably smaller) but making the freelist not authoritative. That makes it much faster because we don't add another sync and lets us fall back to the L1/L2 table for authoritative information if we had an unclean shutdown. It's a good compromise for performance and it validates the qed philosophy. 
By starting with a correct and performant approach that scales to large disk images, we can add features (like unmap) without sacrificing either. Regards, Anthony Liguori > >> >>>> Yes, you'll want to have that regardless. But adding new things to >>>> qcow2 has all the problems of introducing a new image format. >>> >>> Just some of them. On mount, rewrite the image format as qcow3. On >>> clean shutdown, write it back to qcow2. So now there's no risk of >>> data corruption (but there is reduced usability). >> >> It means on unclean shutdown, you can't move images to older >> versions. That means a management tool can't rely on the mobility of >> images which means it's a new format for all practical purposes. >> >> QED started it's life as qcow3. You start with qcow3, remove the >> features that are poorly thought out and make correctness hard, add >> some future proofing, and you're left with QED. >> >> We're fully backwards compatible with qcow2 (by virtue that qcow2 is >> still in tree) but new images require new versions of QEMU. That >> said, we have a conversion tool to convert new images to the old >> format if mobility is truly required. >> >> So it's the same story that you're telling above from an end-user >> perspective. > > It's not exactly the same story (you can enable it selectively, or you > can run fsck before moving) but I agree it isn't a good thing. > >> >>>>>> They are once you copy the image. And power loss is the same >>>>>> thing as unexpected exit because you're not simply talking about >>>>>> delaying a sync, you're talking staging future I/O operations >>>>>> purely within QEMU. >>>>> >>>>> qed is susceptible to the same problem. If you have a 100MB write >>>>> and qemu exits before it updates L2s, then those 100MB are >>>>> leaked. You could alleviate the problem by writing L2 at >>>>> intermediate points, but even then, a power loss can leak those >>>>> 100MB. 
>>>>> >>>>> qed trades off the freelist for the file size (anything beyond the >>>>> file size is free), it doesn't eliminate it completely. So you >>>>> still have some of its problems, but you don't get its benefits. >>>> >>>> I think you've just established that qcow2 and qed both require an >>>> fsck. I don't disagree :-) >>> >>> There's a difference between a background scrubber and a foreground >>> fsck. >> >> The difference between qcow2 and qed is that qed relies on the file >> size and qcow2 uses a bitmap. >> >> The bitmap grows synchronously whereas in qed, we're not relying on >> synchronous file growth. If we did, there would be no need for an fsck. >> >> If you attempt to grow the refcount table in qcow2 without doing a >> sync(), then you're going to have to have an fsync to avoid corruption. >> >> qcow2 doesn't have an advantage, it's just not trying to be as >> sophisticated as qed is. > > The difference is between preallocation and leaking, on one hand, and > uncommitted allocation and later rebuilds, on the other. It isn't a > difference between formats, but between implementations. >
On 09/12/2010 07:09 PM, Anthony Liguori wrote: > On 09/12/2010 10:56 AM, Avi Kivity wrote: >> No, the worst case is 0.003% allocated disk, with the allocated >> clusters distributed uniformly. That means all your L2s are >> allocated, but almost none of your clusters are. > > But in this case, you're so sparse that your metadata is pretty much > co-located which means seek performance won't matter much. You still get the rotational delay. But yes, the hit is reduced. > >>> >>> But since you have to boot before you can run any serious test, if >>> it takes 5 seconds to do an fsck(), it's highly likely that it's not >>> even noticeable. >> >> What if it takes 300 seconds? > > That means for a 1TB disk you're taking 500ms per L2 entry, you're > fully allocated and yet still doing an fsck. That seems awfully > unlikely. I meant for a fully populated L1. That's 10ms per L2. But since that's 64TB, that's unlikely too. It can still take 10s for a 2TB disk. > >>> if l2.committed: >>> if l2.dirty >>> l2.write() >>> l2.dirty = False >>> l2.mutex.unlock() >>> else: >>> l2.mutex.lock() >>> l2cache[l2.pos] = l2 >>> l2.mutex.unlock() >> >> The in-memory L2 is created by defaultdict(). I did omit linking L2 >> into L1, by that's a function call. With a state machine, it's a new >> string of states and calls. > > But you have to write the L2 to disk first before you link it so it's > not purely in memory. That's fine. Threading allows you to have blocking calls. It's slower, but very rare anyway. >> Why is in n^2? It's still n*m. If your image is 4TB instead of >> 100GB, the time increases by a factor of 40 for both. > > It's n*m but either n ~= m in which case it's n^2 or m << n, in which > case, it's just n, or m >> n in which case, it's just O(m). 
> > This is where asymptotic complexity ends up not being terribly helpful > :-) > > Let me put this another way though, if you support internal snapshots, > what's a reasonable number of snapshots to expect reasonable > performance with? 10? 100? 1000? 10000? I'd say 10. Not that I really want to support internal snapshots, it doesn't work well with multiple disks. >>> I'm okay with that. An image file should require a file system. If >>> I was going to design an image file to be used on top of raw >>> storage, I would take an entirely different approach. >> >> That spreads our efforts further. > > No. I don't think we should be in the business of designing on top of > raw storage. Either assume fixed partitions, LVM, or a file system. > We shouldn't reinvent the wheel at every opportunity (just the > carefully chosen opportunities). I agree, but in this case there was no choice. > >>>>> Refcount table. See above discussion for my thoughts on refcount >>>>> table. >>>> >>>> Ok. It boils down to "is fsck on startup acceptable". Without a >>>> freelist, you need fsck for both unclean shutdown and for UNMAP. >>> >>> To rebuild the free list on unclean shutdown. >> >> If you have an on-disk compact freelist, you don't need that fsck. > > "If you have an on-disk compact [consistent] freelist, you don't need > that fsck." > > Consistency is the key point. We go out of our way to avoid a > consistent freelist in QED because it's the path to best performance. > The key goal for a file format should be to have exactly as much > consistency as required and not one bit more as consistency always > means worse performance. Preallocation lets you have a consistent (or at least conservative) free list, with just a bit of extra consistency. If you piggy back preallocation on guest syncs, you don't even pay for that. On the other hand, linear L2 (which now become L1) means your fsck is just a linear scan of the table, which is probably faster than qcow2 allocation... 
>> >>>> (an aside: with cache!=none we're bouncing in the kernel as well; >>>> we really need to make it work for cache=none, perhaps use O_DIRECT >>>> for data and writeback for metadata and shared backing images). >>> >>> QED achieves zero-copy with cache=none today. In fact, our >>> performance testing that we'll publish RSN is exclusively with >>> cache=none. >> >> In this case, preallocation should really be cheap, since there isn't >> a ton of dirty data that needs to be flushed. You issue an extra >> flush once in a while so your truncate (or physical image size in the >> header) gets to disk, but that doesn't block new writes. >> >> It makes qed/lvm work, and it replaces the need to fsck for the next >> allocation with the need for a background scrubber to reclaim storage >> (you need that anyway for UNMAP). It makes the whole thing a lot >> more attractive IMO. > > > For a 1PB disk image with qcow2, the reference count table is 128GB. > For a 1TB image, the reference count table is 128MB. For a 128GB > image, the reference table is 16MB which is why we get away with it > today. > > Anytime you grow the freelist with qcow2, you have to write a brand > new freelist table and update the metadata synchronously to point to a > new version of it. That means for a 1TB image, you're potentially > writing out 128MB of data just to allocate a new cluster. > > s/freelist/refcount table/ to translate to current qcow2 > nomenclature. This is certainly not fast. You can add a bunch of > free blocks each time you mitigate the growth but I can't of many > circumstances where a 128MB write isn't going to be noticeable. And > it only gets worse as time moves on because 1TB disk images are > already in use today. > That's a strong point. qcow2 doubles on each allocation, it amortizes, but the delay is certainly going to be noticable. You can do it ahead of time (so guest writes don't need to wait) but it's still expensive. 
> NB, with a 64-bit refcount table, the size of the refcount table is > almost exactly the same size as the L1/L2 table in QED. IOW, the cost > of transversing the refcount table to allocate a cluster is exactly > the cost of transversing all of the L1/L2 metadata to build a > freelist. IOW, you're doing the equivalent of an fsck everytime you > open a qcow2 file today. No, L2 is O(logical size), refcount is O(physical size). > It's very easy to neglect the details in something like qcow2. We've > been talking like the refcount table is basically free to read and > write but it's absolutely not. With large disk images, you're caching > an awful lot of metadata to read the refcount table in fully. > > If you reduce the reference count table to exactly two bits, you can > store that within the L1/L2 metadata since we have an extra 12 bits > worth of storage space. Since you need the L1/L2 metadata anyway, we > might as well just use that space as the authoritative source of the > free list information. > > The only difference between qcow2 and qed is that since we use an > on-demand table for L1/L2, our free list may be non-contiguous. Since > we store virtual -> physical instead of physical->virtual, you have to > do a full transversal with QED whereas with qcow2 you may get lucky. > However, the fact that the reference count table is contiguous in > qcow2 is a design flaw IMHO because it makes growth extremely painful > with large images to the point where I'll claim that qcow2 is probably > unusable by design with > 1TB disk images. If you grow it in the background, it should be usable; since it happens once every 1TB worth of writes, it's not such a huge load. I'll agree this is increasing complexity. > We can optimize qed by having a contiguous freelist mapping > physical->virtual (that's just a bitmap, and therefore considerably > smaller) but making the freelist not authoritative. 
That makes it > much faster because we don't add another sync and lets us fall back to > the L1/L2 table for authoritative information if we had an unclean > shutdown. > > It's a good compromise for performance and it validates the qed > philosophy. By starting with a correct and performant approach that > scales to large disk images, we can add features (like unmap) without > sacrificing either. How would you implement the bitmap as a compatible feature?
On 09/12/2010 12:51 PM, Avi Kivity wrote: > On 09/12/2010 07:09 PM, Anthony Liguori wrote: >> On 09/12/2010 10:56 AM, Avi Kivity wrote: >>> No, the worst case is 0.003% allocated disk, with the allocated >>> clusters distributed uniformly. That means all your L2s are >>> allocated, but almost none of your clusters are. >> >> But in this case, you're so sparse that your metadata is pretty much >> co-located which means seek performance won't matter much. > > You still get the rotational delay. But yes, the hit is reduced. > >> >>>> >>>> But since you have to boot before you can run any serious test, if >>>> it takes 5 seconds to do an fsck(), it's highly likely that it's >>>> not even noticeable. >>> >>> What if it takes 300 seconds? >> >> That means for a 1TB disk you're taking 500ms per L2 entry, you're >> fully allocated and yet still doing an fsck. That seems awfully >> unlikely. > > I meant for a fully populated L1. That's 10ms per L2. Your math is off. A single L2 entry covers 2GB worth of logical space. That means a 1TB image consists of 512 L2s. 300 / 512 == .585 which is 585ms. That's a fully populated L1 on a 1TB image. > But since that's 64TB, that's unlikely too. It can still take 10s for > a 2TB disk. Ah, you're talking about a 64TB image. Recall that we can read L2s in parallel. I have trouble imaging that we'd get serialized performance with a 64TB backing store. It's much more likely you've got more than one spindle in this scenario. >>> Why is in n^2? It's still n*m. If your image is 4TB instead of >>> 100GB, the time increases by a factor of 40 for both. >> >> It's n*m but either n ~= m in which case it's n^2 or m << n, in which >> case, it's just n, or m >> n in which case, it's just O(m). >> >> This is where asymptotic complexity ends up not being terribly >> helpful :-) >> >> Let me put this another way though, if you support internal >> snapshots, what's a reasonable number of snapshots to expect >> reasonable performance with? 10? 100? 1000? 
10000? > > I'd say 10. Not that I really want to support internal snapshots, it > doesn't work well with multiple disks. I don't think that's reasonable. The folks that I've talked to about snapshots seem to want to do crazy things like use it for checkpointing. TBH, I think they're looking for the ability to do thousands of checkpoints with an efficient way to release old checkpoints. I imagine that's the design point things like btrfs are trying to achieve. > On the other hand, linear L2 (which now become L1) means your fsck is > just a linear scan of the table, which is probably faster than qcow2 > allocation... And this is just a data layout optimization which is the sort of thing that we should let performance data drive. >> For a 1PB disk image with qcow2, the reference count table is 128GB. >> For a 1TB image, the reference count table is 128MB. For a 128GB >> image, the reference table is 16MB which is why we get away with it >> today. >> >> Anytime you grow the freelist with qcow2, you have to write a brand >> new freelist table and update the metadata synchronously to point to >> a new version of it. That means for a 1TB image, you're potentially >> writing out 128MB of data just to allocate a new cluster. >> >> s/freelist/refcount table/ to translate to current qcow2 >> nomenclature. This is certainly not fast. You can add a bunch of >> free blocks each time you mitigate the growth but I can't of many >> circumstances where a 128MB write isn't going to be noticeable. And >> it only gets worse as time moves on because 1TB disk images are >> already in use today. >> > > That's a strong point. qcow2 doubles on each allocation, it > amortizes, but the delay is certainly going to be noticable. > > You can do it ahead of time (so guest writes don't need to wait) but > it's still expensive. The trouble is, safe growth of the reference count table is hard because it's contiguous. 
That means you need to copy the table to another location all at once instead of just creating a new L1 table and reusing most of the existing L2 entries. It's a damning flaw in the format for large images. You can preallocate the whole thing up front to try to avoid the cost at run time but even then, that's a huge cost to pay in disk space up front. >> It's very easy to neglect the details in something like qcow2. We've >> been talking like the refcount table is basically free to read and >> write but it's absolutely not. With large disk images, you're >> caching an awful lot of metadata to read the refcount table in fully. >> >> If you reduce the reference count table to exactly two bits, you can >> store that within the L1/L2 metadata since we have an extra 12 bits >> worth of storage space. Since you need the L1/L2 metadata anyway, we >> might as well just use that space as the authoritative source of the >> free list information. >> >> The only difference between qcow2 and qed is that since we use an >> on-demand table for L1/L2, our free list may be non-contiguous. >> Since we store virtual -> physical instead of physical->virtual, you >> have to do a full transversal with QED whereas with qcow2 you may get >> lucky. However, the fact that the reference count table is >> contiguous in qcow2 is a design flaw IMHO because it makes growth >> extremely painful with large images to the point where I'll claim >> that qcow2 is probably unusable by design with > 1TB disk images. > > If you grow it in the background, it should be usable; since it > happens once every 1TB worth of writes, it's not such a huge load. > I'll agree this is increasing complexity. Trouble is, the reference count table is your authoritative source of whether something is free. You can grow it in the background but if you need to allocate clusters before your done growing, you have to stall the request. Those stalls can get very long on large disk images. 
>> We can optimize qed by having a contiguous freelist mapping >> physical->virtual (that's just a bitmap, and therefore considerably >> smaller) but making the freelist not authoritative. That makes it >> much faster because we don't add another sync and lets us fall back >> to the L1/L2 table for authoritative information if we had an unclean >> shutdown. >> >> It's a good compromise for performance and it validates the qed >> philosophy. By starting with a correct and performant approach that >> scales to large disk images, we can add features (like unmap) without >> sacrificing either. > > How would you implement the bitmap as a compatible feature? First, a refresher on the consistency model. Metadata is not sync'd to disk when initially written because we try to avoid having stronger metadata consistency than data consistency. We are forced to sync to enforce ordering when updating L1 with a new L2 but that's rare. If a guest attempts to sync data, we sync metadata too. Because we may have unsync'd metadata, all L1 and L2 entries need to be scanned to try to find unreachable entries based on the start up file size *before* allocating any new clusters upon start-up (noting that we can read and rewrite existing clusters). This is a correct design and performant; this is where we start from. The start up cost is undesirable, so to reduce the need to scan all entries up front, we add a feature that introduces a meta-data clean flag in the header. The meta-data clean flag tells an implementation that there were no cluster allocations since the last sync() which means cluster allocation can now happen without searching for and correcting unreachable entries. An implementation can now set the meta-data clean flag right before any sync() operation and unset the flag before the first cluster allocation (being careful to sync the unsetting of the flag). This eliminates a lot of unnecessary scans in the case of safe shut down. 
We can also set a timer after unsetting the meta-data clean flag to sync() after, say, 5 minutes. This further decreases the number of times we have to scan so that the fsck() window only happens when power failure occurs within 5 minutes of the last cluster allocation. N.B., this flag is purely an optional feature that is backwards compatible. If an implementation ignores the meta-data clean flag, it just always scans for unreachable entries at startup. This is still a correct design and we've eliminated the vast majority of start up scans. The freelist would also be an optional feature. If the freelist pointer is non-zero in the header, it means that an implementation can find free blocks by reading the location of the freelist pointer in the header. We maintain the freelist in memory and we write it out right before any sync() operation. Adding to the freelist in memory (i.e. UNMAP) would clear the freelist pointer on disk. We could use this as an opportunity to schedule a future sync() just as above. We would actually write free list changes to disk while the freelist pointer was set to 0 such that when we did a sync(), we were just marking the freelist valid again. An implementation that doesn't know about free list has to do a full scan to determine free blocks. That said, an implementation can also just ignore free blocks entirely and always allocate from the end. The freelist would be a cluster that also happens to be a free'd entry. It would then contain a list of free clusters and would look something like: struct freelist { uint64_t next_cluster; /* zero to represent EOL */ uint64_t num_entries; uint64_t entry[num_entries]; /* cluster offsets of free blocks */ }; It's important to not represent the full list in a contiguous region for the reasons discussed in other notes re: refcount table. This remains a completely correct implementation. We add no additional syncs above what we already have and we write out the freelist changes on demand. 
The only window that would require a rebuild of the freelist would be a hard power failure within 5 minutes of the last unmap. One gotcha is that the freelist is double metadata which means that freeing a block requires an ordered write with respect to the L2 table update. Otherwise, you could allocate something off of the freelist that an L2 entry still pointed to. So there's a sync() in the UNMAP path, but that's unavoidable without a lot of cleverness. You could potentially delay the sync until you reallocate the block but it's not clear that UNMAP needs to be fast (yet). Regards, Anthony Liguori
On 09/12/2010 10:18 PM, Anthony Liguori wrote: >> >>> >>>>> >>>>> But since you have to boot before you can run any serious test, if >>>>> it takes 5 seconds to do an fsck(), it's highly likely that it's >>>>> not even noticeable. >>>> >>>> What if it takes 300 seconds? >>> >>> That means for a 1TB disk you're taking 500ms per L2 entry, you're >>> fully allocated and yet still doing an fsck. That seems awfully >>> unlikely. >> >> I meant for a fully populated L1. That's 10ms per L2. > > > Your math is off. A single L2 entry covers 2GB worth of logical > space. That means a 1TB image consists of 512 L2s. 300 / 512 == .585 > which is 585ms. > > That's a fully populated L1 on a 1TB image. I meant fully populated L1, all 32K entries. Not the 1TB disk. > >> But since that's 64TB, that's unlikely too. It can still take 10s >> for a 2TB disk. > > Ah, you're talking about a 64TB image. Recall that we can read L2s in > parallel. I have trouble imaging that we'd get serialized performance > with a 64TB backing store. It's much more likely you've got more than > one spindle in this scenario. When you have more than one spindle, you also have more than one guest. If they're all fscking at the same time, it actually gets slower. > >>>> Why is in n^2? It's still n*m. If your image is 4TB instead of >>>> 100GB, the time increases by a factor of 40 for both. >>> >>> It's n*m but either n ~= m in which case it's n^2 or m << n, in >>> which case, it's just n, or m >> n in which case, it's just O(m). >>> >>> This is where asymptotic complexity ends up not being terribly >>> helpful :-) >>> >>> Let me put this another way though, if you support internal >>> snapshots, what's a reasonable number of snapshots to expect >>> reasonable performance with? 10? 100? 1000? 10000? >> >> I'd say 10. Not that I really want to support internal snapshots, it >> doesn't work well with multiple disks. > > I don't think that's reasonable. 
The folks that I've talked to about > snapshots seem to want to do crazy things like use it for > checkpointing. TBH, I think they're looking for the ability to do > thousands of checkpoints with an efficient way to release old > checkpoints. > > I imagine that's the design point things like btrfs are trying to > achieve. Well, qcow2 isn't going to get anywhere close to that. For that many you basically have to have extents and back references, or your metadata explodes. > >> On the other hand, linear L2 (which now become L1) means your fsck is >> just a linear scan of the table, which is probably faster than qcow2 >> allocation... > > And this is just a data layout optimization which is the sort of thing > that we should let performance data drive. Well, if you drop L1, the code becomes simpler. > >>> For a 1PB disk image with qcow2, the reference count table is >>> 128GB. For a 1TB image, the reference count table is 128MB. For a >>> 128GB image, the reference table is 16MB which is why we get away >>> with it today. >>> >>> Anytime you grow the freelist with qcow2, you have to write a brand >>> new freelist table and update the metadata synchronously to point to >>> a new version of it. That means for a 1TB image, you're potentially >>> writing out 128MB of data just to allocate a new cluster. >>> >>> s/freelist/refcount table/ to translate to current qcow2 >>> nomenclature. This is certainly not fast. You can add a bunch of >>> free blocks each time you mitigate the growth but I can't of many >>> circumstances where a 128MB write isn't going to be noticeable. And >>> it only gets worse as time moves on because 1TB disk images are >>> already in use today. >>> >> >> That's a strong point. qcow2 doubles on each allocation, it >> amortizes, but the delay is certainly going to be noticable. >> >> You can do it ahead of time (so guest writes don't need to wait) but >> it's still expensive. 
> > The trouble is, safe growth of the reference count table is hard > because it's contiguous. That means you need to copy the table to > another location all at once instead of just creating a new L1 table > and reusing most of the existing L2 entries. You can use the same technique as streaming. You initiate a copy in the background. Updates to the region already copied go to both copies, updates to the region not yet copied go to the original copy. > > It's a damning flaw in the format for large images. You can > preallocate the whole thing up front to try to avoid the cost at run > time but even then, that's a huge cost to pay in disk space up front. I'm more or less convinced it qcow2 is unusable for large images without incompatible format changes. >>> It's very easy to neglect the details in something like qcow2. >>> We've been talking like the refcount table is basically free to read >>> and write but it's absolutely not. With large disk images, you're >>> caching an awful lot of metadata to read the refcount table in fully. >>> >>> If you reduce the reference count table to exactly two bits, you can >>> store that within the L1/L2 metadata since we have an extra 12 bits >>> worth of storage space. Since you need the L1/L2 metadata anyway, >>> we might as well just use that space as the authoritative source of >>> the free list information. >>> >>> The only difference between qcow2 and qed is that since we use an >>> on-demand table for L1/L2, our free list may be non-contiguous. >>> Since we store virtual -> physical instead of physical->virtual, you >>> have to do a full transversal with QED whereas with qcow2 you may >>> get lucky. However, the fact that the reference count table is >>> contiguous in qcow2 is a design flaw IMHO because it makes growth >>> extremely painful with large images to the point where I'll claim >>> that qcow2 is probably unusable by design with > 1TB disk images. 
>> >> If you grow it in the background, it should be usable; since it >> happens once every 1TB worth of writes, it's not such a huge load. >> I'll agree this is increasing complexity. > > Trouble is, the reference count table is your authoritative source of > whether something is free. You can grow it in the background but if > you need to allocate clusters before your done growing, you have to > stall the request. Those stalls can get very long on large disk images. No, you update both copies. It's hardly fun though. > >>> We can optimize qed by having a contiguous freelist mapping >>> physical->virtual (that's just a bitmap, and therefore considerably >>> smaller) but making the freelist not authoritative. That makes it >>> much faster because we don't add another sync and let's us fallback >>> to the L1/L2 table for authoritative information if we had an >>> unclean shutdown. >>> >>> It's a good compromise for performance and it validates the qed >>> philosophy. By starting with a correct and performant approach that >>> scales to large disk images, we can add features (like unmap) >>> without sacrificing either. >> >> How would you implement the bitmap as a compatible feature? > > First, a refresher on the consistency model. > > Metadata is not sync'd to disk when initially written because we try > to avoid having stronger metadata consistency than data consistency. > We are forced to sync to enforce ordering when updating L1 with a new > L2 but that's rare. If a guest attempts to sync data, we sync > metadata too. > > Because we may have unsync'd metadata, all L1 and L2 entries need to > be scanned to try to find unreachable entries based on the start up > file size *before* allocating any new clusters upon start-up (noting > that we can read and rewrite existing clusters). > > This is a correct design and performant this is where we start from. 
> > The start up cost is undesirable, so to reduce the need to scan all > entries up front, we add a feature that introduces a meta-data clean > flag in the header. The meta-data clean flag tells an implementation > that there were no cluster allocations since the last sync() which > means cluster allocation can now happen without searching for and > correcting unreachable entries. I don't think you can add it as a compatible feature. Implementations without the feature will load a clean image and dirty it without clearing the flag. So it has to be in from day 1. > > An implementation can now set the meta-data clean flag right before > any sync() operation What happens if the clean flag reaches the disk before a metadata write (or truncate) makes it? That's a false-clean which can lead us to avoid a scan which would be needed. > and unset the flag before the first cluster allocation (being careful > to sync the unsetting of the flag). This eliminates a lot of > unnecessary scans in the case of safe shut down. Why do you need to sync clearing the flag? Worst case it is lost and you have a false-dirty, leading to an unnecessary scan. > > We can also set a timer after unsetting the meta-data clean flag to > sync() after, say, 5 minutes. This further decreases the number of > times we have to scan so that the fsck() window only happens when > power failure occurs within 5 minutes of the last cluster allocation. > > N.B., this flag is purely an optional feature that is backwards > compatible. If an implementation ignores the meta-data clean flag, it > just always scans for unreachable entries at startup. No, you can get a false-clean by an old qemu crashing. I think this sort of stuff can be done using a mount count. Every mount increments the mount count. "clean" sets a clean counter equal to the mount count. "dirty" sets the clean counter to 0. You could do this for any number of recoverable metadata structures (like a dirty bitmap). 
> > This is still a correct design and we've eliminated the vast majority > of start up scans. > > The freelist would also be an optional feature. If the freelist > pointer is non-zero in the header, it means that an implementation can > find free blocks by reading the location of the freelist pointer in > the header. The same issue - old qemu mounts, updates disk but ignores freelist, new qemu mounts and uses stale freelist. A mount count fixes this (an alternative to a mount count is a new flags field with 1 bit per optional feature, any unrecognized compatible feature is cleared in the flags field, letting a new implementation detect that an old qemu saw and ignored the feature). > We maintain the freelist in memory and we write it out right before > any sync() operation. Adding to the freelist in memory (i.e. UNMAP) > would clear the freelist pointer on disk. We could use this as an > opportunity to schedule a future sync() just like as above. > > We would actually write free list changes to disk while the freelist > pointer was set to 0 such that when we did a sync(), we were just > marking the freelist valid again. > > An implementation that doesn't know about free list has to do a full > scan to determine free blocks. That said, an implementation can also > just ignore free blocks entirely and always allocate from the end. > > The freelist would be a cluster that also happens to be an free'd > entry. It would then contain a list of free clusters and would look > something like: > > struct freelist > { > uint64_t next_cluster; /* zero to represent EOL */ > uint64_t num_entries; > uint64_t entry[num_entries]; /* cluster offsets of free blocks */ > }; A bitmap would work better vs fragmentation - the worst case for this structure is pretty bad. For 64k clusters, each 1TB needs just 2MB of space, which is easy to scan and even reasonable to hold in memory. 
> > It's important to not represent the full list in a contiguous region > for the reasons discussed in other notes re: refcount table. Background copying can work around the issue, but yes. > > This remains a completely correct implementation. We add no > additional syncs above what we already have and we write out the > freelist changes on demand. The only window that would require a > rebuild of the freelist would be a hard power failure 5 minutes since > the last unmap. After the last allocation; unmap only results in a leak (which can be recovered in the background). > > One gotcha is that the freelist is double metadata which means that > freeing a block requires an ordered write with respect to the L2 table > update. Otherwise, you could allocate something off of the freelist > that an L2 entry still pointed to. > > So there's a sync() in the UNMAP path, but that's unavoidable without > a lot of cleverness. You could potentially delay the sync until you > reallocate the block but it's not clear that UNMAP needs to be fast > (yet). I don't see why you need the sync. If the freelist update is lost, you've leaked a block which you can recover with a rebuild. If the L2 update is lost, you have the freelist pointing into a used cluster, but you can recover using fsck. If both are lost, nothing happened.
Am 10.09.2010 21:33, schrieb Anthony Liguori: > On 09/10/2010 12:42 PM, Kevin Wolf wrote: >>> It bounces all buffers still and I still think it's synchronous >>> (although Kevin would know better). >>> >> Yes, it does bounce the buffers, though I'm looking into this anyway >> because you raised concerns about unbounded allocations. (And it has >> been on my todo list for a while, but there were always more urgent things) >> >> What's synchronous in qcow2 is metadata access and COW. The guest >> requests themselves are handled asynchronously. > > I think we should differentiate between serialized requests and > synchronous requests because I got a bit sloppy in my nomenclature. > > Metadata reads and writes along with COW operations are synchronous in > qcow2 IIUC. The data read/write is asynchronous. > > Metadata is cached and if a cache hit is taken, then a full request can > be asynchronous. > > It's not clear to me if qcow2 can handle parallel requests though > assuming that both requests don't require a cache miss for meta data. Without a cache miss, qcow2_aio_readv/writev is a lookup in an array (the L2 table) and then directly translated into a raw_aio_readv/writev. So I don't see any reason why it shouldn't handle requests in parallel. > Does any meta data cache miss (read or write) cause a stall/flush of the > request queue? Metadata reads mean that we do a synchronous bdrv_pread and block the caller for this time. I think the caller is either the device model or the processing of AIO callbacks. So I guess effectively this is what you called a stall. Kevin
Am 12.09.2010 17:56, schrieb Avi Kivity: >>>> To me, the biggest burden in qcow2 is thinking through how you deal >>>> with shared resources. Because you can block for a long period of >>>> time during write operations, it's not enough to just carry a mutex >>>> during all metadata operations. You have to stage operations and >>>> commit them at very specific points in time. >>> >>> The standard way of dealing with this is to have a hash table for >>> metadata that contains a local mutex: >>> >>> l2cache = defaultdict(L2) >>> >>> def get_l2(pos): >>> l2 = l2cache[pos] >>> l2.mutex.lock() >>> if not l2.valid: >>> l2.pos = pos >>> l2.read() >>> l2.valid = True >>> return l2 >>> >>> def put_l2(l2): >>> if l2.dirty: >>> l2.write() >>> l2.dirty = False >>> l2.mutex.unlock() >> >> You're missing how you create entries. That means you've got to do: >> >> def put_l2(l2): >> if l2.committed: >> if l2.dirty >> l2.write() >> l2.dirty = False >> l2.mutex.unlock() >> else: >> l2.mutex.lock() >> l2cache[l2.pos] = l2 >> l2.mutex.unlock() > > The in-memory L2 is created by defaultdict(). I did omit linking L2 > into L1, by that's a function call. With a state machine, it's a new > string of states and calls. > >> >> And this really illustrates my point. It's a harder problem that it >> seems. You also are keeping l2 reads from occurring when flushing a >> dirty l2 entry which is less parallel than what qed achieves today. > > There are standard threading primitives like shared/exclusive locks or > barriers that can be used to increase concurrency. It's nowhere near as > brittle as modifying a state machine. > >> >> This is part of why I prefer state machines. Acquiring a mutex is too >> easy and it makes it easy to not think through what all could be >> running. When you are more explicit about when you are allowing >> concurrency, I think it's easier to be more aggressive. >> >> It's a personal preference really. 
You can find just as many folks on >> the intertubes that claim Threads are Evil as claim State Machines are >> Evil. > > The dark side of the force is tempting. > >> The only reason we're discussing this is you've claimed QEMU's state >> machine model is the biggest inhibitor and I think that's over >> simplifying things. It's like saying, QEMU's biggest problem is that >> too many of it's developers use vi verses emacs. You may personally >> believe that vi is entirely superior to emacs but by the same token, >> you should be able to recognize that some people are able to be >> productive with emacs. >> >> If someone wants to rewrite qcow2 to be threaded, I'm all for it. I >> don't think it's really any simpler than making it a state machine. I >> find it hard to believe you think there's an order of magnitude >> difference in development work too. > > Kevin is best positioned to comment on this. Not sure. Anthony claims there's not much difference in how hard it is to implement either way. I'm not going to contradict because I haven't thought a lot about the complicating details this would involve, but in any case I don't think a state machine would be easier to implement than a threaded model either. However, even if we assume that it's the same amount of work, the result of that work is a different one. The state machine model would tend to clutter up the code even more than it already is, impacting readability. >>>> Refcount table. See above discussion for my thoughts on refcount >>>> table. >>> >>> Ok. It boils down to "is fsck on startup acceptable". Without a >>> freelist, you need fsck for both unclean shutdown and for UNMAP. >> >> To rebuild the free list on unclean shutdown. > > If you have an on-disk compact freelist, you don't need that fsck. If > your freelist is the L2 table, then you need that fsck to find out if > you have any holes in your image. 
> > On the other hand, allocating a cluster in qcow2 as it is now requires > scanning the refcount table. Not very pretty. Kevin, how does that > perform? Probably not very well in the worst case (which is: free a cluster at the start of a fully allocated image, allocate two clusters). We probably get away with it today because typical image sizes have only a couple of refcount blocks and because we don't free clusters a lot anyway. If QED can manage a free list in memory, though, there's no reason why the same shouldn't work qcow2. >>>> Yes, you'll want to have that regardless. But adding new things to >>>> qcow2 has all the problems of introducing a new image format. >>> >>> Just some of them. On mount, rewrite the image format as qcow3. On >>> clean shutdown, write it back to qcow2. So now there's no risk of >>> data corruption (but there is reduced usability). >> >> It means on unclean shutdown, you can't move images to older >> versions. That means a management tool can't rely on the mobility of >> images which means it's a new format for all practical purposes. >> >> QED started it's life as qcow3. You start with qcow3, remove the >> features that are poorly thought out and make correctness hard, add >> some future proofing, and you're left with QED. >> >> We're fully backwards compatible with qcow2 (by virtue that qcow2 is >> still in tree) but new images require new versions of QEMU. That >> said, we have a conversion tool to convert new images to the old >> format if mobility is truly required. >> >> So it's the same story that you're telling above from an end-user >> perspective. > > It's not exactly the same story (you can enable it selectively, or you > can run fsck before moving) but I agree it isn't a good thing. The real reason why it's not the same story is that a qcow3 would be backwards compatible. Old images would just work as qcow3 by changing the version number in the header. Even if they are on a block device. Even if they are encrypted. 
Even if they are compressed. Even if they have internal snapshots. We would have just one driver instead of two, and we could tell people the upgrade their images and they didn't have to care about feature loss. Kevin
Am 12.09.2010 19:09, schrieb Anthony Liguori:
> For a 1PB disk image with qcow2, the reference count table is 128GB.
> For a 1TB image, the reference count table is 128MB. For a 128GB
> image, the reference table is 16MB which is why we get away with it today.

This is physical size. If you have a 1 PB disk, you're probably okay
with using 128 GB of it for metadata (and I think it's less than that,
see below)

> Anytime you grow the freelist with qcow2, you have to write a brand new
> freelist table and update the metadata synchronously to point to a new
> version of it. That means for a 1TB image, you're potentially writing
> out 128MB of data just to allocate a new cluster.

No. qcow2 has two-level tables.

File size: 1 TB
Number of clusters: 1 TB / 64 kB = 16 M
Number of refcount blocks: (16 M * 2 B) / 64kB = 512
Total size of all refcount blocks: 512 * 64kB = 32 MB
Size of refcount table: 512 * 8 B = 4 kB

When we grow an image file, the refcount blocks can stay where they are,
only the refcount table needs to be rewritten. So we have to copy a
total of 4 kB for growing the image file when it's 1 TB in size (all
assuming 64k clusters).

The other result of this calculation is that we need to grow the
refcount table each time we cross a 16 TB boundary. So additionally to
being a small amount of data, it doesn't happen in practice anyway.

Kevin
On 09/13/2010 01:28 PM, Kevin Wolf wrote: > >> Anytime you grow the freelist with qcow2, you have to write a brand new >> freelist table and update the metadata synchronously to point to a new >> version of it. That means for a 1TB image, you're potentially writing >> out 128MB of data just to allocate a new cluster. > No. qcow2 has two-level tables. > > File size: 1 TB > Number of clusters: 1 TB / 64 kB = 16 M > Number of refcount blocks: (16 M * 2 B) / 64kB = 512 > Total size of all refcount blocks: 512 * 64kB = 32 MB > Size of recount table: 512 * 8 B = 4 kB > > When we grow an image file, the refcount blocks can stay where they are, > only the refcount table needs to be rewritten. So we have to copy a > total of 4 kB for growing the image file when it's 1 TB in size (all > assuming 64k clusters). > > The other result of this calculation is that we need to grow the > refcount table each time we cross a 16 TB boundary. So additionally to > being a small amount of data, it doesn't happen in practice anyway. > > Interesting, I misremembered it as 8 bytes per cluster, not 2. So it's actually fairly dense (though still not as dense as a bitmap).
Am 13.09.2010 13:34, schrieb Avi Kivity: > On 09/13/2010 01:28 PM, Kevin Wolf wrote: >> >>> Anytime you grow the freelist with qcow2, you have to write a brand new >>> freelist table and update the metadata synchronously to point to a new >>> version of it. That means for a 1TB image, you're potentially writing >>> out 128MB of data just to allocate a new cluster. >> No. qcow2 has two-level tables. >> >> File size: 1 TB >> Number of clusters: 1 TB / 64 kB = 16 M >> Number of refcount blocks: (16 M * 2 B) / 64kB = 512 >> Total size of all refcount blocks: 512 * 64kB = 32 MB >> Size of recount table: 512 * 8 B = 4 kB >> >> When we grow an image file, the refcount blocks can stay where they are, >> only the refcount table needs to be rewritten. So we have to copy a >> total of 4 kB for growing the image file when it's 1 TB in size (all >> assuming 64k clusters). >> >> The other result of this calculation is that we need to grow the >> refcount table each time we cross a 16 TB boundary. So additionally to >> being a small amount of data, it doesn't happen in practice anyway. > > Interesting, I misremembered it as 8 bytes per cluster, not 2. So it's > actually fairly dense (though still not as dense as a bitmap). Yes, refcounts are 16 bit. Just checked it with the code once again to be 100% sure. But if it was only that, it would be just a small factor. The important part is that it's a two-level structure, so Anthony's numbers are completely off. Kevin
On 09/13/2010 06:03 AM, Kevin Wolf wrote:
>
> The real reason why it's not the same story is that a qcow3 would be
> backwards compatible. Old images would just work as qcow3 by changing
> the version number in the header. Even if they are on a block device.
> Even if they are encrypted. Even if they are compressed. Even if they
> have internal snapshots.
>

I think that's really the point of QED. I think the features of qcow2
make it prohibitively difficult to achieve correctness and good performance.

> We would have just one driver instead of two, and we could tell people
> the upgrade their images and they didn't have to care about feature loss.
>

If the features don't work correctly, is it really feature loss?

Regards,

Anthony Liguori

> Kevin
>
On 09/13/2010 06:28 AM, Kevin Wolf wrote: >> Anytime you grow the freelist with qcow2, you have to write a brand new >> freelist table and update the metadata synchronously to point to a new >> version of it. That means for a 1TB image, you're potentially writing >> out 128MB of data just to allocate a new cluster. >> > No. qcow2 has two-level tables. > > File size: 1 TB > Number of clusters: 1 TB / 64 kB = 16 M > Number of refcount blocks: (16 M * 2 B) / 64kB = 512 > Total size of all refcount blocks: 512 * 64kB = 32 MB > Size of recount table: 512 * 8 B = 4 kB > > When we grow an image file, the refcount blocks can stay where they are, > only the refcount table needs to be rewritten. So we have to copy a > total of 4 kB for growing the image file when it's 1 TB in size (all > assuming 64k clusters). > Yes, I misread the code. It is a two level table. Even though it's 4x smaller than I previously stated, it's still quite large and finding a free block is an O(n) operation where n is the physical file size. An fsck() on qed is also an O(n) operation where n is the physical file size so I still contend the two are similar in cost. Regards, Anthony Liguori > The other result of this calculation is that we need to grow the > refcount table each time we cross a 16 TB boundary. So additionally to > being a small amount of data, it doesn't happen in practice anyway. > > Kevin >
On 09/13/2010 06:48 AM, Kevin Wolf wrote: > Am 13.09.2010 13:34, schrieb Avi Kivity: > >> On 09/13/2010 01:28 PM, Kevin Wolf wrote: >> >>> >>>> Anytime you grow the freelist with qcow2, you have to write a brand new >>>> freelist table and update the metadata synchronously to point to a new >>>> version of it. That means for a 1TB image, you're potentially writing >>>> out 128MB of data just to allocate a new cluster. >>>> >>> No. qcow2 has two-level tables. >>> >>> File size: 1 TB >>> Number of clusters: 1 TB / 64 kB = 16 M >>> Number of refcount blocks: (16 M * 2 B) / 64kB = 512 >>> Total size of all refcount blocks: 512 * 64kB = 32 MB >>> Size of recount table: 512 * 8 B = 4 kB >>> >>> When we grow an image file, the refcount blocks can stay where they are, >>> only the refcount table needs to be rewritten. So we have to copy a >>> total of 4 kB for growing the image file when it's 1 TB in size (all >>> assuming 64k clusters). >>> >>> The other result of this calculation is that we need to grow the >>> refcount table each time we cross a 16 TB boundary. So additionally to >>> being a small amount of data, it doesn't happen in practice anyway. >>> >> Interesting, I misremembered it as 8 bytes per cluster, not 2. So it's >> actually fairly dense (though still not as dense as a bitmap). >> > Yes, refcounts are 16 bit. Just checked it with the code once again to > be 100% sure. But if it was only that, it would be just a small factor. > The important part is that it's a two-level structure, so Anthony's > numbers are completely off. > A two-level structure makes growth more efficient, however, searching for a free cluster is still an expensive operation on large disk images. This is an important point because without snapshots, the argument for a refcount table is supporting UNMAP and efficient UNMAP support in qcow2 looks like it will require an additional structure. 
One of the troubles with qcow2 as a format is that the metadata on disk is redundant, it's already defined as authoritative. So while in QED, we can define the L1/L2 tables as the only authoritative source of information and treat a freelist as an optimization, the refcount table must remain authoritative in qcow2 in order to remain backwards compatible. You could rewrite the header to be qcow3 in order to relax this restriction but then you lose image mobility to older versions which really negates the advantage of not introducing a new format. Regards, Anthony Liguori Regards, Anthony Liguori > Kevin >
Am 13.09.2010 15:07, schrieb Anthony Liguori: > On 09/13/2010 06:03 AM, Kevin Wolf wrote: >> >> The real reason why it's not the same story is that a qcow3 would be >> backwards compatible. Old images would just work as qcow3 by changing >> the version number in the header. Even if they are on a block device. >> Even if they are encrypted. Even if they are compressed. Even if they >> have internal snapshots. >> > > I think that's really the point of QED. I think the features of qcow2 > make it prohibitively difficult to achieve correctness and good peformance. I know and I understand that. I'm not even saying that it's necessarily a bad thing (though I think that having to maintain one format is better than having to maintain two formats). But then you can't say it's the same story as qcow3. >> We would have just one driver instead of two, and we could tell people >> the upgrade their images and they didn't have to care about feature loss. > > If the features don't work correctly, is it really feature loss? Obviously the features work correctly enough for people to use them without complaints (other than performance currently, of course). Or let me rephrase that: Where are your bug reports? Kevin
On Fri, Sep 10, 2010 at 10:22 PM, Jamie Lokier <jamie@shareable.org> wrote:
> Stefan Hajnoczi wrote:
>> Since there is no ordering imposed between the data write and metadata
>> update, the following scenarios may occur on crash:
>> 1. Neither data write nor metadata update reach the disk. This is
>> fine, qed metadata has not been corrupted.
>> 2. Data reaches disk but metadata update does not. We have leaked a
>> cluster but not corrupted metadata. Leaked clusters can be detected
>> with qemu-img check.
>> 3. Metadata update reaches disk but data does not. The interesting
>> case! The L2 table now points to a cluster which is beyond the last
>> cluster in the image file. Remember that file size is rounded down by
>> cluster size, so partial data writes are discarded and this case
>> applies.
>
> Better add:
>
> 4. File size is extended fully, but the data didn't all reach the disk.

This case is okay.

If a data cluster does not reach the disk but the file size is
increased there are two outcomes:
1. A leaked cluster if the L2 table update did not reach the disk.
2. A cluster with junk data, which is fine since the guest has no
promise the data safely landed on disk without completing a flush.

A flush is performed after allocating new L2 tables and before linking
them into the L1 table. Therefore clusters can be leaked but an
invalid L2 table can never be linked into the L1 table.

> 5. Metadata is partially updated.
> 6. (Nasty) Metadata partial write has clobbered neighbouring
> metadata which wasn't meant to be changed. (This may happen up
> to a sector size on normal hard disks - data is hard to come by.
> This happens to a much larger file range on flash and RAIDs
> sometimes - I call it the "radius of destruction").
>
> 6 can also happen when doing the L1 updated mentioned earlier, in
> which case you might lose a much larger part of the guest image.

These two cases are problematic.
I've been thinking in atomic sector updates and not in a model where updates can be partial or even destructive at the byte level. Do you have references where I can read more about the radius of destruction ;)? Transactional I/O solves this problem. Checksums can detect but do not fix the problem alone. Duplicate metadata together with checksums could be a solution but I haven't thought through the details. Any other suggestions? Time to peek at md and dm to see how they safeguard metadata. Stefan
On Tue, Sep 14, 2010 at 11:46 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> Time to peek at md and dm to see how they safeguard metadata.
Seems to me that dm-snap does not take measures to guard against
snapshot metadata (exceptions) partial updates/corruption. I was
hoping to find useful approaches there rather than in a full-blown
filesystem :).
Stefan
On 09/14/2010 05:46 AM, Stefan Hajnoczi wrote: > On Fri, Sep 10, 2010 at 10:22 PM, Jamie Lokier<jamie@shareable.org> wrote: > >> Stefan Hajnoczi wrote: >> >>> Since there is no ordering imposed between the data write and metadata >>> update, the following scenarios may occur on crash: >>> 1. Neither data write nor metadata update reach the disk. This is >>> fine, qed metadata has not been corrupted. >>> 2. Data reaches disk but metadata update does not. We have leaked a >>> cluster but not corrupted metadata. Leaked clusters can be detected >>> with qemu-img check. >>> 3. Metadata update reaches disk but data does not. The interesting >>> case! The L2 table now points to a cluster which is beyond the last >>> cluster in the image file. Remember that file size is rounded down by >>> cluster size, so partial data writes are discarded and this case >>> applies. >>> >> Better add: >> >> 4. File size is extended fully, but the data didn't all reach the disk. >> > This case is okay. > > If a data cluster does not reach the disk but the file size is > increased there are two outcomes: > 1. A leaked cluster if the L2 table update did not reach the disk. > 2. A cluster with junk data, which is fine since the guest has no > promise the data safely landed on disk without a completing a flush. > > A flush is performed after allocating new L2 tables and before linking > them into the L1 table. Therefore clusters can be leaked but an > invalid L2 table can never be linked into the L1 table. > > >> 5. Metadata is partially updated. >> 6. (Nasty) Metadata partial write has clobbered neighbouring >> metadata which wasn't meant to be changed. (This may happen up >> to a sector size on normal hard disks - data is hard to come by. >> This happens to a much larger file range on flash and RAIDs >> sometimes - I call it the "radius of destruction"). >> >> 6 can also happen when doing the L1 updated mentioned earlier, in >> which case you might lose a much larger part of the guest image. 
>> > These two cases are problematic. And not worth the hassle. It might matter if you've bought your C-Gate hard drives from a guy with a blanket on the street and you're sending your disk array on the space shuttle during a solar storm, but if you're building on top of file systems with reasonable storage, these are not reasonable failure scenarios to design for. There's a place for trying to cover these types of scenarios to build reliable storage arrays on top of super cheap storage but that's not our mission. That's what the btrfs's of the world are for. Regards, Anthony Liguori > I've been thinking in atomic sector > updates and not in a model where updates can be partial or even > destructive at the byte level. Do you have references where I can > read more about the radius of destruction ;)? > > Transactional I/O solves this problem. Checksums can detect but do > not fix the problem alone. Duplicate metadata together with checksums > could be a solution but I haven't thought through the details. > > Any other suggestions? > > Time to peek at md and dm to see how they safeguard metadata. > > Stefan > >
On Mon, Sep 06, 2010 at 11:04:38AM +0100, Stefan Hajnoczi wrote: > The format supports sparse disk images. It does not rely on the host > filesystem holes feature, making it a good choice for sparse disk images > that need to be transferred over channels where holes are not supported. Are these so important? Do people really pass VM images around with http? Maybe it's enough to teach scp about sparse files instead.
On 09/15/2010 04:01 PM, Michael S. Tsirkin wrote: > On Mon, Sep 06, 2010 at 11:04:38AM +0100, Stefan Hajnoczi wrote: > >> The format supports sparse disk images. It does not rely on the host >> filesystem holes feature, making it a good choice for sparse disk images >> that need to be transferred over channels where holes are not supported. >> > Are these so important? Do people really pass VM images around with > http? Absolutely. Part of the problem is that there's an industry expectation that virtual machines are stored in images. Even when you look at something like VMFS, which is a filesystem designed specifically for virtual machine images, you still use VMDKs on top of VMFS. Regards, Anthony Liguori > Maybe it's enough to teach scp about sparse files instead. > >
diff --git a/Makefile.objs b/Makefile.objs index 4a1eaa1..a5acb32 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -14,6 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o +block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o block-nested-$(CONFIG_WIN32) += raw-win32.o block-nested-$(CONFIG_POSIX) += raw-posix.o diff --git a/block/qcow2.c b/block/qcow2.c index a53014d..72c923a 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -767,28 +767,6 @@ static int qcow2_change_backing_file(BlockDriverState *bs, return qcow2_update_ext_header(bs, backing_file, backing_fmt); } -static int get_bits_from_size(size_t size) -{ - int res = 0; - - if (size == 0) { - return -1; - } - - while (size != 1) { - /* Not a power of two */ - if (size & 1) { - return -1; - } - - size >>= 1; - res++; - } - - return res; -} - - static int preallocate(BlockDriverState *bs) { uint64_t nb_sectors; diff --git a/block/qed-cluster.c b/block/qed-cluster.c new file mode 100644 index 0000000..6deea27 --- /dev/null +++ b/block/qed-cluster.c @@ -0,0 +1,136 @@ +/* + * QEMU Enhanced Disk Format Cluster functions + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qed.h" + +/** + * Count the number of contiguous data clusters + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Maximum number of clusters + * @offset: Set to first cluster offset + * + * This function scans tables for contiguous allocated or free clusters. 
+ */
+static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
+                                                  QEDTable *table,
+                                                  unsigned int index,
+                                                  unsigned int n,
+                                                  uint64_t *offset)
+{
+    unsigned int end = MIN(index + n, s->table_nelems);
+    uint64_t last = table->offsets[index];
+    unsigned int i;
+
+    /* Report the first cluster offset of the run; 0 means the run is
+     * unallocated (free), non-zero means allocated and contiguous. */
+    *offset = last;
+
+    for (i = index + 1; i < end; i++) {
+        if (last == 0) {
+            /* Counting free clusters */
+            if (table->offsets[i] != 0) {
+                break;
+            }
+        } else {
+            /* Counting allocated clusters */
+            if (table->offsets[i] != last + s->header.cluster_size) {
+                break;
+            }
+            last = table->offsets[i];
+        }
+    }
+    /* NOTE(review): the loop starts at index + 1, so this returns at least 1
+     * even when n == 0 -- presumably callers always pass n >= 1; confirm. */
+    return i - index;
+}
+
+/* Per-call state for an in-flight qed_find_cluster() lookup; freed by the
+ * completion callback. */
+typedef struct {
+    BDRVQEDState *s;       /* QED driver state */
+    uint64_t pos;          /* byte position in the device */
+    size_t len;            /* request length in bytes */
+
+    QEDRequest *request;   /* holds the L2 table reference for this request */
+
+    /* User callback */
+    QEDFindClusterFunc *cb;
+    void *opaque;
+}  QEDFindClusterCB;
+
+/* Completion callback for qed_read_l2_table(): translates the now-available
+ * L2 table into a (ret, offset, len) answer for the user callback.
+ * ret is QED_CLUSTER_FOUND for an allocated run, QED_CLUSTER_L2 for an
+ * unallocated run within an existing L2 table, or QED_CLUSTER_ERROR if the
+ * table read failed. */
+static void qed_find_cluster_cb(void *opaque, int ret)
+{
+    QEDFindClusterCB *find_cluster_cb = opaque;
+    BDRVQEDState *s = find_cluster_cb->s;
+    QEDRequest *request = find_cluster_cb->request;
+    uint64_t offset = 0;
+    size_t len = 0;
+    unsigned int index;
+    unsigned int n;
+
+    if (ret) {
+        ret = QED_CLUSTER_ERROR;
+        goto out;
+    }
+
+    index = qed_l2_index(s, find_cluster_cb->pos);
+    /* Number of clusters the request touches, including the partial
+     * cluster at the start of the range. */
+    n = qed_bytes_to_clusters(s,
+                              qed_offset_into_cluster(s, find_cluster_cb->pos) +
+                              find_cluster_cb->len);
+    n = qed_count_contiguous_clusters(s, request->l2_table->table,
+                                      index, n, &offset);
+
+    ret = offset ? QED_CLUSTER_FOUND : QED_CLUSTER_L2;
+    /* Clamp the byte count to the contiguous run, accounting for the
+     * offset into the first cluster. */
+    len = MIN(find_cluster_cb->len, n * s->header.cluster_size -
+              qed_offset_into_cluster(s, find_cluster_cb->pos));
+
+out:
+    find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
+    qemu_free(find_cluster_cb);
+}
+
+/**
+ * Find the offset of a data cluster
+ *
+ * @s: QED state
+ * @request: Request state, supplies the cached L2 table reference
+ * @pos: Byte position in device
+ * @len: Number of bytes
+ * @cb: Completion function
+ * @opaque: User data for completion function
+ *
+ * The callback receives a QED_CLUSTER_* result code, the offset of the first
+ * data cluster (0 if unallocated), and the contiguous byte count found.
+ * QED_CLUSTER_L1 is reported synchronously when no L2 table exists for @pos.
+ */
+void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+                      size_t len, QEDFindClusterFunc *cb, void *opaque)
+{
+    QEDFindClusterCB *find_cluster_cb;
+    uint64_t l2_offset;
+
+    /* Limit length to L2 boundary. Requests are broken up at the L2 boundary
+     * so that a request acts on one L2 table at a time.
+     */
+    len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);
+
+    l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)];
+    if (!l2_offset) {
+        /* No L2 table for this range at all */
+        cb(opaque, QED_CLUSTER_L1, 0, len);
+        return;
+    }
+
+    find_cluster_cb = qemu_malloc(sizeof(*find_cluster_cb));
+    find_cluster_cb->s = s;
+    find_cluster_cb->pos = pos;
+    find_cluster_cb->len = len;
+    find_cluster_cb->cb = cb;
+    find_cluster_cb->opaque = opaque;
+    find_cluster_cb->request = request;
+
+    qed_read_l2_table(s, request, l2_offset,
+                      qed_find_cluster_cb, find_cluster_cb);
+}
diff --git a/block/qed-gencb.c b/block/qed-gencb.c
new file mode 100644
index 0000000..d389e12
--- /dev/null
+++ b/block/qed-gencb.c
@@ -0,0 +1,32 @@
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *  Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+/* Allocate a callback state struct of @len bytes whose leading fields are the
+ * generic cb/opaque pair.
+ * NOTE(review): assumes the caller's struct embeds GenericCB as its first
+ * member so the casts in gencb_complete() are valid -- confirm in qed.h. */
+void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque)
+{
+    GenericCB *gencb = qemu_malloc(len);
+    gencb->cb = cb;
+    gencb->opaque = opaque;
+    return gencb;
+}
+
+/* Free the callback state allocated by gencb_alloc() and invoke the stored
+ * user callback with @ret. The state is freed *before* the callback runs so
+ * the callback may reissue requests without leaking. */
+void gencb_complete(void *opaque, int ret)
+{
+    GenericCB *gencb = opaque;
+    BlockDriverCompletionFunc *cb = gencb->cb;
+    void *user_opaque = gencb->opaque;
+
+    qemu_free(gencb);
+    cb(user_opaque, ret);
+}
diff --git a/block/qed-l2-cache.c b/block/qed-l2-cache.c
new file mode 100644
index 0000000..747a629
--- /dev/null
+++ b/block/qed-l2-cache.c
@@ -0,0 +1,131 @@
+/*
+ * QEMU Enhanced Disk Format L2 Cache
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *  Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+/* Each L2 holds 2GB so this lets us fully cache a 100GB disk */
+#define MAX_L2_CACHE_SIZE 50
+
+/**
+ * Initialize the L2 cache
+ *
+ * @alloc_l2_table: Callback used to allocate table memory for new entries
+ * @alloc_l2_table_opaque: User data passed to @alloc_l2_table
+ */
+void qed_init_l2_cache(L2TableCache *l2_cache,
+                       L2TableAllocFunc *alloc_l2_table,
+                       void *alloc_l2_table_opaque)
+{
+    QTAILQ_INIT(&l2_cache->entries);
+    l2_cache->n_entries = 0;
+    l2_cache->alloc_l2_table = alloc_l2_table;
+    l2_cache->alloc_l2_table_opaque = alloc_l2_table_opaque;
+}
+
+/**
+ * Free the L2 cache
+ *
+ * NOTE(review): entries are freed unconditionally, ignoring their reference
+ * counts -- presumably only called at shutdown when no references remain;
+ * confirm against callers.
+ */
+void qed_free_l2_cache(L2TableCache *l2_cache)
+{
+    CachedL2Table *entry, *next_entry;
+
+    QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) {
+        qemu_free(entry->table);
+        qemu_free(entry);
+    }
+}
+
+/**
+ * Allocate an uninitialized entry from the cache
+ *
+ * The returned entry has a reference count of 1 and is owned by the caller.
+ * The entry is not yet in the cache; commit it with
+ * qed_commit_l2_cache_entry() once it is valid on disk.
+ */
+CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache)
+{
+    CachedL2Table *entry;
+
+    entry = qemu_mallocz(sizeof(*entry));
+    entry->table = l2_cache->alloc_l2_table(l2_cache->alloc_l2_table_opaque);
+    entry->ref++;
+
+    return entry;
+}
+
+/**
+ * Decrease an entry's reference count and free if necessary when the reference
+ * count drops to zero.
+ *
+ * A NULL @entry is tolerated and ignored.
+ */
+void qed_unref_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *entry)
+{
+    if (!entry) {
+        return;
+    }
+
+    entry->ref--;
+    if (entry->ref == 0) {
+        qemu_free(entry->table);
+        qemu_free(entry);
+    }
+}
+
+/**
+ * Find an entry in the L2 cache.  This may return NULL and it's up to the
+ * caller to satisfy the cache miss.
+ *
+ * For a cached entry, this function increases the reference count and returns
+ * the entry.  Lookup is a linear scan keyed on the table's disk offset.
+ */
+CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset)
+{
+    CachedL2Table *entry;
+
+    QTAILQ_FOREACH(entry, &l2_cache->entries, node) {
+        if (entry->offset == offset) {
+            entry->ref++;
+            return entry;
+        }
+    }
+    return NULL;
+}
+
+/**
+ * Commit an L2 cache entry into the cache.  This is meant to be used as part of
+ * the process to satisfy a cache miss.  A caller would allocate an entry which
+ * is not actually in the L2 cache and then once the entry was valid and
+ * present on disk, the entry can be committed into the cache.
+ *
+ * Since the cache is write-through, it's important that this function is not
+ * called until the entry is present on disk and the L1 has been updated to
+ * point to the entry.
+ *
+ * This function will take a reference to the entry so the caller is still
+ * responsible for unreferencing the entry.
+ */
+void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table)
+{
+    CachedL2Table *entry;
+
+    /* If another entry for the same offset was committed first, keep the
+     * existing one and drop the lookup reference; @l2_table is not inserted. */
+    entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset);
+    if (entry) {
+        qed_unref_l2_cache_entry(l2_cache, entry);
+        return;
+    }
+
+    /* Evict the oldest entry (FIFO order) when the cache is full */
+    if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) {
+        entry = QTAILQ_FIRST(&l2_cache->entries);
+        QTAILQ_REMOVE(&l2_cache->entries, entry, node);
+        l2_cache->n_entries--;
+        qed_unref_l2_cache_entry(l2_cache, entry);
+    }
+
+    l2_table->ref++;
+    l2_cache->n_entries++;
+    QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node);
+}
diff --git a/block/qed-table.c b/block/qed-table.c
new file mode 100644
index 0000000..9a72582
--- /dev/null
+++ b/block/qed-table.c
@@ -0,0 +1,242 @@
+/*
+ * QEMU Enhanced Disk Format Table I/O
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *  Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *  Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ * + */ + +#include "qed.h" + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEDTable *table; + + struct iovec iov; + QEMUIOVector qiov; +} QEDReadTableCB; + +static void qed_read_table_cb(void *opaque, int ret) +{ + QEDReadTableCB *read_table_cb = opaque; + QEDTable *table = read_table_cb->table; + int noffsets = read_table_cb->iov.iov_len / sizeof(uint64_t); + int i; + + /* Handle I/O error */ + if (ret) { + goto out; + } + + /* Byteswap and verify offsets */ + for (i = 0; i < noffsets; i++) { + table->offsets[i] = le64_to_cpu(table->offsets[i]); + } + +out: + /* Completion */ + gencb_complete(&read_table_cb->gencb, ret); +} + +static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), + cb, opaque); + QEMUIOVector *qiov = &read_table_cb->qiov; + BlockDriverAIOCB *aiocb; + + read_table_cb->s = s; + read_table_cb->table = table; + read_table_cb->iov.iov_base = table->offsets, + read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, + + qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); + aiocb = bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, + read_table_cb->iov.iov_len / BDRV_SECTOR_SIZE, + qed_read_table_cb, read_table_cb); + if (!aiocb) { + qed_read_table_cb(read_table_cb, -EIO); + } +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEDTable *orig_table; + bool flush; /* flush after write? 
*/ + + struct iovec iov; + QEMUIOVector qiov; + + QEDTable table; +} QEDWriteTableCB; + +static void qed_write_table_cb(void *opaque, int ret) +{ + QEDWriteTableCB *write_table_cb = opaque; + + if (ret) { + goto out; + } + + if (write_table_cb->flush) { + /* We still need to flush first */ + write_table_cb->flush = false; + bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, + write_table_cb); + return; + } + +out: + gencb_complete(&write_table_cb->gencb, ret); + return; +} + +/** + * Write out an updated part or all of a table + * + * @s: QED state + * @offset: Offset of table in image file, in bytes + * @table: Table + * @index: Index of first element + * @n: Number of elements + * @flush: Whether or not to sync to disk + * @cb: Completion function + * @opaque: Argument for completion function + */ +static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDWriteTableCB *write_table_cb; + BlockDriverAIOCB *aiocb; + unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; + unsigned int start, end, i; + size_t len_bytes; + + /* Calculate indices of the first and one after last elements */ + start = index & ~sector_mask; + end = (index + n + sector_mask) & ~sector_mask; + + len_bytes = (end - start) * sizeof(uint64_t); + + write_table_cb = gencb_alloc(sizeof(*write_table_cb) + len_bytes, + cb, opaque); + write_table_cb->s = s; + write_table_cb->orig_table = table; + write_table_cb->flush = flush; + write_table_cb->iov.iov_base = write_table_cb->table.offsets; + write_table_cb->iov.iov_len = len_bytes; + qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); + + /* Byteswap table */ + for (i = start; i < end; i++) { + write_table_cb->table.offsets[i - start] = cpu_to_le64(table->offsets[i]); + } + + /* Adjust for offset into table */ + offset += start * sizeof(uint64_t); + + aiocb = 
bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + &write_table_cb->qiov, + write_table_cb->iov.iov_len / BDRV_SECTOR_SIZE, + qed_write_table_cb, write_table_cb); + if (!aiocb) { + qed_write_table_cb(write_table_cb, -EIO); + } +} + +static void qed_read_l1_table_cb(void *opaque, int ret) +{ + *(int *)opaque = ret; +} + +/** + * Read the L1 table synchronously + */ +int qed_read_l1_table(BDRVQEDState *s) +{ + int ret = -EINPROGRESS; + + /* TODO push/pop async context? */ + + qed_read_table(s, s->header.l1_table_offset, + s->l1_table, qed_read_l1_table_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + return ret; +} + +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, + BlockDriverCompletionFunc *cb, void *opaque) +{ + qed_write_table(s, s->header.l1_table_offset, + s->l1_table, index, n, false, cb, opaque); +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + uint64_t l2_offset; + QEDRequest *request; +} QEDReadL2TableCB; + +static void qed_read_l2_table_cb(void *opaque, int ret) +{ + QEDReadL2TableCB *read_l2_table_cb = opaque; + QEDRequest *request = read_l2_table_cb->request; + BDRVQEDState *s = read_l2_table_cb->s; + + if (ret) { + /* can't trust loaded L2 table anymore */ + qed_unref_l2_cache_entry(&s->l2_cache, request->l2_table); + request->l2_table = NULL; + } else { + request->l2_table->offset = read_l2_table_cb->l2_offset; + qed_commit_l2_cache_entry(&s->l2_cache, request->l2_table); + } + + gencb_complete(&read_l2_table_cb->gencb, ret); +} + +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDReadL2TableCB *read_l2_table_cb; + + qed_unref_l2_cache_entry(&s->l2_cache, request->l2_table); + + /* Check for cached L2 entry */ + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); + if (request->l2_table) { + cb(opaque, 0); + return; + } + + request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + + 
read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); + read_l2_table_cb->s = s; + read_l2_table_cb->l2_offset = offset; + read_l2_table_cb->request = request; + + qed_read_table(s, offset, request->l2_table->table, + qed_read_l2_table_cb, read_l2_table_cb); +} + +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque) +{ + qed_write_table(s, request->l2_table->offset, + request->l2_table->table, index, n, flush, cb, opaque); +} diff --git a/block/qed.c b/block/qed.c new file mode 100644 index 0000000..cf64418 --- /dev/null +++ b/block/qed.c @@ -0,0 +1,1103 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qed.h" + +/* TODO blkdebug support */ +/* TODO BlockDriverState::buffer_alignment */ +/* TODO check L2 table sizes before accessing them? */ +/* TODO skip zero prefill since the filesystem should zero the sectors anyway */ +/* TODO if a table element's offset is invalid then the image is broken. If + * there was a power failure and the table update reached storage but the data + * being pointed to did not, forget about the lost data by clearing the offset. + * However, need to be careful to detect invalid offsets for tables that are + * read *after* more clusters have been allocated. 
*/ + +enum { + QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, + + /* The image supports a backing file */ + QED_F_BACKING_FILE = 0x01, + + /* The image has the backing file format */ + QED_CF_BACKING_FORMAT = 0x01, + + /* Feature bits must be used when the on-disk format changes */ + QED_FEATURE_MASK = QED_F_BACKING_FILE, /* supported feature bits */ + QED_COMPAT_FEATURE_MASK = QED_CF_BACKING_FORMAT, /* supported compat feature bits */ + + /* Data is stored in groups of sectors called clusters. Cluster size must + * be large to avoid keeping too much metadata. I/O requests that have + * sub-cluster size will require read-modify-write. + */ + QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ + QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, + QED_DEFAULT_CLUSTER_SIZE = 64 * 1024, + + /* Allocated clusters are tracked using a 2-level pagetable. Table size is + * a multiple of clusters so large maximum image sizes can be supported + * without jacking up the cluster size too much. + */ + QED_MIN_TABLE_SIZE = 1, /* in clusters */ + QED_MAX_TABLE_SIZE = 16, + QED_DEFAULT_TABLE_SIZE = 4, +}; + +static void qed_aio_cancel(BlockDriverAIOCB *acb) +{ + qemu_aio_release(acb); +} + +static AIOPool qed_aio_pool = { + .aiocb_size = sizeof(QEDAIOCB), + .cancel = qed_aio_cancel, +}; + +/** + * Allocate memory that satisfies image file and backing file alignment requirements + * + * TODO make this common and consider propagating max buffer_alignment to the root image + */ +static void *qed_memalign(BDRVQEDState *s, size_t len) +{ + size_t align = s->bs->file->buffer_alignment; + BlockDriverState *backing_hd = s->bs->backing_hd; + + if (backing_hd && backing_hd->buffer_alignment > align) { + align = backing_hd->buffer_alignment; + } + + return qemu_memalign(align, len); +} + +static int bdrv_qed_probe(const uint8_t *buf, int buf_size, + const char *filename) +{ + const QEDHeader *header = (const void *)buf; + + if (buf_size < sizeof(*header)) { + return 0; + } + if 
(le32_to_cpu(header->magic) != QED_MAGIC) { + return 0; + } + return 100; +} + +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) +{ + cpu->magic = le32_to_cpu(le->magic); + cpu->cluster_size = le32_to_cpu(le->cluster_size); + cpu->table_size = le32_to_cpu(le->table_size); + cpu->first_cluster = le32_to_cpu(le->first_cluster); + cpu->features = le64_to_cpu(le->features); + cpu->compat_features = le64_to_cpu(le->compat_features); + cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); + cpu->image_size = le64_to_cpu(le->image_size); + cpu->backing_file_offset = le32_to_cpu(le->backing_file_offset); + cpu->backing_file_size = le32_to_cpu(le->backing_file_size); + cpu->backing_fmt_offset = le32_to_cpu(le->backing_fmt_offset); + cpu->backing_fmt_size = le32_to_cpu(le->backing_fmt_size); +} + +static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le) +{ + le->magic = cpu_to_le32(cpu->magic); + le->cluster_size = cpu_to_le32(cpu->cluster_size); + le->table_size = cpu_to_le32(cpu->table_size); + le->first_cluster = cpu_to_le32(cpu->first_cluster); + le->features = cpu_to_le64(cpu->features); + le->compat_features = cpu_to_le64(cpu->compat_features); + le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); + le->image_size = cpu_to_le64(cpu->image_size); + le->backing_file_offset = cpu_to_le32(cpu->backing_file_offset); + le->backing_file_size = cpu_to_le32(cpu->backing_file_size); + le->backing_fmt_offset = cpu_to_le32(cpu->backing_fmt_offset); + le->backing_fmt_size = cpu_to_le32(cpu->backing_fmt_size); +} + +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) +{ + uint64_t table_entries; + uint64_t l2_size; + + table_entries = (table_size * cluster_size) / 8; + l2_size = table_entries * cluster_size; + + return l2_size * table_entries; +} + +static bool qed_is_cluster_size_valid(uint32_t cluster_size) +{ + if (cluster_size < QED_MIN_CLUSTER_SIZE || + cluster_size > QED_MAX_CLUSTER_SIZE) { + return false; + 
} + if (cluster_size & (cluster_size - 1)) { + return false; /* not power of 2 */ + } + return true; +} + +static bool qed_is_table_size_valid(uint32_t table_size) +{ + if (table_size < QED_MIN_TABLE_SIZE || + table_size > QED_MAX_TABLE_SIZE) { + return false; + } + if (table_size & (table_size - 1)) { + return false; /* not power of 2 */ + } + return true; +} + +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, + uint32_t table_size) +{ + if (image_size == 0) { + /* Supporting zero size images makes life harder because even the L1 + * table is not needed. Make life simple and forbid zero size images. + */ + return false; + } + if (image_size & (cluster_size - 1)) { + return false; /* not multiple of cluster size */ + } + if (image_size > qed_max_image_size(cluster_size, table_size)) { + return false; /* image is too large */ + } + return true; +} + +/** + * Test if a byte offset is cluster aligned and within the image file + */ +static bool qed_check_byte_offset(BDRVQEDState *s, uint64_t offset) +{ + if (offset & (s->header.cluster_size - 1)) { + return false; + } + if (offset == 0) { + return false; /* first cluster contains the header and is not valid */ + } + return offset < s->file_size; +} + +/** + * Read a string of known length from the image file + * + * @file: Image file + * @offset: File offset to start of string, in bytes + * @n: String length in bytes + * @buf: Destination buffer + * @buflen: Destination buffer length in bytes + * + * The string is NUL-terminated. 
+ */ +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, + char *buf, size_t buflen) +{ + int ret; + if (n >= buflen) { + return -EINVAL; + } + ret = bdrv_pread(file, offset, buf, n); + if (ret != n) { + return ret; + } + buf[n] = '\0'; + return 0; +} + +/** + * Allocate new clusters + * + * @s: QED state + * @n: Number of contiguous clusters to allocate + * @offset: Offset of first allocated cluster, filled in on success + */ +static int qed_alloc_clusters(BDRVQEDState *s, unsigned int n, uint64_t *offset) +{ + *offset = s->file_size; + s->file_size += n * s->header.cluster_size; + return 0; +} + +static QEDTable *qed_alloc_table(void *opaque) +{ + BDRVQEDState *s = opaque; + + /* Honor O_DIRECT memory alignment requirements */ + return qed_memalign(s, s->header.cluster_size * s->header.table_size); +} + +/** + * Allocate a new zeroed L2 table + */ +static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) +{ + uint64_t offset; + int ret; + CachedL2Table *l2_table; + + ret = qed_alloc_clusters(s, s->header.table_size, &offset); + if (ret) { + return NULL; + } + + l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + l2_table->offset = offset; + + memset(l2_table->table->offsets, 0, + s->header.cluster_size * s->header.table_size); + return l2_table; +} + +static int bdrv_qed_open(BlockDriverState *bs, int flags) +{ + BDRVQEDState *s = bs->opaque; + QEDHeader le_header; + int64_t file_size; + int ret; + + s->bs = bs; + QSIMPLEQ_INIT(&s->allocating_write_reqs); + + ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); + if (ret != sizeof(le_header)) { + return ret; + } + qed_header_le_to_cpu(&le_header, &s->header); + + if (s->header.magic != QED_MAGIC) { + return -ENOENT; + } + if (s->header.features & ~QED_FEATURE_MASK) { + return -ENOTSUP; /* image uses unsupported feature bits */ + } + if (!qed_is_cluster_size_valid(s->header.cluster_size)) { + return -EINVAL; + } + + /* Round up file size to the next cluster */ + file_size = 
bdrv_getlength(bs->file); + if (file_size < 0) { + return file_size; + } + s->file_size = qed_start_of_cluster(s, file_size + s->header.cluster_size - 1); + + if (!qed_is_table_size_valid(s->header.table_size)) { + return -EINVAL; + } + if (!qed_is_image_size_valid(s->header.image_size, + s->header.cluster_size, + s->header.table_size)) { + return -EINVAL; + } + if (!qed_check_byte_offset(s, s->header.l1_table_offset)) { + return -EINVAL; + } + + s->table_nelems = (s->header.cluster_size * s->header.table_size) / + sizeof(s->l1_table->offsets[0]); + s->l2_shift = get_bits_from_size(s->header.cluster_size); + s->l2_mask = s->table_nelems - 1; + s->l1_shift = s->l2_shift + get_bits_from_size(s->l2_mask + 1); + + if ((s->header.features & QED_F_BACKING_FILE)) { + ret = qed_read_string(bs->file, s->header.backing_file_offset, + s->header.backing_file_size, bs->backing_file, + sizeof(bs->backing_file)); + if (ret < 0) { + return ret; + } + + if ((s->header.compat_features & QED_CF_BACKING_FORMAT)) { + ret = qed_read_string(bs->file, s->header.backing_fmt_offset, + s->header.backing_fmt_size, + bs->backing_format, + sizeof(bs->backing_format)); + if (ret < 0) { + return ret; + } + } + } + + s->l1_table = qed_alloc_table(s); + qed_init_l2_cache(&s->l2_cache, qed_alloc_table, s); + + ret = qed_read_l1_table(s); + if (ret) { + qed_free_l2_cache(&s->l2_cache); + qemu_free(s->l1_table); + } + return ret; +} + +static void bdrv_qed_close(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + qed_free_l2_cache(&s->l2_cache); + qemu_free(s->l1_table); +} + +static void bdrv_qed_flush(BlockDriverState *bs) +{ + bdrv_flush(bs->file); +} + +static int qed_create(const char *filename, uint32_t cluster_size, + uint64_t image_size, uint32_t table_size, + const char *backing_file, const char *backing_fmt) +{ + QEDHeader header = { + .magic = QED_MAGIC, + .cluster_size = cluster_size, + .table_size = table_size, + .first_cluster = 1, + .features = 0, + .compat_features = 0, + 
.l1_table_offset = cluster_size, + .image_size = image_size, + }; + QEDHeader le_header; + uint8_t *l1_table = NULL; + size_t l1_size = header.cluster_size * header.table_size; + int ret = 0; + int fd; + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); + if (fd < 0) { + return -errno; + } + + if (backing_file) { + header.features |= QED_F_BACKING_FILE; + header.backing_file_offset = sizeof(le_header); + header.backing_file_size = strlen(backing_file); + if (backing_fmt) { + header.compat_features |= QED_CF_BACKING_FORMAT; + header.backing_fmt_offset = header.backing_file_offset + + header.backing_file_size; + header.backing_fmt_size = strlen(backing_fmt); + } + } + + qed_header_cpu_to_le(&header, &le_header); + if (qemu_write_full(fd, &le_header, sizeof(le_header)) != sizeof(le_header)) { + ret = -errno; + goto out; + } + if (qemu_write_full(fd, backing_file, header.backing_file_size) != header.backing_file_size) { + ret = -errno; + goto out; + } + if (qemu_write_full(fd, backing_fmt, header.backing_fmt_size) != header.backing_fmt_size) { + ret = -errno; + goto out; + } + + l1_table = qemu_mallocz(l1_size); + lseek(fd, header.l1_table_offset, SEEK_SET); + if (qemu_write_full(fd, l1_table, l1_size) != l1_size) { + ret = -errno; + goto out; + } + +out: + qemu_free(l1_table); + close(fd); + return ret; +} + +static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) +{ + uint64_t image_size = 0; + uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; + uint32_t table_size = QED_DEFAULT_TABLE_SIZE; + const char *backing_file = NULL; + const char *backing_fmt = NULL; + + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + image_size = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { + backing_fmt = options->value.s; + } else if (!strcmp(options->name, 
BLOCK_OPT_CLUSTER_SIZE)) { + if (options->value.n) { + cluster_size = options->value.n; + } + } else if (!strcmp(options->name, "table_size")) { + if (options->value.n) { + table_size = options->value.n; + } + } + options++; + } + + if (!qed_is_cluster_size_valid(cluster_size)) { + fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n", + QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); + return -EINVAL; + } + if (!qed_is_table_size_valid(table_size)) { + fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n", + QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); + return -EINVAL; + } + if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { + fprintf(stderr, + "QED image size must be a non-zero multiple of cluster size and less than %s\n", + bytes_to_str(qed_max_image_size(cluster_size, table_size))); + return -EINVAL; + } + + return qed_create(filename, cluster_size, image_size, table_size, + backing_file, backing_fmt); +} + +typedef struct { + int is_allocated; + int *pnum; +} QEDIsAllocatedCB; + +static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) +{ + QEDIsAllocatedCB *cb = opaque; + *cb->pnum = len / BDRV_SECTOR_SIZE; + cb->is_allocated = ret == QED_CLUSTER_FOUND; +} + +static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + BDRVQEDState *s = bs->opaque; + uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; + size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; + QEDIsAllocatedCB cb = { + .is_allocated = -1, + .pnum = pnum, + }; + QEDRequest request = { .l2_table = NULL }; + + /* TODO push/pop async context? 
*/ + + qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); + + while (cb.is_allocated == -1) { + qemu_aio_wait(); + } + + qed_unref_l2_cache_entry(&s->l2_cache, request.l2_table); + + return cb.is_allocated; +} + +static int bdrv_qed_make_empty(BlockDriverState *bs) +{ + return -ENOTSUP; /* TODO */ +} + +static BDRVQEDState *acb_to_s(QEDAIOCB *acb) +{ + return acb->common.bs->opaque; +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEMUIOVector qiov; + struct iovec iov; + uint64_t offset; +} CopyFromBackingFileCB; + +static void qed_copy_from_backing_file_cb(void *opaque, int ret) +{ + CopyFromBackingFileCB *copy_cb = opaque; + qemu_vfree(copy_cb->iov.iov_base); + gencb_complete(©_cb->gencb, ret); +} + +static void qed_copy_from_backing_file_write(void *opaque, int ret) +{ + CopyFromBackingFileCB *copy_cb = opaque; + BDRVQEDState *s = copy_cb->s; + BlockDriverAIOCB *aiocb; + + if (ret) { + qed_copy_from_backing_file_cb(copy_cb, ret); + return; + } + + aiocb = bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, + ©_cb->qiov, + copy_cb->qiov.size / BDRV_SECTOR_SIZE, + qed_copy_from_backing_file_cb, copy_cb); + if (!aiocb) { + qed_copy_from_backing_file_cb(copy_cb, -EIO); + } +} + +/** + * Copy data from backing file into the image + * + * @s: QED state + * @pos: Byte position in device + * @len: Number of bytes + * @offset: Byte offset in image file + * @cb: Completion function + * @opaque: User data for completion function + */ +static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, + uint64_t len, uint64_t offset, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + CopyFromBackingFileCB *copy_cb; + BlockDriverAIOCB *aiocb; + + /* Skip copy entirely if there is no work to do */ + if (len == 0) { + cb(opaque, 0); + return; + } + + copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); + copy_cb->s = s; + copy_cb->offset = offset; + copy_cb->iov.iov_base = qed_memalign(s, len); + copy_cb->iov.iov_len = len; + 
qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); + + /* Zero sectors if there is no backing file */ + if (!s->bs->backing_hd) { + memset(copy_cb->iov.iov_base, 0, len); + qed_copy_from_backing_file_write(copy_cb, 0); + return; + } + + aiocb = bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE, + ©_cb->qiov, len / BDRV_SECTOR_SIZE, + qed_copy_from_backing_file_write, copy_cb); + if (!aiocb) { + qed_copy_from_backing_file_cb(copy_cb, -EIO); + } +} + +/** + * Link one or more contiguous clusters into a table + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Number of contiguous clusters + * @cluster: First cluster byte offset in image file + */ +static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, + unsigned int n, uint64_t cluster) +{ + int i; + for (i = index; i < index + n; i++) { + table->offsets[i] = cluster; + cluster += s->header.cluster_size; + } +} + +static void qed_aio_next_io(void *opaque, int ret); + +static void qed_aio_complete_bh(void *opaque) +{ + QEDAIOCB *acb = opaque; + BlockDriverCompletionFunc *cb = acb->common.cb; + void *user_opaque = acb->common.opaque; + int ret = acb->bh_ret; + + qemu_bh_delete(acb->bh); + qemu_aio_release(acb); + + /* Invoke callback */ + cb(user_opaque, ret); +} + +static void qed_aio_complete(QEDAIOCB *acb, int ret) +{ + BDRVQEDState *s = acb_to_s(acb); + + /* Free resources */ + qemu_iovec_destroy(&acb->cur_qiov); + qed_unref_l2_cache_entry(&s->l2_cache, acb->request.l2_table); + + /* Arrange for a bh to invoke the completion function */ + acb->bh_ret = ret; + acb->bh = qemu_bh_new(qed_aio_complete_bh, acb); + qemu_bh_schedule(acb->bh); + + /* Start next allocating write request waiting behind this one. Note that + * requests enqueue themselves when they first hit an unallocated cluster + * but they wait until the entire request is finished before waking up the + * next request in the queue. 
This ensures that we don't cycle through + * requests multiple times but rather finish one at a time completely. + */ + if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); + acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (acb) { + qed_aio_next_io(acb, 0); + } + } +} + +/** + * Construct an iovec array for the current cluster + * + * @acb: I/O request + * @len: Maximum number of bytes + */ +static void qed_acb_build_qiov(QEDAIOCB *acb, size_t len) +{ + struct iovec *iov_end = &acb->qiov->iov[acb->qiov->niov]; + size_t iov_offset = acb->cur_iov_offset; + struct iovec *iov = acb->cur_iov; + + /* Fill in one cluster's worth of iovecs */ + while (iov != iov_end && len > 0) { + size_t nbytes = MIN(iov->iov_len - iov_offset, len); + + qemu_iovec_add(&acb->cur_qiov, iov->iov_base + iov_offset, nbytes); + iov_offset += nbytes; + len -= nbytes; + + if (iov_offset >= iov->iov_len) { + iov_offset = 0; + iov++; + } + } + + /* Stash state for next time */ + acb->cur_iov = iov; + acb->cur_iov_offset = iov_offset; +} + +/** + * Commit the current L2 table to the cache + */ +static void qed_commit_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + + qed_commit_l2_cache_entry(&s->l2_cache, acb->request.l2_table); + qed_aio_next_io(opaque, ret); +} + +/** + * Update L1 table with new L2 table offset and write it out + */ +static void qed_aio_write_l1_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + int index; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + index = qed_l1_index(s, acb->cur_pos); + s->l1_table->offsets[index] = acb->request.l2_table->offset; + + qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); +} + +/** + * Update L2 table with new cluster offsets and write them out + */ +static void qed_aio_write_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = 
acb_to_s(acb); + bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; + int index; + + if (ret) { + goto err; + } + + if (need_alloc) { + qed_unref_l2_cache_entry(&s->l2_cache, acb->request.l2_table); + acb->request.l2_table = qed_new_l2_table(s); + if (!acb->request.l2_table) { + ret = -EIO; + goto err; + } + } + + index = qed_l2_index(s, acb->cur_pos); + qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, + acb->cur_cluster); + + if (need_alloc) { + /* Write out the whole new L2 table */ + qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, + qed_aio_write_l1_update, acb); + } else { + /* Write out only the updated part of the L2 table */ + qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, + qed_aio_next_io, acb); + } + return; + +err: + qed_aio_complete(acb, ret); +} + +/** + * Write data to the image file + */ +static void qed_aio_write_main(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + bool need_alloc = acb->find_cluster_ret != QED_CLUSTER_FOUND; + uint64_t offset = acb->cur_cluster; + BlockDriverAIOCB *file_acb; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + offset += qed_offset_into_cluster(s, acb->cur_pos); + file_acb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, + acb->cur_qiov.size / BDRV_SECTOR_SIZE, + need_alloc ? 
qed_aio_write_l2_update : + qed_aio_next_io, + acb); + if (!file_acb) { + qed_aio_complete(acb, -EIO); + } +} + +/** + * Populate back untouched region of new data cluster + */ +static void qed_aio_write_postfill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = acb->cur_pos + acb->cur_qiov.size; + uint64_t len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; + uint64_t offset = acb->cur_cluster + qed_offset_into_cluster(s, acb->cur_pos) + acb->cur_qiov.size; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + qed_copy_from_backing_file(s, start, len, offset, + qed_aio_write_main, acb); +} + +/** + * Populate front untouched region of new data cluster + */ +static void qed_aio_write_prefill(QEDAIOCB *acb) +{ + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = qed_start_of_cluster(s, acb->cur_pos); + uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); + + qed_copy_from_backing_file(s, start, len, acb->cur_cluster, + qed_aio_write_postfill, acb); +} + +/** + * Write data cluster + * + * @opaque: Write request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or QED_CLUSTER_ERROR + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). 
+ */ +static void qed_aio_write_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + bool need_alloc = ret != QED_CLUSTER_FOUND; + + if (ret == QED_CLUSTER_ERROR) { + goto err; + } + + /* Freeze this request if another allocating write is in progress */ + if (need_alloc) { + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); + } + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + return; /* wait for existing request to finish */ + } + } + + acb->cur_nclusters = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, acb->cur_pos) + len); + + if (need_alloc) { + if (qed_alloc_clusters(s, acb->cur_nclusters, &offset) != 0) { + goto err; + } + } + + acb->find_cluster_ret = ret; + acb->cur_cluster = offset; + qed_acb_build_qiov(acb, len); + + if (need_alloc) { + qed_aio_write_prefill(acb); + } else { + qed_aio_write_main(acb, 0); + } + return; + +err: + qed_aio_complete(acb, -EIO); +} + +/** + * Read data cluster + * + * @opaque: Read request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or QED_CLUSTER_ERROR + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). 
+ */ +static void qed_aio_read_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + BlockDriverState *bs = acb->common.bs; + BlockDriverState *file = bs->file; + BlockDriverAIOCB *file_acb; + + if (ret == QED_CLUSTER_ERROR) { + goto err; + } + + qed_acb_build_qiov(acb, len); + + /* Adjust offset into cluster */ + offset += qed_offset_into_cluster(s, acb->cur_pos); + + /* Handle backing file and unallocated sparse hole reads */ + if (ret != QED_CLUSTER_FOUND) { + if (!bs->backing_hd) { + qemu_iovec_zero(&acb->cur_qiov); + qed_aio_next_io(acb, 0); + return; + } + + /* Pass through read to backing file */ + offset = acb->cur_pos; + file = bs->backing_hd; + } + + file_acb = bdrv_aio_readv(file, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, + acb->cur_qiov.size / BDRV_SECTOR_SIZE, + qed_aio_next_io, acb); + if (!file_acb) { + goto err; + } + return; + +err: + qed_aio_complete(acb, -EIO); +} + +/** + * Begin next I/O or complete the request + */ +static void qed_aio_next_io(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + QEDFindClusterFunc *io_fn = + acb->is_write ? 
qed_aio_write_data : qed_aio_read_data; + + /* Handle I/O error */ + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + acb->cur_pos += acb->cur_qiov.size; + qemu_iovec_reset(&acb->cur_qiov); + + /* Complete request */ + if (acb->cur_pos >= acb->end_pos) { + qed_aio_complete(acb, 0); + return; + } + + /* Find next cluster and start I/O */ + qed_find_cluster(s, &acb->request, + acb->cur_pos, acb->end_pos - acb->cur_pos, + io_fn, acb); +} + +static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, bool is_write) +{ + QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque); + + acb->is_write = is_write; + acb->qiov = qiov; + acb->cur_iov = acb->qiov->iov; + acb->cur_iov_offset = 0; + acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; + acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; + acb->request.l2_table = NULL; + qemu_iovec_init(&acb->cur_qiov, qiov->niov); + + /* Start request */ + qed_aio_next_io(acb, 0); + return &acb->common; +} + +static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, false); +} + +static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, true); +} + +static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return bdrv_aio_flush(bs->file, cb, opaque); +} + +static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset) +{ + return -ENOTSUP; /* TODO */ +} + +static int64_t bdrv_qed_getlength(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + return 
s->header.image_size; +} + +static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQEDState *s = bs->opaque; + + memset(bdi, 0, sizeof(*bdi)); + bdi->cluster_size = s->header.cluster_size; + return 0; +} + +static int bdrv_qed_change_backing_file(BlockDriverState *bs, + const char *backing_file, + const char *backing_fmt) +{ + return -ENOTSUP; /* TODO */ +} + +static int bdrv_qed_check(BlockDriverState* bs, BdrvCheckResult *result) +{ + return -ENOTSUP; /* TODO */ +} + +static QEMUOptionParameter qed_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size (in bytes)" + }, { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, { + .name = BLOCK_OPT_BACKING_FMT, + .type = OPT_STRING, + .help = "Image format of the base image" + }, { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = OPT_SIZE, + .help = "Cluster size (in bytes)" + }, { + .name = "table_size", + .type = OPT_SIZE, + .help = "L1/L2 table size (in clusters)" + }, + { /* end of list */ } +}; + +static BlockDriver bdrv_qed = { + .format_name = "qed", + .instance_size = sizeof(BDRVQEDState), + .create_options = qed_create_options, + + .bdrv_probe = bdrv_qed_probe, + .bdrv_open = bdrv_qed_open, + .bdrv_close = bdrv_qed_close, + .bdrv_create = bdrv_qed_create, + .bdrv_flush = bdrv_qed_flush, + .bdrv_is_allocated = bdrv_qed_is_allocated, + .bdrv_make_empty = bdrv_qed_make_empty, + .bdrv_aio_readv = bdrv_qed_aio_readv, + .bdrv_aio_writev = bdrv_qed_aio_writev, + .bdrv_aio_flush = bdrv_qed_aio_flush, + .bdrv_truncate = bdrv_qed_truncate, + .bdrv_getlength = bdrv_qed_getlength, + .bdrv_get_info = bdrv_qed_get_info, + .bdrv_change_backing_file = bdrv_qed_change_backing_file, + .bdrv_check = bdrv_qed_check, +}; + +static void bdrv_qed_init(void) +{ + bdrv_register(&bdrv_qed); +} + +block_init(bdrv_qed_init); diff --git a/block/qed.h b/block/qed.h new file mode 100644 index 0000000..4711fbd --- /dev/null +++ 
b/block/qed.h @@ -0,0 +1,212 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef BLOCK_QED_H +#define BLOCK_QED_H + +#include "block_int.h" + +/* The layout of a QED file is as follows: + * + * +--------+----------+----------+----------+-----+ + * | header | L1 table | cluster0 | cluster1 | ... | + * +--------+----------+----------+----------+-----+ + * + * There is a 2-level pagetable for cluster allocation: + * + * +----------+ + * | L1 table | + * +----------+ + * ,------' | '------. + * +----------+ | +----------+ + * | L2 table | ... | L2 table | + * +----------+ +----------+ + * ,------' | '------. + * +----------+ | +----------+ + * | Data | ... | Data | + * +----------+ +----------+ + * + * The L1 table is fixed size and always present. L2 tables are allocated on + * demand. The L1 table size determines the maximum possible image size; it + * can be influenced using the cluster_size and table_size values. + * + * All fields are little-endian on disk. 
+ */ + +typedef struct { + uint32_t magic; /* QED */ + + uint32_t cluster_size; /* in bytes */ + uint32_t table_size; /* table size, in clusters */ + uint32_t first_cluster; /* first usable cluster */ + + uint64_t features; /* format feature bits */ + uint64_t compat_features; /* compatible feature bits */ + uint64_t l1_table_offset; /* L1 table offset, in bytes */ + uint64_t image_size; /* total image size, in bytes */ + + uint32_t backing_file_offset; /* in bytes from start of header */ + uint32_t backing_file_size; /* in bytes */ + uint32_t backing_fmt_offset; /* in bytes from start of header */ + uint32_t backing_fmt_size; /* in bytes */ +} QEDHeader; + +typedef struct { + uint64_t offsets[0]; /* in bytes */ +} QEDTable; + +/* The L2 cache is a simple write-through cache for L2 structures */ +typedef struct CachedL2Table { + QEDTable *table; + uint64_t offset; /* offset=0 indicates an invalid entry */ + QTAILQ_ENTRY(CachedL2Table) node; + int ref; +} CachedL2Table; + +/** + * Allocate an L2 table + * + * This callback is used by the L2 cache to allocate tables without knowing + * their size or alignment requirements.
+ */ +typedef QEDTable *L2TableAllocFunc(void *opaque); + +typedef struct { + QTAILQ_HEAD(, CachedL2Table) entries; + unsigned int n_entries; + L2TableAllocFunc *alloc_l2_table; + void *alloc_l2_table_opaque; +} L2TableCache; + +typedef struct QEDRequest { + CachedL2Table *l2_table; +} QEDRequest; + +typedef struct QEDAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + int bh_ret; /* final return status for completion bh */ + QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ + bool is_write; /* false - read, true - write */ + + /* User scatter-gather list */ + QEMUIOVector *qiov; + struct iovec *cur_iov; /* current iovec to process */ + size_t cur_iov_offset; /* byte count already processed in iovec */ + + /* Current cluster scatter-gather list */ + QEMUIOVector cur_qiov; + uint64_t cur_pos; /* position on block device, in bytes */ + uint64_t end_pos; + uint64_t cur_cluster; /* cluster offset in image file */ + unsigned int cur_nclusters; /* number of clusters being accessed */ + int find_cluster_ret; /* used for L1/L2 update */ + + QEDRequest request; +} QEDAIOCB; + +typedef struct { + BlockDriverState *bs; /* device */ + uint64_t file_size; /* length of image file, in bytes */ + + QEDHeader header; /* always cpu-endian */ + QEDTable *l1_table; + L2TableCache l2_cache; /* l2 table cache */ + uint32_t table_nelems; + uint32_t l1_shift; + uint32_t l2_shift; + uint32_t l2_mask; + + /* Allocating write request queue */ + QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs; +} BDRVQEDState; + +enum { + QED_CLUSTER_FOUND, /* cluster found */ + QED_CLUSTER_L2, /* cluster missing in L2 */ + QED_CLUSTER_L1, /* cluster missing in L1 */ + QED_CLUSTER_ERROR, /* error looking up cluster */ +}; + +typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); + +/** + * Generic callback for chaining async callbacks + */ +typedef struct { + BlockDriverCompletionFunc *cb; + void *opaque; +} GenericCB; + +void *gencb_alloc(size_t len, BlockDriverCompletionFunc 
*cb, void *opaque); +void gencb_complete(void *opaque, int ret); + +/** + * L2 cache functions + */ +void qed_init_l2_cache(L2TableCache *l2_cache, L2TableAllocFunc *alloc_l2_table, void *alloc_l2_table_opaque); +void qed_free_l2_cache(L2TableCache *l2_cache); +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache); +void qed_unref_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *entry); +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset); +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); + +/** + * Table I/O functions + */ +int qed_read_l1_table(BDRVQEDState *s); +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, + BlockDriverCompletionFunc *cb, void *opaque); +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, + BlockDriverCompletionFunc *cb, void *opaque); +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque); + +/** + * Cluster functions + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t len, QEDFindClusterFunc *cb, void *opaque); + +/** + * Utility functions + */ +static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset) +{ + return offset & ~(uint64_t)(s->header.cluster_size - 1); +} + +static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset) +{ + return offset & (s->header.cluster_size - 1); +} + +static inline unsigned int qed_bytes_to_clusters(BDRVQEDState *s, size_t bytes) +{ + return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) / + s->header.cluster_size; +} + +static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos) +{ + return pos >> s->l1_shift; +} + +static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos) +{ + return (pos >> s->l2_shift) & s->l2_mask; +} + +#endif /* BLOCK_QED_H */ diff
--git a/cutils.c b/cutils.c index 036ae3c..e5b6fae 100644 --- a/cutils.c +++ b/cutils.c @@ -234,6 +234,14 @@ void qemu_iovec_from_buffer(QEMUIOVector *qiov, const void *buf, size_t count) } } +void qemu_iovec_zero(QEMUIOVector *qiov) +{ + struct iovec *iov; + for (iov = qiov->iov; iov != &qiov->iov[qiov->niov]; iov++) { + memset(iov->iov_base, 0, iov->iov_len); + } +} + #ifndef _WIN32 /* Sets a specific flag */ int fcntl_setfl(int fd, int flag) @@ -251,3 +259,48 @@ int fcntl_setfl(int fd, int flag) } #endif +/** + * Get the number of bits for a power of 2 + * + * The following is true for powers of 2: + * n == 1 << get_bits_from_size(n) + */ +int get_bits_from_size(size_t size) +{ + int res = 0; + + if (size == 0) { + return -1; + } + + while (size != 1) { + /* Not a power of two */ + if (size & 1) { + return -1; + } + + size >>= 1; + res++; + } + + return res; +} + +const char *bytes_to_str(uint64_t size) +{ + static char buffer[64]; + + if (size < (1ULL << 10)) { + snprintf(buffer, sizeof(buffer), "%" PRIu64 " byte(s)", size); + } else if (size < (1ULL << 20)) { + snprintf(buffer, sizeof(buffer), "%" PRIu64 " KB(s)", size >> 10); + } else if (size < (1ULL << 30)) { + snprintf(buffer, sizeof(buffer), "%" PRIu64 " MB(s)", size >> 20); + } else if (size < (1ULL << 40)) { + snprintf(buffer, sizeof(buffer), "%" PRIu64 " GB(s)", size >> 30); + } else { + snprintf(buffer, sizeof(buffer), "%" PRIu64 " TB(s)", size >> 40); + } + + return buffer; +} diff --git a/qemu-common.h b/qemu-common.h index dfd3dc0..754b107 100644 --- a/qemu-common.h +++ b/qemu-common.h @@ -137,6 +137,8 @@ time_t mktimegm(struct tm *tm); int qemu_fls(int i); int qemu_fdatasync(int fd); int fcntl_setfl(int fd, int flag); +int get_bits_from_size(size_t size); +const char *bytes_to_str(uint64_t size); /* path.c */ void init_paths(const char *prefix); @@ -283,6 +285,7 @@ void qemu_iovec_destroy(QEMUIOVector *qiov); void qemu_iovec_reset(QEMUIOVector *qiov); void qemu_iovec_to_buffer(QEMUIOVector *qiov, 
void *buf); void qemu_iovec_from_buffer(QEMUIOVector *qiov, const void *buf, size_t count); +void qemu_iovec_zero(QEMUIOVector *qiov); struct Monitor; typedef struct Monitor Monitor;