Patchwork [RFC,1/2] qcow2: Add QcowCache

login
register
mail settings
Submitter Kevin Wolf
Date Jan. 10, 2011, 4:53 p.m.
Message ID <1294678428-7266-2-git-send-email-kwolf@redhat.com>
Download mbox | patch
Permalink /patch/78173/
State New
Headers show

Comments

Kevin Wolf - Jan. 10, 2011, 4:53 p.m.
This adds some new cache functions to qcow2 which can be used for caching
refcount blocks and L2 tables. When used with cache=writethrough they work
like the old caching code which is spread all over qcow2, so for this case we
have merely a cleanup.

The interesting case is with writeback caching (this includes cache=none) where
data isn't written to disk immediately but only kept in cache initially. This
leads to some form of metadata write batching which avoids the current "write
to refcount block, flush, write to L2 table" pattern for each single request
when a lot of cluster allocations happen. Instead, cache entries are only
written out if its required to maintain the right order. In the pure cluster
allocation case this means that all metadata updates for requests are done in
memory initially and on sync, first the refcount blocks are written to disk,
then fsync, then L2 tables.

This improves performance of scenarios with lots of cluster allocations
noticably (e.g. installation or after taking a snapshot).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 Makefile.objs       |    2 +-
 block/qcow2-cache.c |  270 +++++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.h       |   17 +++
 3 files changed, 288 insertions(+), 1 deletions(-)
 create mode 100644 block/qcow2-cache.c
Kevin Wolf - Jan. 14, 2011, 3:22 p.m.
Am 14.01.2011 15:36, schrieb Stefan Hajnoczi:
> On Mon, Jan 10, 2011 at 4:53 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> +typedef struct QcowCachedTable {
> 
> How about using Qcow2* naming?  Just recently the old qcow_* functions
> in qcow2.c were renamed and it would be nice to continue that.

Will change it.

>> +    void*   table;
>> +    int64_t offset;
>> +    bool    dirty;
>> +    int     cache_hits;
>> +    int     ref;
>> +} QcowCachedTable;
>> +
>> +struct QcowCache {
>> +    int                     size;
>> +    QcowCachedTable*        entries;
>> +    struct QcowCache*       depends;
>> +    bool                    writethrough;
>> +};
>> +
>> +QcowCache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
>> +    bool writethrough)
>> +{
>> +    BDRVQcowState *s = bs->opaque;
>> +    QcowCache *c;
>> +    int i;
>> +
>> +    c = qemu_mallocz(sizeof(*c));
>> +    c->size = num_tables;
>> +    c->entries = qemu_mallocz(sizeof(*c->entries) * num_tables);
>> +    c->writethrough = writethrough;
>> +
>> +    for (i = 0; i < c->size; i++) {
>> +        c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
>> +    }
> 
> These could be allocated lazily.  For a single cache it doesn't
> matter, but we will have n QcowCaches where n is the number of
> dependencies?

There is one L2 cache and one refcount block cache, both initialized
only once during bdrv_open.

Also, the only dependency we have is that L2 depends on refcounts being
flushed first or vice versa, i.e. the two caches (not tables!) that
exist may depend on each other but new caches are never created.

>> +
>> +    return c;
>> +}
>> +
>> +int qcow2_cache_destroy(BlockDriverState* bs, QcowCache *c)
>> +{
>> +    int i;
>> +    int ret;
>> +
>> +    ret = qcow2_cache_flush(bs, c);
>> +
>> +    for (i = 0; i < c->size; i++) {
>> +        assert(c->entries[i].ref == 0);
>> +        qemu_vfree(c->entries[i].table);
>> +    }
>> +    qemu_free(c->entries);
> 
> And qemu_free(c)?

Right...

> 
>> +
>> +    bdrv_flush(bs->file);
>> +
>> +    return ret;
>> +}
>> +
>> +static int qcow2_cache_flush_dependency(BlockDriverState *bs, QcowCache *c)
>> +{
>> +    int ret;
>> +
>> +    ret = qcow2_cache_flush(bs, c->depends);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    c->depends = NULL;
>> +    return 0;
>> +}
>> +
>> +static int qcow2_cache_entry_flush(BlockDriverState *bs, QcowCache *c, int i)
>> +{
>> +    BDRVQcowState *s = bs->opaque;
>> +    int ret;
>> +
>> +    if (!c->entries[i].dirty || !c->entries[i].offset) {
>> +        return 0;
>> +    }
>> +
>> +    if (c->depends) {
>> +        ret = qcow2_cache_flush_dependency(bs, c);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table,
>> +        s->cluster_size);
>> +
>> +    c->entries[i].dirty = false;
> 
> In the error case we should leave this entry marked dirty.

Will fix.

> 
>> +
>> +    return ret;
>> +}
>> +
>> +int qcow2_cache_flush(BlockDriverState *bs, QcowCache *c)
>> +{
>> +    int result = 0;
>> +    int ret;
>> +    int i;
>> +
>> +    for (i = 0; i < c->size; i++) {
>> +        ret = qcow2_cache_entry_flush(bs, c, i);
>> +        if (ret < 0 && result != -ENOSPC) {
>> +            result = ret;
>> +        }
>> +    }
>> +
>> +    if (result == 0) {
>> +        ret = bdrv_flush(bs->file);
>> +        if (ret < 0) {
>> +            result = ret;
>> +        }
>> +    }
>> +
>> +    return result;
>> +}
>> +
>> +int qcow2_cache_set_dependency(BlockDriverState *bs, QcowCache *c,
>> +    QcowCache *dependency)
>> +{
>> +    int ret;
>> +
>> +    if (dependency->depends) {
>> +        ret = qcow2_cache_flush_dependency(bs, dependency);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    if (c->depends && (c->depends != dependency)) {
>> +        ret = qcow2_cache_flush_dependency(bs, c);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    c->depends = dependency;
>> +    return 0;
>> +}
>> +
>> +static int qcow2_cache_find_entry_to_replace(QcowCache *c)
>> +{
>> +    int i;
>> +    int min_count = INT_MAX;
>> +    int min_index = 0;
>> +
>> +
>> +    for (i = 0; i < c->size; i++) {
>> +        if (c->entries[i].ref) {
>> +            continue;
>> +        }
>> +
>> +        if (c->entries[i].cache_hits < min_count) {
>> +            min_index = i;
>> +            min_count = c->entries[i].cache_hits;
>> +        }
>> +
>> +        /* Give newer hits priority */
>> +        /* TODO Check how to optimize the replacement strategy */
>> +        c->entries[i].cache_hits /= 2;
>> +    }
>> +
>> +    if (c->entries[min_index].ref) {
>> +        abort(); /* TODO */
>> +    }
> 
> Initialize min_index to -1, then you don't need this check.  The
> caller could return ENOSPC or ENOBUFS if this function returns < 0.

I'll implement the -1 initialization.

The check itself is more of a reminder once we make things asynchronous
and multiple requests can use the cache at the same time. In the current
code it's more like an assert, it can never happen that there is no free
cache entry.

>> +    return min_index;
>> +}
>> +
>> +int qcow2_cache_get(BlockDriverState *bs, QcowCache *c, uint64_t offset,
>> +    void **table)
>> +{
>> +    BDRVQcowState *s = bs->opaque;
>> +    int i;
>> +    int ret;
>> +
>> +    /* Check if the table is already cached */
>> +    for (i = 0; i < c->size; i++) {
>> +        if (c->entries[i].offset == offset) {
>> +            goto found;
>> +        }
>> +    }
>> +
>> +    /* If not, write a table back and replace it */
>> +    i = qcow2_cache_find_entry_to_replace(c);
>> +    if (i < 0) {
>> +        return i;
>> +    }
>> +
>> +    ret = qcow2_cache_entry_flush(bs, c, i);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    c->entries[i].offset = 0;
>> +    ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    c->entries[i].cache_hits = 32;
> 
> 32?

It should be 42, "an arbitrary but carefully chosen number" ;-)

The point is that we don't want a new entry to be freed immediately
again, so it gets some hits for the start. I'll add a comment for that.

Kevin
Stefan Hajnoczi - Jan. 14, 2011, 4:59 p.m.
On Fri, Jan 14, 2011 at 3:22 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 14.01.2011 15:36, schrieb Stefan Hajnoczi:
>> On Mon, Jan 10, 2011 at 4:53 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>>> +    for (i = 0; i < c->size; i++) {
>>> +        c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
>>> +    }
>>
>> These could be allocated lazily.  For a single cache it doesn't
>> matter, but we will have n QcowCaches where n is the number of
>> dependencies?
>
> There is one L2 cache and one refcount block cache, both initialized
> only once during bdrv_open.
>
> Also, the only dependency we have is that L2 depends on refcounts being
> flushed first or vice versa, i.e. the two caches (not tables!) that
> exist may depend on each other but new caches are never created.

I understand now.  Without having looked at patch 2/2 I thought this
would impose ordering between allocating write requests, but it's not
needed to meet block device semantics.

Still, I'm reluctant to introduce request reordering into the stack.
The cache makes no attempt to keep things in order whatsoever.  On the
other hand, if we're using cache=writeback then the host page cache
can decide in what order to write out pages too.

>>> +    c->entries[i].offset = 0;
>>> +    ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    c->entries[i].cache_hits = 32;
>>
>> 32?
>
> It should be 42, "an arbitrary but carefully chosen number" ;-)
>
> The point is that we don't want a new entry to be freed immediately
> again, so it gets some hits for the start. I'll add a comment for that.

I see.

Stefan

Patch

diff --git a/Makefile.objs b/Makefile.objs
index d6b3d60..4f499c8 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -19,7 +19,7 @@  block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
-block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
+block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
new file mode 100644
index 0000000..8767632
--- /dev/null
+++ b/block/qcow2-cache.c
@@ -0,0 +1,270 @@ 
+/*
+ * L2/refcount table cache for the QCOW2 format
+ *
+ * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block_int.h"
+#include "qemu-common.h"
+#include "qcow2.h"
+
+typedef struct QcowCachedTable {
+    void*   table;
+    int64_t offset;
+    bool    dirty;
+    int     cache_hits;
+    int     ref;
+} QcowCachedTable;
+
+struct QcowCache {
+    int                     size;
+    QcowCachedTable*        entries;
+    struct QcowCache*       depends;
+    bool                    writethrough;
+};
+
+QcowCache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
+    bool writethrough)
+{
+    BDRVQcowState *s = bs->opaque;
+    QcowCache *c;
+    int i;
+
+    c = qemu_mallocz(sizeof(*c));
+    c->size = num_tables;
+    c->entries = qemu_mallocz(sizeof(*c->entries) * num_tables);
+    c->writethrough = writethrough;
+
+    for (i = 0; i < c->size; i++) {
+        c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
+    }
+
+    return c;
+}
+
+int qcow2_cache_destroy(BlockDriverState* bs, QcowCache *c)
+{
+    int i;
+    int ret;
+
+    ret = qcow2_cache_flush(bs, c);
+
+    for (i = 0; i < c->size; i++) {
+        assert(c->entries[i].ref == 0);
+        qemu_vfree(c->entries[i].table);
+    }
+    qemu_free(c->entries);
+
+    bdrv_flush(bs->file);
+
+    return ret;
+}
+
+static int qcow2_cache_flush_dependency(BlockDriverState *bs, QcowCache *c)
+{
+    int ret;
+
+    ret = qcow2_cache_flush(bs, c->depends);
+    if (ret < 0) {
+        return ret;
+    }
+
+    c->depends = NULL;
+    return 0;
+}
+
+static int qcow2_cache_entry_flush(BlockDriverState *bs, QcowCache *c, int i)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    if (!c->entries[i].dirty || !c->entries[i].offset) {
+        return 0;
+    }
+
+    if (c->depends) {
+        ret = qcow2_cache_flush_dependency(bs, c);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table,
+        s->cluster_size);
+
+    c->entries[i].dirty = false;
+
+    return ret;
+}
+
+int qcow2_cache_flush(BlockDriverState *bs, QcowCache *c)
+{
+    int result = 0;
+    int ret;
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        ret = qcow2_cache_entry_flush(bs, c, i);
+        if (ret < 0 && result != -ENOSPC) {
+            result = ret;
+        }
+    }
+
+    if (result == 0) {
+        ret = bdrv_flush(bs->file);
+        if (ret < 0) {
+            result = ret;
+        }
+    }
+
+    return result;
+}
+
+int qcow2_cache_set_dependency(BlockDriverState *bs, QcowCache *c,
+    QcowCache *dependency)
+{
+    int ret;
+
+    if (dependency->depends) {
+        ret = qcow2_cache_flush_dependency(bs, dependency);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    if (c->depends && (c->depends != dependency)) {
+        ret = qcow2_cache_flush_dependency(bs, c);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    c->depends = dependency;
+    return 0;
+}
+
+static int qcow2_cache_find_entry_to_replace(QcowCache *c)
+{
+    int i;
+    int min_count = INT_MAX;
+    int min_index = 0;
+
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].ref) {
+            continue;
+        }
+
+        if (c->entries[i].cache_hits < min_count) {
+            min_index = i;
+            min_count = c->entries[i].cache_hits;
+        }
+
+        /* Give newer hits priority */
+        /* TODO Check how to optimize the replacement strategy */
+        c->entries[i].cache_hits /= 2;
+    }
+
+    if (c->entries[min_index].ref) {
+        abort(); /* TODO */
+    }
+    return min_index;
+}
+
+int qcow2_cache_get(BlockDriverState *bs, QcowCache *c, uint64_t offset,
+    void **table)
+{
+    BDRVQcowState *s = bs->opaque;
+    int i;
+    int ret;
+
+    /* Check if the table is already cached */
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].offset == offset) {
+            goto found;
+        }
+    }
+
+    /* If not, write a table back and replace it */
+    i = qcow2_cache_find_entry_to_replace(c);
+    if (i < 0) {
+        return i;
+    }
+
+    ret = qcow2_cache_entry_flush(bs, c, i);
+    if (ret < 0) {
+        return ret;
+    }
+
+    c->entries[i].offset = 0;
+    ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
+    if (ret < 0) {
+        return ret;
+    }
+
+    c->entries[i].cache_hits = 32;
+    c->entries[i].offset = offset;
+
+    /* And return the right table */
+found:
+    c->entries[i].cache_hits++;
+    c->entries[i].ref++;
+    *table = c->entries[i].table;
+    return 0;
+}
+
+int qcow2_cache_put(BlockDriverState *bs, QcowCache *c, void *table)
+{
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].table == table) {
+            goto found;
+        }
+    }
+    return -ENOENT;
+
+found:
+    c->entries[i].ref--;
+    assert(c->entries[i].ref >= 0);
+
+    if (c->writethrough) {
+        return qcow2_cache_entry_flush(bs, c, i);
+    } else {
+        return 0;
+    }
+}
+
+void qcow2_cache_entry_mark_dirty(QcowCache *c, void *table)
+{
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].table == table) {
+            goto found;
+        }
+    }
+    abort();
+
+found:
+    c->entries[i].dirty = true;
+}
+
diff --git a/block/qcow2.h b/block/qcow2.h
index 5217bea..81d4853 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -78,6 +78,9 @@  typedef struct QCowSnapshot {
     uint64_t vm_clock_nsec;
 } QCowSnapshot;
 
+struct QcowCache;
+typedef struct QcowCache QcowCache;
+
 typedef struct BDRVQcowState {
     int cluster_bits;
     int cluster_size;
@@ -215,4 +218,18 @@  int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name);
 void qcow2_free_snapshots(BlockDriverState *bs);
 int qcow2_read_snapshots(BlockDriverState *bs);
 
+/* qcow2-cache.c functions */
+QcowCache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
+    bool writethrough);
+int qcow2_cache_destroy(BlockDriverState* bs, QcowCache *c);
+
+void qcow2_cache_entry_mark_dirty(QcowCache *c, void *table);
+int qcow2_cache_flush(BlockDriverState *bs, QcowCache *c);
+int qcow2_cache_set_dependency(BlockDriverState *bs, QcowCache *c,
+    QcowCache *dependency);
+
+int qcow2_cache_get(BlockDriverState *bs, QcowCache *c, uint64_t offset,
+    void **table);
+int qcow2_cache_put(BlockDriverState *bs, QcowCache *c, void *table);
+
 #endif