[v4,2/3] qcow2: Use QcowCache

Message ID 1295543412-24012-3-git-send-email-kwolf@redhat.com
State New

Commit Message

Kevin Wolf Jan. 20, 2011, 5:10 p.m. UTC
Use the new functions of qcow2-cache.c for everything that works on refcount
block and L2 tables.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cache.c    |   10 ++
 block/qcow2-cluster.c  |  207 +++++++++++++-------------------------
 block/qcow2-refcount.c |  258 ++++++++++++++++++++----------------------------
 block/qcow2.c          |   48 ++++++++-
 block/qcow2.h          |   12 ++-
 5 files changed, 239 insertions(+), 296 deletions(-)

Comments

Kevin Wolf Jan. 24, 2011, 2:54 p.m. UTC | #1
[ Re-adding qemu-devel to CC ]

On 24.01.2011 15:34, Stefan Hajnoczi wrote:
> On Thu, Jan 20, 2011 at 5:10 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> @@ -702,17 +622,30 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
>>
>>     if (m->nb_available & (s->cluster_sectors - 1)) {
>>         uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
>> +        cow = true;
>>         ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
>>                 m->nb_available - end, s->cluster_sectors);
>>         if (ret < 0)
>>             goto err;
>>     }
>>
>> -    /* update L2 table */
>> +    /*
>> +     * Update L2 table.
>> +     *
>> +     * Before we update the L2 table to actually point to the new cluster, we
>> +     * need to be sure that the refcounts have been increased and COW was
>> +     * handled.
>> +     */
>> +    if (cow) {
>> +        bdrv_flush(bs->file);
> 
> Just bdrv_flush(bs->file) and not a refcounts cache flush?

Refcounts and data need not be ordered against each other. They only
need to both be on disk when we write the L2 table.

>> +    }
>> +
>> +    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
>>     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index);
>>     if (ret < 0) {
>>         goto err;
>>     }
>> +    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
>>
>>     for (i = 0; i < m->nb_clusters; i++) {
>>         /* if two concurrent writes happen to the same unallocated cluster
>> @@ -728,16 +661,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
>>                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
>>      }
>>
>> -    /*
>> -     * Before we update the L2 table to actually point to the new cluster, we
>> -     * need to be sure that the refcounts have been increased and COW was
>> -     * handled.
>> -     */
>> -    bdrv_flush(bs->file);
>>
>> -    ret = write_l2_entries(bs, l2_table, l2_offset, l2_index, m->nb_clusters);
>> +    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
>>     if (ret < 0) {
>> -        qcow2_l2_cache_reset(bs);
>>         goto err;
>>     }
>>
> 
> The function continues like this:
> 
> /*
>  * If this was a COW, we need to decrease the refcount of the old cluster.
>  * Also flush bs->file to get the right order for L2 and refcount update.
>  */
> if (j != 0) {
>     bdrv_flush(bs->file);
>     for (i = 0; i < j; i++) {
>         qcow2_free_any_clusters(bs,
>             be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED, 1);
>     }
> }
> 
> Does bdrv_flush(bs->file) "get the right order for L2 and refcount
> update"?  We've just put an L2 table, should this be an L2 table
> flush?

I agree, this looks wrong. What we need is a dependency to ensure that
first L2 is flushed and then the refcount block.
qcow2_free_any_clusters() calls update_refcount() indirectly, which
takes care of setting this dependency.

So in the end I think it's just an unnecessary bdrv_flush. Makes sense?
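
For reference, the ordering for the free case is established by this hunk in
update_refcount() further down in the patch (excerpt from the patch below):

    /* When clusters are freed (addend < 0), make the refcount block cache
     * depend on the L2 table cache, so that dirty L2 tables are written
     * out before the refcount update can reach the disk. */
    if (addend < 0) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }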

Kevin
Stefan Hajnoczi Jan. 24, 2011, 3:26 p.m. UTC | #2
On Mon, Jan 24, 2011 at 2:54 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> [ Re-adding qemu-devel to CC ]
>
> On 24.01.2011 15:34, Stefan Hajnoczi wrote:
>> On Thu, Jan 20, 2011 at 5:10 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>>> @@ -702,17 +622,30 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
>>>
>>>     if (m->nb_available & (s->cluster_sectors - 1)) {
>>>         uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
>>> +        cow = true;
>>>         ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
>>>                 m->nb_available - end, s->cluster_sectors);
>>>         if (ret < 0)
>>>             goto err;
>>>     }
>>>
>>> -    /* update L2 table */
>>> +    /*
>>> +     * Update L2 table.
>>> +     *
>>> +     * Before we update the L2 table to actually point to the new cluster, we
>>> +     * need to be sure that the refcounts have been increased and COW was
>>> +     * handled.
>>> +     */
>>> +    if (cow) {
>>> +        bdrv_flush(bs->file);
>>
>> Just bdrv_flush(bs->file) and not a refcounts cache flush?
>
> Refcounts and data need not be ordered against each other. They only
> need to both be on disk when we write the L2 table.

Have I missed where refcounts actually get flushed from the cache out
to disk? bdrv_flush(bs->file) only syncs the file, but it doesn't write
out dirty data held in the cache.

>>> +    }
>>> +
>>> +    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
>>>     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index);
>>>     if (ret < 0) {
>>>         goto err;
>>>     }
>>> +    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
>>>
>>>     for (i = 0; i < m->nb_clusters; i++) {
>>>         /* if two concurrent writes happen to the same unallocated cluster
>>> @@ -728,16 +661,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
>>>                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
>>>      }
>>>
>>> -    /*
>>> -     * Before we update the L2 table to actually point to the new cluster, we
>>> -     * need to be sure that the refcounts have been increased and COW was
>>> -     * handled.
>>> -     */
>>> -    bdrv_flush(bs->file);
>>>
>>> -    ret = write_l2_entries(bs, l2_table, l2_offset, l2_index, m->nb_clusters);
>>> +    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
>>>     if (ret < 0) {
>>> -        qcow2_l2_cache_reset(bs);
>>>         goto err;
>>>     }
>>>
>>
>> The function continues like this:
>>
>> /*
>>  * If this was a COW, we need to decrease the refcount of the old cluster.
>>  * Also flush bs->file to get the right order for L2 and refcount update.
>>  */
>> if (j != 0) {
>>     bdrv_flush(bs->file);
>>     for (i = 0; i < j; i++) {
>>         qcow2_free_any_clusters(bs,
>>             be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED, 1);
>>     }
>> }
>>
>> Does bdrv_flush(bs->file) "get the right order for L2 and refcount
>> update"?  We've just put an L2 table, should this be an L2 table
>> flush?
>
> I agree, this looks wrong. What we need is a dependency to ensure that
> first L2 is flushed and then the refcount block.
> qcow2_free_any_clusters() calls update_refcount() indirectly, which
> takes care of setting this dependency.
>
> So in the end I think it's just an unnecessary bdrv_flush. Makes sense?

I don't understand this fully.  I've noticed that the cache isn't the
only mechanism for making changes to tables - there are functions like
write_l2_entries() that directly write out parts of tables without
honoring dependencies or using the dirty bit on the cache entry.  I
probably need to look at this more carefully to fully understand
whether or not it is correct.

Stefan
Kevin Wolf Jan. 24, 2011, 3:36 p.m. UTC | #3
On 24.01.2011 16:26, Stefan Hajnoczi wrote:
> On Mon, Jan 24, 2011 at 2:54 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> [ Re-adding qemu-devel to CC ]
>>
>> On 24.01.2011 15:34, Stefan Hajnoczi wrote:
>>> On Thu, Jan 20, 2011 at 5:10 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>>>> @@ -702,17 +622,30 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
>>>>
>>>>     if (m->nb_available & (s->cluster_sectors - 1)) {
>>>>         uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
>>>> +        cow = true;
>>>>         ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
>>>>                 m->nb_available - end, s->cluster_sectors);
>>>>         if (ret < 0)
>>>>             goto err;
>>>>     }
>>>>
>>>> -    /* update L2 table */
>>>> +    /*
>>>> +     * Update L2 table.
>>>> +     *
>>>> +     * Before we update the L2 table to actually point to the new cluster, we
>>>> +     * need to be sure that the refcounts have been increased and COW was
>>>> +     * handled.
>>>> +     */
>>>> +    if (cow) {
>>>> +        bdrv_flush(bs->file);
>>>
>>> Just bdrv_flush(bs->file) and not a refcounts cache flush?
>>
>> Refcounts and data need not be ordered against each other. They only
>> need to both be on disk when we write the L2 table.
> 
> Have I missed where refcounts actually get flushed from the cache out
> to disk? bdrv_flush(bs->file) only syncs the file, but it doesn't write
> out dirty data held in the cache.

The bdrv_flush isn't supposed to flush the refcounts, but rather the data
copied during COW (what you call pre/postfill in QED).

The refcounts are handled by the qcow2_cache_set_dependency below, so
that before writing the L2 tables we always write the refcounts first.
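
The dependency machinery itself lives in qcow2-cache.c and is not part of
this patch. A minimal sketch of the idea, with simplified types and invented
names (the real Qcow2Cache code may differ in detail):

    /* Each cache remembers at most one other cache that has to reach the
     * disk first. */
    struct sketch_cache {
        struct sketch_cache *depends;
        /* ... cached tables, offsets, dirty flags ... */
    };

    static int sketch_cache_flush(struct sketch_cache *c);

    /* Called before any dirty entry of 'c' is written back: flush the
     * recorded dependency first, then drop it. */
    static int sketch_flush_dependency(struct sketch_cache *c)
    {
        if (c->depends) {
            int ret = sketch_cache_flush(c->depends);
            if (ret < 0) {
                return ret;
            }
            c->depends = NULL;
        }
        return 0;
    }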

>>>> +    }
>>>> +
>>>> +    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
>>>>     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index);
>>>>     if (ret < 0) {
>>>>         goto err;
>>>>     }
>>>> +    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
>>>>
>>>>     for (i = 0; i < m->nb_clusters; i++) {
>>>>         /* if two concurrent writes happen to the same unallocated cluster
>>>> @@ -728,16 +661,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
>>>>                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
>>>>      }
>>>>
>>>> -    /*
>>>> -     * Before we update the L2 table to actually point to the new cluster, we
>>>> -     * need to be sure that the refcounts have been increased and COW was
>>>> -     * handled.
>>>> -     */
>>>> -    bdrv_flush(bs->file);
>>>>
>>>> -    ret = write_l2_entries(bs, l2_table, l2_offset, l2_index, m->nb_clusters);
>>>> +    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
>>>>     if (ret < 0) {
>>>> -        qcow2_l2_cache_reset(bs);
>>>>         goto err;
>>>>     }
>>>>
>>>
>>> The function continues like this:
>>>
>>> /*
>>>  * If this was a COW, we need to decrease the refcount of the old cluster.
>>>  * Also flush bs->file to get the right order for L2 and refcount update.
>>>  */
>>> if (j != 0) {
>>>     bdrv_flush(bs->file);
>>>     for (i = 0; i < j; i++) {
>>>         qcow2_free_any_clusters(bs,
>>>             be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED, 1);
>>>     }
>>> }
>>>
>>> Does bdrv_flush(bs->file) "get the right order for L2 and refcount
>>> update"?  We've just put an L2 table, should this be an L2 table
>>> flush?
>>
>> I agree, this looks wrong. What we need is a dependency to ensure that
>> first L2 is flushed and then the refcount block.
>> qcow2_free_any_clusters() calls update_refcount() indirectly, which
>> takes care of setting this dependency.
>>
>> So in the end I think it's just an unnecessary bdrv_flush. Makes sense?
> 
> I don't understand this fully.  I've noticed that the cache isn't the
> only mechanism for making changes to tables - there are functions like
> write_l2_entries() that directly write out parts of tables without
> honoring dependencies or using the dirty bit on the cache entry.  I
> probably need to look at this more carefully to fully understand
> whether or not it is correct.

No, the cache should be the only mechanism that is used for accessing L2
tables and refcount blocks. write_l2_entries() is an old function that
is removed by the patch.

Direct accesses should only be left for L1 tables and refcount tables.
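
For context, the access pattern that this patch converts everything to looks
like this (the same qcow2_cache_get/put calls appear throughout the hunks
below; error handling trimmed):

    uint64_t *l2_table;

    /* Pin the table in the cache, reading it from disk if necessary */
    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) &l2_table);
    if (ret < 0) {
        return ret;
    }

    /* Modify the table in place and mark the cache entry dirty */
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
    l2_table[l2_index] = cpu_to_be64(cluster_offset);

    /* Unpin it; write-back happens on flush or eviction (immediately if the
     * cache was created in writethrough mode) */
    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);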

Kevin
Stefan Hajnoczi Jan. 24, 2011, 3:39 p.m. UTC | #4
On Mon, Jan 24, 2011 at 3:36 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> On 24.01.2011 16:26, Stefan Hajnoczi wrote:
>> On Mon, Jan 24, 2011 at 2:54 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>>> [ Re-adding qemu-devel to CC ]
>>>
>>> On 24.01.2011 15:34, Stefan Hajnoczi wrote:
>>>> On Thu, Jan 20, 2011 at 5:10 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>>>>> @@ -702,17 +622,30 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
>>>>>
>>>>>     if (m->nb_available & (s->cluster_sectors - 1)) {
>>>>>         uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
>>>>> +        cow = true;
>>>>>         ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
>>>>>                 m->nb_available - end, s->cluster_sectors);
>>>>>         if (ret < 0)
>>>>>             goto err;
>>>>>     }
>>>>>
>>>>> -    /* update L2 table */
>>>>> +    /*
>>>>> +     * Update L2 table.
>>>>> +     *
>>>>> +     * Before we update the L2 table to actually point to the new cluster, we
>>>>> +     * need to be sure that the refcounts have been increased and COW was
>>>>> +     * handled.
>>>>> +     */
>>>>> +    if (cow) {
>>>>> +        bdrv_flush(bs->file);
>>>>
>>>> Just bdrv_flush(bs->file) and not a refcounts cache flush?
>>>
>>> Refcounts and data need not be ordered against each other. They only
>>> need to both be on disk when we write the L2 table.
>>
>> Have I missed where refcounts actually get flushed from the cache out
>> to disk? bdrv_flush(bs->file) only syncs the file, but it doesn't write
>> out dirty data held in the cache.
>
> The bdrv_flush isn't supposed to flush the refcounts, but rather the data
> copied during COW (what you call pre/postfill in QED).
>
> The refcounts are handled by the qcow2_cache_set_dependency below, so
> that before writing the L2 tables we always write the refcounts first.
>
>>>>> +    }
>>>>> +
>>>>> +    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
>>>>>     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index);
>>>>>     if (ret < 0) {
>>>>>         goto err;
>>>>>     }
>>>>> +    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
>>>>>
>>>>>     for (i = 0; i < m->nb_clusters; i++) {
>>>>>         /* if two concurrent writes happen to the same unallocated cluster
>>>>> @@ -728,16 +661,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
>>>>>                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
>>>>>      }
>>>>>
>>>>> -    /*
>>>>> -     * Before we update the L2 table to actually point to the new cluster, we
>>>>> -     * need to be sure that the refcounts have been increased and COW was
>>>>> -     * handled.
>>>>> -     */
>>>>> -    bdrv_flush(bs->file);
>>>>>
>>>>> -    ret = write_l2_entries(bs, l2_table, l2_offset, l2_index, m->nb_clusters);
>>>>> +    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
>>>>>     if (ret < 0) {
>>>>> -        qcow2_l2_cache_reset(bs);
>>>>>         goto err;
>>>>>     }
>>>>>
>>>>
>>>> The function continues like this:
>>>>
>>>> /*
>>>>  * If this was a COW, we need to decrease the refcount of the old cluster.
>>>>  * Also flush bs->file to get the right order for L2 and refcount update.
>>>>  */
>>>> if (j != 0) {
>>>>     bdrv_flush(bs->file);
>>>>     for (i = 0; i < j; i++) {
>>>>         qcow2_free_any_clusters(bs,
>>>>             be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED, 1);
>>>>     }
>>>> }
>>>>
>>>> Does bdrv_flush(bs->file) "get the right order for L2 and refcount
>>>> update"?  We've just put an L2 table, should this be an L2 table
>>>> flush?
>>>
>>> I agree, this looks wrong. What we need is a dependency to ensure that
>>> first L2 is flushed and then the refcount block.
>>> qcow2_free_any_clusters() calls update_refcount() indirectly, which
>>> takes care of setting this dependency.
>>>
>>> So in the end I think it's just an unnecessary bdrv_flush. Makes sense?
>>
>> I don't understand this fully.  I've noticed that the cache isn't the
>> only mechanism for making changes to tables - there are functions like
>> write_l2_entries() that directly write out parts of tables without
>> honoring dependencies or using the dirty bit on the cache entry.  I
>> probably need to look at this more carefully to fully understand
>> whether or not it is correct.
>
> No, the cache should be the only mechanism that is used for accessing L2
> tables and refcount blocks. write_l2_entries() is an old function that
> is removed by the patch.
>
> Direct accesses should only be left for L1 tables and refcount tables.

Agreed.  I was switching between branches and had looked at a
non-qcowcache world!

Stefan
Avi Kivity Feb. 9, 2011, 11:19 a.m. UTC | #5
On 01/20/2011 07:10 PM, Kevin Wolf wrote:
> Use the new functions of qcow2-cache.c for everything that works on refcount
> block and L2 tables.
>

While autotesting qemu-kvm's stable-0.14 branch, I found that this commit
causes failures.  The scenario is:

- install OS
- reboot
- hang at GRUB prompt
- qemu-img reports hundreds of leaked clusters

The actual commit I am testing is in qemu-kvm.git 
qcow2-qcowcache-failure (tag).  The test runs with -drive ,cache=unsafe.

> @@ -719,7 +727,13 @@ static void qcow2_close(BlockDriverState *bs)
>   {
>       BDRVQcowState *s = bs->opaque;
>       qemu_free(s->l1_table);
> -    qemu_free(s->l2_cache);
> +
> +    qcow2_cache_flush(bs, s->l2_table_cache);
> +    qcow2_cache_flush(bs, s->refcount_block_cache);
> +
> +    qcow2_cache_destroy(bs, s->l2_table_cache);
> +    qcow2_cache_destroy(bs, s->refcount_block_cache);
> +
>       qemu_free(s->cluster_cache);
>       qemu_free(s->cluster_data);
>       qcow2_refcount_close(bs);

This is of course the immediate suspect, but it appears correct.
Perhaps it isn't called, or is called in the wrong place?
Avi Kivity Feb. 9, 2011, 11:23 a.m. UTC | #6
On 02/09/2011 01:19 PM, Avi Kivity wrote:
> On 01/20/2011 07:10 PM, Kevin Wolf wrote:
>> Use the new functions of qcow2-cache.c for everything that works on 
>> refcount
>> block and L2 tables.
>>
>
> While autotesting qemu-kvm's stable-0.14 branch, I found that this commit
> causes failures.  The scenario is:
>
> - install OS
> - reboot
> - hang at GRUB prompt
> - qemu-img reports hundreds of leaked clusters
>
> The actual commit I am testing is in qemu-kvm.git 
> qcow2-qcowcache-failure (tag).  The test runs with -drive ,cache=unsafe.

Got it - it's specific to qemu-kvm; the call to bdrv_close_all() wasn't 
replicated in qemu-kvm's main loop.
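
Put differently, the flushes in qcow2_close() only help if the image is
actually closed on shutdown. A hypothetical sketch of the missing call (not
the actual qemu-kvm fix):

    static void main_loop_cleanup(void)
    {
        /* Without this, dirty L2/refcount blocks held in the QcowCache are
         * never written to the image file when running with cache=unsafe. */
        bdrv_close_all();
    }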

Patch

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index f7c4e2a..5f1740b 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -104,6 +104,12 @@  static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
         }
     }
 
+    if (c == s->refcount_block_cache) {
+        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
+    } else if (c == s->l2_table_cache) {
+        BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
+    }
+
     ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table,
         s->cluster_size);
     if (ret < 0) {
@@ -218,6 +224,10 @@  static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
 
     c->entries[i].offset = 0;
     if (read_from_disk) {
+        if (c == s->l2_table_cache) {
+            BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
+        }
+
         ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
         if (ret < 0) {
             return ret;
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index c3ef550..7fa4fa1 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -67,7 +67,11 @@  int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
         qemu_free(new_l1_table);
         return new_l1_table_offset;
     }
-    bdrv_flush(bs->file);
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        return ret;
+    }
 
     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
     for(i = 0; i < s->l1_size; i++)
@@ -98,63 +102,6 @@  int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
     return ret;
 }
 
-void qcow2_l2_cache_reset(BlockDriverState *bs)
-{
-    BDRVQcowState *s = bs->opaque;
-
-    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
-    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
-    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
-}
-
-static inline int l2_cache_new_entry(BlockDriverState *bs)
-{
-    BDRVQcowState *s = bs->opaque;
-    uint32_t min_count;
-    int min_index, i;
-
-    /* find a new entry in the least used one */
-    min_index = 0;
-    min_count = 0xffffffff;
-    for(i = 0; i < L2_CACHE_SIZE; i++) {
-        if (s->l2_cache_counts[i] < min_count) {
-            min_count = s->l2_cache_counts[i];
-            min_index = i;
-        }
-    }
-    return min_index;
-}
-
-/*
- * seek_l2_table
- *
- * seek l2_offset in the l2_cache table
- * if not found, return NULL,
- * if found,
- *   increments the l2 cache hit count of the entry,
- *   if counter overflow, divide by two all counters
- *   return the pointer to the l2 cache entry
- *
- */
-
-static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
-{
-    int i, j;
-
-    for(i = 0; i < L2_CACHE_SIZE; i++) {
-        if (l2_offset == s->l2_cache_offsets[i]) {
-            /* increment the hit count */
-            if (++s->l2_cache_counts[i] == 0xffffffff) {
-                for(j = 0; j < L2_CACHE_SIZE; j++) {
-                    s->l2_cache_counts[j] >>= 1;
-                }
-            }
-            return s->l2_cache + (i << s->l2_bits);
-        }
-    }
-    return NULL;
-}
-
 /*
  * l2_load
  *
@@ -169,33 +116,11 @@  static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
     uint64_t **l2_table)
 {
     BDRVQcowState *s = bs->opaque;
-    int min_index;
     int ret;
 
-    /* seek if the table for the given offset is in the cache */
-
-    *l2_table = seek_l2_table(s, l2_offset);
-    if (*l2_table != NULL) {
-        return 0;
-    }
-
-    /* not found: load a new entry in the least used one */
-
-    min_index = l2_cache_new_entry(bs);
-    *l2_table = s->l2_cache + (min_index << s->l2_bits);
-
-    BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
-    ret = bdrv_pread(bs->file, l2_offset, *l2_table,
-        s->l2_size * sizeof(uint64_t));
-    if (ret < 0) {
-        qcow2_l2_cache_reset(bs);
-        return ret;
-    }
+    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
 
-    s->l2_cache_offsets[min_index] = l2_offset;
-    s->l2_cache_counts[min_index] = 1;
-
-    return 0;
+    return ret;
 }
 
 /*
@@ -238,7 +163,6 @@  static int write_l1_entry(BlockDriverState *bs, int l1_index)
 static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
 {
     BDRVQcowState *s = bs->opaque;
-    int min_index;
     uint64_t old_l2_offset;
     uint64_t *l2_table;
     int64_t l2_offset;
@@ -252,29 +176,48 @@  static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
     if (l2_offset < 0) {
         return l2_offset;
     }
-    bdrv_flush(bs->file);
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        goto fail;
+    }
 
     /* allocate a new entry in the l2 cache */
 
-    min_index = l2_cache_new_entry(bs);
-    l2_table = s->l2_cache + (min_index << s->l2_bits);
+    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
+    if (ret < 0) {
+        return ret;
+    }
+
+    l2_table = *table;
 
     if (old_l2_offset == 0) {
         /* if there was no old l2 table, clear the new table */
         memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
     } else {
+        uint64_t* old_table;
+
         /* if there was an old l2 table, read it from the disk */
         BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
-        ret = bdrv_pread(bs->file, old_l2_offset, l2_table,
-            s->l2_size * sizeof(uint64_t));
+        ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_offset,
+            (void**) &old_table);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        memcpy(l2_table, old_table, s->cluster_size);
+
+        ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table);
         if (ret < 0) {
             goto fail;
         }
     }
+
     /* write the l2 table to the file */
     BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
-    ret = bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
-        s->l2_size * sizeof(uint64_t));
+
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
     if (ret < 0) {
         goto fail;
     }
@@ -286,17 +229,12 @@  static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
         goto fail;
     }
 
-    /* update the l2 cache entry */
-
-    s->l2_cache_offsets[min_index] = l2_offset;
-    s->l2_cache_counts[min_index] = 1;
-
     *table = l2_table;
     return 0;
 
 fail:
+    qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
     s->l1_table[l1_index] = old_l2_offset;
-    qcow2_l2_cache_reset(bs);
     return ret;
 }
 
@@ -521,6 +459,8 @@  int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
                 &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
     }
 
+    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+
    nb_available = (c * s->cluster_sectors);
 out:
     if (nb_available > nb_needed)
@@ -575,6 +515,7 @@  static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
             return ret;
         }
     } else {
+        /* FIXME Order */
         if (l2_offset)
             qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
         ret = l2_allocate(bs, l1_index, &l2_table);
@@ -632,6 +573,7 @@  uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 
     cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
     if (cluster_offset < 0) {
+        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
         return 0;
     }
 
@@ -646,38 +588,14 @@  uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
     /* compressed clusters never have the copied flag */
 
     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
     l2_table[l2_index] = cpu_to_be64(cluster_offset);
-    if (bdrv_pwrite_sync(bs->file,
-                    l2_offset + l2_index * sizeof(uint64_t),
-                    l2_table + l2_index,
-                    sizeof(uint64_t)) < 0)
-        return 0;
-
-    return cluster_offset;
-}
-
-/*
- * Write L2 table updates to disk, writing whole sectors to avoid a
- * read-modify-write in bdrv_pwrite
- */
-#define L2_ENTRIES_PER_SECTOR (512 / 8)
-static int write_l2_entries(BlockDriverState *bs, uint64_t *l2_table,
-    uint64_t l2_offset, int l2_index, int num)
-{
-    int l2_start_index = l2_index & ~(L1_ENTRIES_PER_SECTOR - 1);
-    int start_offset = (8 * l2_index) & ~511;
-    int end_offset = (8 * (l2_index + num) + 511) & ~511;
-    size_t len = end_offset - start_offset;
-    int ret;
-
-    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
-    ret = bdrv_pwrite(bs->file, l2_offset + start_offset,
-        &l2_table[l2_start_index], len);
+    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
     if (ret < 0) {
-        return ret;
+        return 0;
     }
 
-    return 0;
+    return cluster_offset;
 }
 
 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
@@ -686,6 +604,7 @@  int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     int i, j = 0, l2_index, ret;
     uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
     uint64_t cluster_offset = m->cluster_offset;
+    bool cow = false;
 
     if (m->nb_clusters == 0)
         return 0;
@@ -695,6 +614,7 @@  int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     /* copy content of unmodified sectors */
     start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
     if (m->n_start) {
+        cow = true;
         ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
         if (ret < 0)
             goto err;
@@ -702,17 +622,30 @@  int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 
     if (m->nb_available & (s->cluster_sectors - 1)) {
         uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
+        cow = true;
         ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
                 m->nb_available - end, s->cluster_sectors);
         if (ret < 0)
             goto err;
     }
 
-    /* update L2 table */
+    /*
+     * Update L2 table.
+     *
+     * Before we update the L2 table to actually point to the new cluster, we
+     * need to be sure that the refcounts have been increased and COW was
+     * handled.
+     */
+    if (cow) {
+        bdrv_flush(bs->file);
+    }
+
+    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index);
     if (ret < 0) {
         goto err;
     }
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 
     for (i = 0; i < m->nb_clusters; i++) {
         /* if two concurrent writes happen to the same unallocated cluster
@@ -728,16 +661,9 @@  int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
      }
 
-    /*
-     * Before we update the L2 table to actually point to the new cluster, we
-     * need to be sure that the refcounts have been increased and COW was
-     * handled.
-     */
-    bdrv_flush(bs->file);
 
-    ret = write_l2_entries(bs, l2_table, l2_offset, l2_index, m->nb_clusters);
+    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
     if (ret < 0) {
-        qcow2_l2_cache_reset(bs);
         goto err;
     }
 
@@ -868,7 +794,8 @@  int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
                 m->depends_on = old_alloc;
                 m->nb_clusters = 0;
                 *num = 0;
-                return 0;
+                ret = 0;
+                goto fail;
             }
         }
     }
@@ -884,7 +811,8 @@  int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     cluster_offset = qcow2_alloc_clusters(bs, nb_clusters * s->cluster_size);
     if (cluster_offset < 0) {
         QLIST_REMOVE(m, next_in_flight);
-        return cluster_offset;
+        ret = cluster_offset;
+        goto fail;
     }
 
     /* save info needed for meta data update */
@@ -893,12 +821,21 @@  int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     m->nb_clusters = nb_clusters;
 
 out:
+    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    if (ret < 0) {
+        return ret;
+    }
+
     m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
     m->cluster_offset = cluster_offset;
 
     *num = m->nb_available - n_start;
 
     return 0;
+
+fail:
+    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    return ret;
 }
 
 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index a10453c..7725147 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -32,27 +32,6 @@  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                             int addend);
 
 
-static int cache_refcount_updates = 0;
-
-static int write_refcount_block(BlockDriverState *bs)
-{
-    BDRVQcowState *s = bs->opaque;
-    size_t size = s->cluster_size;
-
-    if (s->refcount_block_cache_offset == 0) {
-        return 0;
-    }
-
-    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE);
-    if (bdrv_pwrite_sync(bs->file, s->refcount_block_cache_offset,
-            s->refcount_block_cache, size) < 0)
-    {
-        return -EIO;
-    }
-
-    return 0;
-}
-
 /*********************************************************/
 /* refcount handling */
 
@@ -61,7 +40,6 @@  int qcow2_refcount_init(BlockDriverState *bs)
     BDRVQcowState *s = bs->opaque;
     int ret, refcount_table_size2, i;
 
-    s->refcount_block_cache = qemu_malloc(s->cluster_size);
     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
     s->refcount_table = qemu_malloc(refcount_table_size2);
     if (s->refcount_table_size > 0) {
@@ -81,34 +59,22 @@  int qcow2_refcount_init(BlockDriverState *bs)
 void qcow2_refcount_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
-    qemu_free(s->refcount_block_cache);
     qemu_free(s->refcount_table);
 }
 
 
 static int load_refcount_block(BlockDriverState *bs,
-                               int64_t refcount_block_offset)
+                               int64_t refcount_block_offset,
+                               void **refcount_block)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
 
-    if (cache_refcount_updates) {
-        ret = write_refcount_block(bs);
-        if (ret < 0) {
-            return ret;
-        }
-    }
-
     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
-    ret = bdrv_pread(bs->file, refcount_block_offset, s->refcount_block_cache,
-                     s->cluster_size);
-    if (ret < 0) {
-        s->refcount_block_cache_offset = 0;
-        return ret;
-    }
+    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+        refcount_block);
 
-    s->refcount_block_cache_offset = refcount_block_offset;
-    return 0;
+    return ret;
 }
 
 /*
@@ -122,6 +88,8 @@  static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
     int refcount_table_index, block_index;
     int64_t refcount_block_offset;
     int ret;
+    uint16_t *refcount_block;
+    uint16_t refcount;
 
     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
     if (refcount_table_index >= s->refcount_table_size)
@@ -129,16 +97,24 @@  static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
     refcount_block_offset = s->refcount_table[refcount_table_index];
     if (!refcount_block_offset)
         return 0;
-    if (refcount_block_offset != s->refcount_block_cache_offset) {
-        /* better than nothing: return allocated if read error */
-        ret = load_refcount_block(bs, refcount_block_offset);
-        if (ret < 0) {
-            return ret;
-        }
+
+    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+        (void**) &refcount_block);
+    if (ret < 0) {
+        return ret;
     }
+
     block_index = cluster_index &
         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
-    return be16_to_cpu(s->refcount_block_cache[block_index]);
+    refcount = be16_to_cpu(refcount_block[block_index]);
+
+    ret = qcow2_cache_put(bs, s->refcount_block_cache,
+        (void**) &refcount_block);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return refcount;
 }
 
 /*
@@ -174,9 +150,10 @@  static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
  * Loads a refcount block. If it doesn't exist yet, it is allocated first
  * (including growing the refcount table if needed).
  *
- * Returns the offset of the refcount block on success or -errno in error case
+ * Returns 0 on success or -errno in error case
  */
-static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
+static int alloc_refcount_block(BlockDriverState *bs,
+    int64_t cluster_index, uint16_t **refcount_block)
 {
     BDRVQcowState *s = bs->opaque;
     unsigned int refcount_table_index;
@@ -194,13 +171,8 @@  static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
 
         /* If it's already there, we're done */
         if (refcount_block_offset) {
-            if (refcount_block_offset != s->refcount_block_cache_offset) {
-                ret = load_refcount_block(bs, refcount_block_offset);
-                if (ret < 0) {
-                    return ret;
-                }
-            }
-            return refcount_block_offset;
+             return load_refcount_block(bs, refcount_block_offset,
+                 (void**) refcount_block);
         }
     }
 
@@ -226,12 +198,10 @@  static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
      *   refcount block into the cache
      */
 
-    if (cache_refcount_updates) {
-        ret = write_refcount_block(bs);
-        if (ret < 0) {
-            return ret;
-        }
-    }
+    *refcount_block = NULL;
+
+    /* We write to the refcount table, so we might depend on L2 tables */
+    qcow2_cache_flush(bs, s->l2_table_cache);
 
     /* Allocate the refcount block itself and mark it as used */
     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
@@ -247,13 +217,18 @@  static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
 
     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
         /* Zero the new refcount block before updating it */
-        memset(s->refcount_block_cache, 0, s->cluster_size);
-        s->refcount_block_cache_offset = new_block;
+        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+            (void**) refcount_block);
+        if (ret < 0) {
+            goto fail_block;
+        }
+
+        memset(*refcount_block, 0, s->cluster_size);
 
         /* The block describes itself, need to update the cache */
         int block_index = (new_block >> s->cluster_bits) &
             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
-        s->refcount_block_cache[block_index] = cpu_to_be16(1);
+        (*refcount_block)[block_index] = cpu_to_be16(1);
     } else {
         /* Described somewhere else. This can recurse at most twice before we
          * arrive at a block that describes itself. */
@@ -266,14 +241,19 @@  static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
 
         /* Initialize the new refcount block only after updating its refcount,
          * update_refcount uses the refcount cache itself */
-        memset(s->refcount_block_cache, 0, s->cluster_size);
-        s->refcount_block_cache_offset = new_block;
+        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+            (void**) refcount_block);
+        if (ret < 0) {
+            goto fail_block;
+        }
+
+        memset(*refcount_block, 0, s->cluster_size);
     }
 
     /* Now the new refcount block needs to be written to disk */
     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
-    ret = bdrv_pwrite_sync(bs->file, new_block, s->refcount_block_cache,
-        s->cluster_size);
+    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
     if (ret < 0) {
         goto fail_block;
     }
@@ -293,6 +273,11 @@  static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
         return new_block;
     }
 
+    ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+    if (ret < 0) {
+        goto fail_block;
+    }
+
     /*
      * If we come here, we need to grow the refcount table. Again, a new
      * refcount table needs some space and we can't simply allocate to avoid
@@ -410,9 +395,9 @@  static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
     s->free_cluster_index = old_free_cluster_index;
 
-    ret = load_refcount_block(bs, new_block);
+    ret = load_refcount_block(bs, new_block, (void**) refcount_block);
     if (ret < 0) {
-        goto fail_block;
+        return ret;
     }
 
     return new_block;
@@ -420,41 +405,10 @@  static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
 fail_table:
     qemu_free(new_table);
 fail_block:
-    s->refcount_block_cache_offset = 0;
-    return ret;
-}
-
-#define REFCOUNTS_PER_SECTOR (512 >> REFCOUNT_SHIFT)
-static int write_refcount_block_entries(BlockDriverState *bs,
-    int64_t refcount_block_offset, int first_index, int last_index)
-{
-    BDRVQcowState *s = bs->opaque;
-    size_t size;
-    int ret;
-
-    if (cache_refcount_updates) {
-        return 0;
-    }
-
-    if (first_index < 0) {
-        return 0;
-    }
-
-    first_index &= ~(REFCOUNTS_PER_SECTOR - 1);
-    last_index = (last_index + REFCOUNTS_PER_SECTOR)
-        & ~(REFCOUNTS_PER_SECTOR - 1);
-
-    size = (last_index - first_index) << REFCOUNT_SHIFT;
-
-    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
-    ret = bdrv_pwrite(bs->file,
-        refcount_block_offset + (first_index << REFCOUNT_SHIFT),
-        &s->refcount_block_cache[first_index], size);
-    if (ret < 0) {
-        return ret;
+    if (*refcount_block != NULL) {
+        qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
     }
-
-    return 0;
+    return ret;
 }
 
 /* XXX: cache several refcount block clusters ? */
@@ -463,9 +417,8 @@  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 {
     BDRVQcowState *s = bs->opaque;
     int64_t start, last, cluster_offset;
-    int64_t refcount_block_offset = 0;
-    int64_t table_index = -1, old_table_index;
-    int first_index = -1, last_index = -1;
+    uint16_t *refcount_block = NULL;
+    int64_t old_table_index = -1;
     int ret;
 
 #ifdef DEBUG_ALLOC2
@@ -478,6 +431,11 @@  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         return 0;
     }
 
+    if (addend < 0) {
+        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
+            s->l2_table_cache);
+    }
+
     start = offset & ~(s->cluster_size - 1);
     last = (offset + length - 1) & ~(s->cluster_size - 1);
     for(cluster_offset = start; cluster_offset <= last;
@@ -485,42 +443,33 @@  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
     {
         int block_index, refcount;
         int64_t cluster_index = cluster_offset >> s->cluster_bits;
-        int64_t new_block;
+        int64_t table_index =
+            cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
 
-        /* Only write refcount block to disk when we are done with it */
-        old_table_index = table_index;
-        table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
-        if ((old_table_index >= 0) && (table_index != old_table_index)) {
+        /* Load the refcount block and allocate it if needed */
+        if (table_index != old_table_index) {
+            if (refcount_block) {
+                ret = qcow2_cache_put(bs, s->refcount_block_cache,
+                    (void**) &refcount_block);
+                if (ret < 0) {
+                    goto fail;
+                }
+            }
 
-            ret = write_refcount_block_entries(bs, refcount_block_offset,
-                first_index, last_index);
+            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
             if (ret < 0) {
-                return ret;
+                goto fail;
             }
-
-            first_index = -1;
-            last_index = -1;
         }
+        old_table_index = table_index;
 
-        /* Load the refcount block and allocate it if needed */
-        new_block = alloc_refcount_block(bs, cluster_index);
-        if (new_block < 0) {
-            ret = new_block;
-            goto fail;
-        }
-        refcount_block_offset = new_block;
+        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
 
         /* we can update the count and save it */
         block_index = cluster_index &
             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
-        if (first_index == -1 || block_index < first_index) {
-            first_index = block_index;
-        }
-        if (block_index > last_index) {
-            last_index = block_index;
-        }
 
-        refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
+        refcount = be16_to_cpu(refcount_block[block_index]);
         refcount += addend;
         if (refcount < 0 || refcount > 0xffff) {
             ret = -EINVAL;
@@ -529,17 +478,16 @@  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         if (refcount == 0 && cluster_index < s->free_cluster_index) {
             s->free_cluster_index = cluster_index;
         }
-        s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
+        refcount_block[block_index] = cpu_to_be16(refcount);
     }
 
     ret = 0;
 fail:
-
     /* Write last changed block to disk */
-    if (refcount_block_offset != 0) {
+    if (refcount_block) {
         int wret;
-        wret = write_refcount_block_entries(bs, refcount_block_offset,
-            first_index, last_index);
+        wret = qcow2_cache_put(bs, s->refcount_block_cache,
+            (void**) &refcount_block);
         if (wret < 0) {
             return ret < 0 ? ret : wret;
         }
@@ -758,9 +706,7 @@  int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
     int64_t old_offset, old_l2_offset;
     int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
-
-    qcow2_l2_cache_reset(bs);
-    cache_refcount_updates = 1;
+    int ret;
 
     l2_table = NULL;
     l1_table = NULL;
@@ -784,7 +730,6 @@  int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     }
 
     l2_size = s->l2_size * sizeof(uint64_t);
-    l2_table = qemu_malloc(l2_size);
     l1_modified = 0;
     for(i = 0; i < l1_size; i++) {
         l2_offset = l1_table[i];
@@ -792,8 +737,13 @@  int qcow2_update_snapshot_refcount(BlockDriverState *bs,
             old_l2_offset = l2_offset;
             l2_offset &= ~QCOW_OFLAG_COPIED;
             l2_modified = 0;
-            if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
+
+            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
+                (void**) &l2_table);
+            if (ret < 0) {
                 goto fail;
+            }
+
             for(j = 0; j < s->l2_size; j++) {
                 offset = be64_to_cpu(l2_table[j]);
                 if (offset != 0) {
@@ -833,17 +783,23 @@  int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                         offset |= QCOW_OFLAG_COPIED;
                     }
                     if (offset != old_offset) {
+                        if (addend > 0) {
+                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
+                                s->refcount_block_cache);
+                        }
                         l2_table[j] = cpu_to_be64(offset);
                         l2_modified = 1;
+                        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                     }
                 }
             }
-            if (l2_modified) {
-                if (bdrv_pwrite_sync(bs->file,
-                                l2_offset, l2_table, l2_size) < 0)
-                    goto fail;
+
+            ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+            if (ret < 0) {
+                goto fail;
             }
 
+
             if (addend != 0) {
                 refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
             } else {
@@ -871,16 +827,14 @@  int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     }
     if (l1_allocated)
         qemu_free(l1_table);
-    qemu_free(l2_table);
-    cache_refcount_updates = 0;
-    write_refcount_block(bs);
     return 0;
  fail:
+    if (l2_table) {
+        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    }
+
     if (l1_allocated)
         qemu_free(l1_table);
-    qemu_free(l2_table);
-    cache_refcount_updates = 0;
-    write_refcount_block(bs);
     return -EIO;
 }
 
diff --git a/block/qcow2.c b/block/qcow2.c
index b6b094c..49bf7b9 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -143,6 +143,7 @@  static int qcow2_open(BlockDriverState *bs, int flags)
     int len, i, ret = 0;
     QCowHeader header;
     uint64_t ext_end;
+    bool writethrough;
 
     ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
     if (ret < 0) {
@@ -217,8 +218,13 @@  static int qcow2_open(BlockDriverState *bs, int flags)
             be64_to_cpus(&s->l1_table[i]);
         }
     }
-    /* alloc L2 cache */
-    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+
+    /* alloc L2 table/refcount block cache */
+    writethrough = ((flags & BDRV_O_CACHE_MASK) == 0);
+    s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE, writethrough);
+    s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE,
+        writethrough);
+
     s->cluster_cache = qemu_malloc(s->cluster_size);
     /* one more sector for decompressed data alignment */
     s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
@@ -270,7 +276,9 @@  static int qcow2_open(BlockDriverState *bs, int flags)
     qcow2_free_snapshots(bs);
     qcow2_refcount_close(bs);
     qemu_free(s->l1_table);
-    qemu_free(s->l2_cache);
+    if (s->l2_table_cache) {
+        qcow2_cache_destroy(bs, s->l2_table_cache);
+    }
     qemu_free(s->cluster_cache);
     qemu_free(s->cluster_data);
     return ret;
@@ -719,7 +727,13 @@  static void qcow2_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
     qemu_free(s->l1_table);
-    qemu_free(s->l2_cache);
+
+    qcow2_cache_flush(bs, s->l2_table_cache);
+    qcow2_cache_flush(bs, s->refcount_block_cache);
+
+    qcow2_cache_destroy(bs, s->l2_table_cache);
+    qcow2_cache_destroy(bs, s->refcount_block_cache);
+
     qemu_free(s->cluster_cache);
     qemu_free(s->cluster_data);
     qcow2_refcount_close(bs);
@@ -1179,6 +1193,19 @@  static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
 
 static int qcow2_flush(BlockDriverState *bs)
 {
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        return ret;
+    }
+
     return bdrv_flush(bs->file);
 }
 
@@ -1186,6 +1213,19 @@  static BlockDriverAIOCB *qcow2_aio_flush(BlockDriverState *bs,
                                          BlockDriverCompletionFunc *cb,
                                          void *opaque)
 {
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
+    if (ret < 0) {
+        return NULL;
+    }
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        return NULL;
+    }
+
     return bdrv_aio_flush(bs->file, cb, opaque);
 }
 
diff --git a/block/qcow2.h b/block/qcow2.h
index e5473e1..11cbce3 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -51,6 +51,9 @@ 
 
 #define L2_CACHE_SIZE 16
 
+/* Must be at least 4 to cover all cases of refcount table growth */
+#define REFCOUNT_CACHE_SIZE 4
+
 typedef struct QCowHeader {
     uint32_t magic;
     uint32_t version;
@@ -94,9 +97,10 @@  typedef struct BDRVQcowState {
     uint64_t cluster_offset_mask;
     uint64_t l1_table_offset;
     uint64_t *l1_table;
-    uint64_t *l2_cache;
-    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
-    uint32_t l2_cache_counts[L2_CACHE_SIZE];
+
+    Qcow2Cache* l2_table_cache;
+    Qcow2Cache* refcount_block_cache;
+
     uint8_t *cluster_cache;
     uint8_t *cluster_data;
     uint64_t cluster_cache_offset;
@@ -105,8 +109,6 @@  typedef struct BDRVQcowState {
     uint64_t *refcount_table;
     uint64_t refcount_table_offset;
     uint32_t refcount_table_size;
-    uint64_t refcount_block_cache_offset;
-    uint16_t *refcount_block_cache;
     int64_t free_cluster_index;
     int64_t free_byte_offset;