@@ -911,6 +911,10 @@ void qcow2_host_range_unref(BlockDriverState *bs, int64_t offset,
uint64_t qcow2_get_host_range_refcnt(BlockDriverState *bs,
int64_t cluster_index);
+bool qcow2_host_cluster_postponed_discard(BlockDriverState *bs,
+ int64_t cluster_index,
+ enum qcow2_discard_type type);
+
/* qcow2-cluster.c functions */
int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
bool exact_size);
@@ -30,6 +30,13 @@ typedef struct HostCluster {
/* For convenience, keep cluster_index here */
int64_t cluster_index;
+
+ /*
+ * Qcow2 refcount of this host cluster is zero. So, when all dynamic users
+ * put their references back, we should discard the cluster.
+ */
+ bool postponed_discard;
+ enum qcow2_discard_type postponed_discard_type;
} HostCluster;
void qcow2_init_host_range_refs(BDRVQcow2State *s)
@@ -122,6 +129,46 @@ qcow2_host_range_unref(BlockDriverState *bs, int64_t offset, int64_t length)
continue;
}
+ if (!cl->postponed_discard) {
+ g_hash_table_remove(s->host_range_refs, &cluster_index);
+ continue;
+ }
+
+ /*
+ * OK. refcnt become 0 and we should do postponed discard. Let's keep
+ * host_range_refcnt = 1 during this final IO operation.
+ */
+ if (s->discard_passthrough[cl->postponed_discard_type]) {
+ int64_t cluster_offset = cluster_index << s->cluster_bits;
+ if (s->cache_discards) {
+ qcow2_cache_host_discard(bs, cluster_offset, s->cluster_size);
+ } else {
+ /* Discard is optional, ignore the return value */
+ bdrv_pdiscard(bs->file, cluster_offset, s->cluster_size);
+ }
+ }
+
g_hash_table_remove(s->host_range_refs, &cluster_index);
+
+ if (cluster_index < s->free_cluster_index) {
+ s->free_cluster_index = cluster_index;
+ }
+ }
+}
+
+bool qcow2_host_cluster_postponed_discard(BlockDriverState *bs,
+ int64_t cluster_index,
+ enum qcow2_discard_type type)
+{
+ BDRVQcow2State *s = bs->opaque;
+ HostCluster *cl = find_host_cluster(s, cluster_index);
+
+ if (!cl) {
+ return false;
}
+
+ cl->postponed_discard = true;
+ cl->postponed_discard_type = type;
+
+ return true;
}
@@ -878,9 +878,6 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
} else {
refcount += addend;
}
- if (refcount == 0 && cluster_index < s->free_cluster_index) {
- s->free_cluster_index = cluster_index;
- }
s->set_refcount(refcount_block, block_index, refcount);
if (refcount == 0) {
@@ -900,8 +897,20 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
qcow2_cache_discard(s->l2_table_cache, table);
}
- if (s->discard_passthrough[type]) {
- qcow2_cache_host_discard(bs, cluster_offset, s->cluster_size);
+ if (!qcow2_host_cluster_postponed_discard(bs, cluster_index,
+ type))
+ {
+ /*
+ * Refcount is zero as well as host-range-refcnt. Cluster is
+ * free.
+ */
+ if (cluster_index < s->free_cluster_index) {
+ s->free_cluster_index = cluster_index;
+ }
+ if (s->discard_passthrough[type]) {
+ qcow2_cache_host_discard(bs, cluster_offset,
+ s->cluster_size);
+ }
}
}
}
@@ -963,7 +972,7 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs,
/*
- * Cluster is free when its refcount is 0
+ * Cluster is free when its refcount is 0 and there is no in-flight writes
*
* Return < 0 if failed to get refcount
* 0 if cluster is not free
@@ -979,7 +988,7 @@ static int is_cluster_free(BlockDriverState *bs, int64_t cluster_index)
return ret;
}
- return refcount == 0;
+ return refcount == 0 && qcow2_get_host_range_refcnt(bs, cluster_index) == 0;
}
/* return < 0 if error */
We have a bug in qcow2: assume we've started data write into host cluster A. s->lock is unlocked. During the write the refcount of cluster A may become zero, cluster may be reallocated for other needs, and our in-flight write become a use-after-free. To fix the bug let's do the following. Or better, let's start from what we have now: Now we consider cluster "free", when its refcount is 0. When cluster becomes "free" we also update s->free_cluster_index and optionally discard it on bs->file level. These two operations are done in same s->lock critical section where refcount becomes 0 (and this all is in update_refcount()). Calling update_refcount() wirthout s->lock held is wrong. It's ofcourse done sometimes, as not everything is moved to coroutine for now.. Still, it's out of our topic. Later, we can reallocate "free" cluster in alloc_clusters_noref() and qcow2_alloc_clusters_at(), where is_cluster_free() is used. OK, to correctly handle in-flight writes, let's modify a concept of "free" cluster, so that cluster is "free" when its refcount is 0 and there no inflight writes. We are going to track in-flight writes (and other operations with host data clusters) with help of host-range-references recently implemented. So cluster would be "free" when its refcount is 0 and host-range-refcnt is 0 too. So, we discard the cluster at bs->file level, update s->free_cluster_index and allow reallocation only when both refcount and inflight-write-cnt becomes both zero. It may happen either in update_refcount() or in qcow2_host_range_unref(). In update_refcount() we just discard if host-range-refcnt is 0 and register postponded discard if it isnt. We implement postponed discard functionality so that qcow2_host_range_unref() doesn't have to load refcounts. So in qcow2_host_range_unref() we just do postcponed discard if it is registered in HostCluster struct. Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> --- block/qcow2.h | 4 +++ block/qcow2-host-range-refs.c | 47 +++++++++++++++++++++++++++++++++++ block/qcow2-refcount.c | 23 +++++++++++------ 3 files changed, 67 insertions(+), 7 deletions(-)