Patchwork [partial,RFC,2/2] qcow2: Deduplication write mechanism.

login
register
mail settings
Submitter Benoît Canet
Date Sept. 27, 2012, 2:29 p.m.
Message ID <1348756198-10845-3-git-send-email-benoit@irqsave.net>
Download mbox | patch
Permalink /patch/187384/
State New
Headers show

Comments

Benoît Canet - Sept. 27, 2012, 2:29 p.m.
---
 block/Makefile.objs |    2 +-
 block/qcow2-dedup.c |  368 +++++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.c       |   74 ++++++++++-
 block/qcow2.h       |   40 +++++-
 4 files changed, 481 insertions(+), 3 deletions(-)
 create mode 100644 block/qcow2-dedup.c

Patch

diff --git a/block/Makefile.objs b/block/Makefile.objs
index b5754d3..37a8aab 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,5 +1,5 @@ 
 block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
-block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o qcow2-dedup.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
diff --git a/block/qcow2-dedup.c b/block/qcow2-dedup.c
new file mode 100644
index 0000000..12ea96f
--- /dev/null
+++ b/block/qcow2-dedup.c
@@ -0,0 +1,368 @@ 
+/*
+ * Deduplication for the QCOW2 format
+ *
+ * Copyright (C) Nodalink, SARL. 2012
+ *
+ * Author:
+ *   Benoît Canet <benoit.canet@irqsave.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block_int.h"
+#include "qemu-common.h"
+#include "qcow2.h"
+#include <openssl/sha.h>
+#include <openssl/evp.h>
+
+static int qcow2_dedup_insert_in_rb_tree(BlockDriverState *bs,
+                            uint8_t *hash,
+                            int64_t cluster_offset)
+{
+    return 0;
+}
+
+static int64_t qcow2_dedup_rb_tree_lookup(BlockDriverState *bs, uint8_t *hash)
+{
+    return 0;
+}
+
+static void qcow2_dedup_rb_tree_remove(BlockDriverState *bs, uint8_t *hash)
+{
+}
+
+static uint8_t *qcow2_get_dedup_cluster_hash(BlockDriverState *bs,
+                                             uint8_t *data)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint8_t *hash = g_malloc0(SHA256_DIGEST_LENGTH);
+    EVP_Digest(data, s->cluster_sectors << 9,
+               hash, NULL, EVP_sha256(), NULL);
+    return hash;
+}
+
+static int qcow2_dedup_write_hash(BlockDriverState *bs,
+                            uint8_t *hash,
+                            int64_t cluster_offset)
+{
+    return 0;
+}
+
+static uint8_t *qcow2_dedup_read_hash(BlockDriverState *bs,
+                               uint64_t cluster_offset)
+{
+    return 0;
+}
+
+static int qcow2_dedup_inc_refcount(BlockDriverState *bs,
+                                    uint64_t cluster_offset)
+{
+    return 0;
+}
+
+static int qcow2_dedup_link_l2(BlockDriverState *bs, uint64_t cluster_offset)
+{
+    return 0;
+}
+
+static int qcow2_read_missing_dedup_cluster_data(BlockDriverState *bs,
+                                                 uint8_t *data,
+                                                 uint64_t sector,
+                                                 int nr)
+{
+    QEMUIOVector qiov;
+    struct iovec iov;
+    int ret;
+
+    iov.iov_len = nr * BDRV_SECTOR_SIZE;
+    iov.iov_base = data;
+    qemu_iovec_init_external(&qiov, &iov, 1);
+    ret = bs->drv->bdrv_co_readv(bs, sector, nr, &qiov);
+    if (ret < 0) {
+        error_report("failed to read %d sectors at offset %" PRIu64 "\n",
+                     nr, sector);
+    }
+
+    return ret;
+}
+
+/*
+ * This function prepare a buffer containing all the required data
+ * in order to compute hashes and look for duplicate clusters.
+ * It read the missing data before and after the requested qiov
+ * and concatenate with the qiov content.
+ */
+int qcow2_dedup_read_missing_and_concatenate(BlockDriverState *bs,
+                                             QEMUIOVector *qiov,
+                                             uint64_t sector,
+                                             int remaining_sectors,
+                                             uint8_t *dedup_cluster_data,
+                                             int *dedup_cluster_data_nr)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+    uint64_t dedup_cluster_beginning_sector;
+    uint64_t dedup_cluster_ending_sector;
+    int dedup_cluster_beginning_nr;
+    int dedup_cluster_ending_nr;
+
+    /* compute how much and where to read at the beginning */
+    dedup_cluster_beginning_nr = sector & (s->cluster_sectors - 1);
+    dedup_cluster_beginning_sector = sector - dedup_cluster_beginning_nr;
+
+    /* same for the end */
+    dedup_cluster_ending_sector = sector + remaining_sectors;
+    dedup_cluster_ending_nr = s->cluster_sectors -
+                              (dedup_cluster_ending_sector &
+                              (s->cluster_sectors - 1));
+
+    /* compute total size in sectors and allocate memory */
+    *dedup_cluster_data_nr = dedup_cluster_beginning_nr +
+                             dedup_cluster_ending_nr +
+                             remaining_sectors;
+    dedup_cluster_data = qemu_blockalign(bs, *dedup_cluster_data_nr *
+                             BDRV_SECTOR_SIZE);
+
+    /* read beginning */
+    ret = qcow2_read_missing_dedup_cluster_data(bs,
+              dedup_cluster_data,
+              dedup_cluster_beginning_sector,
+              dedup_cluster_beginning_nr);
+
+    if (ret < 0) {
+        goto fail_exit;
+    }
+
+    /* append qiov content */
+    qemu_iovec_to_buf(qiov, 0, dedup_cluster_data +
+         dedup_cluster_beginning_nr * BDRV_SECTOR_SIZE,
+         qiov->size);
+
+    /* read and add ending */
+    ret = qcow2_read_missing_dedup_cluster_data(bs,
+              dedup_cluster_data +
+              (dedup_cluster_beginning_nr + qiov->size) * BDRV_SECTOR_SIZE,
+              dedup_cluster_ending_sector,
+              dedup_cluster_ending_nr);
+
+    if (ret < 0) {
+        goto fail_exit;
+    }
+
+    return 0;
+
+fail_exit:
+    qemu_vfree(dedup_cluster_data);
+    return ret;
+}
+
+/* this function lookup the physical offset of a cluster
+ * after having computed it's hash.
+ * a precomputed hash can be used to do the lookup if hash != NULL
+ */
+static int64_t qcow2_dedup_lookup_hash(BlockDriverState *bs,
+                                       uint8_t *data,
+                                       int cluster_offset,
+                                       uint8_t **hash)
+{
+    if (!(*hash)) {
+        *hash = qcow2_get_dedup_cluster_hash(bs,
+                    data + (cluster_offset << 9));
+    }
+    return qcow2_dedup_rb_tree_lookup(bs, *hash);
+}
+
+static QCowDedupHash *qcow2_get_dedup_hash(uint8_t *hash)
+{
+    QCowDedupHash *dedup_hash;
+    dedup_hash = g_new0(QCowDedupHash, 1);
+    dedup_hash->hash = hash;
+    return dedup_hash;
+}
+
+static void qcow2_put_dedup_hash(QCowDedupHash *dedup_hash)
+{
+    g_free(dedup_hash->hash);
+    g_free(dedup_hash);
+}
+
+/* this function tries to deduplicate a given cluster,
+ * it returns a true on success and store a QCowDedupHash
+ * structure on dedup_hash.
+ * it use a precomputed hash if hash != NULL.
+ */
+static bool qcow2_dedup_cluster(BlockDriverState *bs,
+                                      uint8_t *data,
+                                      int64_t sector_num,
+                                      int skip_before_dedup_clusters_nr,
+                                      QCowDedupHash *dedup_hash,
+                                      uint8_t **hash)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t deduped_cluster_offset;
+    uint64_t to_free_cluster_offset;
+    int64_t cluster_offset;
+    int ret;
+    int pnum = s->cluster_sectors;
+    dedup_hash = NULL;
+
+    cluster_offset = sector_num & (s->cluster_sectors - 1);
+    deduped_cluster_offset = qcow2_dedup_lookup_hash(bs,
+                                 data,
+                                 skip_before_dedup_clusters_nr,
+                                 hash);
+    /* duplicated cluster found */
+    if (deduped_cluster_offset != -1) {
+        /* fixme : handle error
+         * is the order right for the transaction to be
+         * atomic ?
+         */
+        qcow2_dedup_inc_refcount(bs, deduped_cluster_offset);
+        qcow2_dedup_link_l2(bs, deduped_cluster_offset);
+
+        /* If we are changing an existing physical cluster by a deduped
+         * one decrease its refcount.
+         */
+        ret = qcow2_get_cluster_offset(bs, cluster_offset << 9,
+                  &pnum, &to_free_cluster_offset);
+        if (ret < 0) {
+            free(*hash);
+            return false;
+        }
+        if (to_free_cluster_offset) {
+            *hash = qcow2_dedup_read_hash(bs, to_free_cluster_offset);
+            qcow2_dedup_rb_tree_remove(bs, *hash);
+            g_free(hash);
+            qcow2_free_clusters(bs, to_free_cluster_offset,
+                s->cluster_sectors << 9);
+        }
+    }
+    dedup_hash = qcow2_get_dedup_hash(*hash);
+    *hash = NULL;
+    return deduped_cluster_offset == -1 ? false : true;
+}
+
+int qcow2_dedup(BlockDriverState *bs,
+                uint8_t *dedup_cluster_data,
+                int dedup_cluster_data_nr,
+                int64_t sector_num,
+                int remaining_sectors,
+                int *next_non_dedupable_sectors_nr,
+                int *skip_before_dedup_clusters_nr,
+                uint8_t **next_call_first_hash)
+{
+    BDRVQcowState *s = bs->opaque;
+    int deduped_clusters_nr = 0;
+    int left_to_test_clusters_nr;
+    int dedup_cluster_beginning_nr;
+    uint8_t *hash = NULL;
+    QCowDedupHash *non_duplicated_dedup_hash = NULL;
+    /* should already be zero when entering this function */
+    *next_non_dedupable_sectors_nr = 0;
+
+    dedup_cluster_beginning_nr = sector_num & (s->cluster_sectors - 1);
+
+    left_to_test_clusters_nr = dedup_cluster_data_nr -
+                               *skip_before_dedup_clusters_nr;
+
+    /* Deduplicate all that can be */
+    while (left_to_test_clusters_nr-- &&
+           qcow2_dedup_cluster(bs,
+                               dedup_cluster_data,
+                               sector_num,
+                               (*skip_before_dedup_clusters_nr)++,
+                               non_duplicated_dedup_hash,
+                               next_call_first_hash)) {
+        deduped_clusters_nr++;
+    }
+
+    /* We consumed the precomputed hash -> set it to NULL */
+    *next_call_first_hash = NULL;
+
+    /* We deduped everything till the end */
+    if (!non_duplicated_dedup_hash) {
+        *next_non_dedupable_sectors_nr = 0;
+        goto exit;
+    }
+
+    /* Memorize the hash of the first non duplicated cluster.
+     * we will store it when writing the cluster to disk.
+     */
+    QTAILQ_INSERT_TAIL(&s->undedupable_hashes, non_duplicated_dedup_hash, next);
+    *next_non_dedupable_sectors_nr += s->cluster_sectors;
+
+    /* Count how many non duplicated sector can be written without dedupe
+     * and memorize the hashes for the disk writing code.
+     * We make sure we pass hash == NULL to force computation of the hash.
+     */
+    hash = NULL;
+    while (left_to_test_clusters_nr-- &&
+           -1 == qcow2_dedup_lookup_hash(bs,
+                                         dedup_cluster_data,
+                                         (*skip_before_dedup_clusters_nr)++,
+                                         &hash)) {
+        *next_non_dedupable_sectors_nr += s->cluster_sectors;
+        non_duplicated_dedup_hash = qcow2_get_dedup_hash(hash);
+        QTAILQ_INSERT_TAIL(&s->undedupable_hashes,
+            non_duplicated_dedup_hash, next);
+        hash = NULL;
+    }
+
+    /* We find a duplicated cluster before stopping iterating
+     * store it for next call.
+     */
+    if (non_duplicated_dedup_hash->hash != hash) {
+        *next_call_first_hash = hash;
+    }
+
+exit:
+    if (!deduped_clusters_nr) {
+        return 0;
+    }
+    return deduped_clusters_nr * s->cluster_sectors -
+               dedup_cluster_beginning_nr;
+}
+
+
+/* This function write the hashes of the blocks which are not duplicated
+ */
+void qcow2_dedup_write_new_hashes(BlockDriverState *bs,
+                                  uint64_t cluster_offset,
+                                  int count)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowDedupHash *dedup_hash, *next_dedup_hash;
+    int i = 0;
+
+    QTAILQ_FOREACH_SAFE(dedup_hash, &s->undedupable_hashes,
+                        next, next_dedup_hash) {
+        qcow2_dedup_write_hash(bs, dedup_hash->hash,
+                               cluster_offset + i * s->cluster_sectors);
+        qcow2_dedup_insert_in_rb_tree(bs, dedup_hash->hash,
+                                      cluster_offset + i * s->cluster_sectors);
+        QTAILQ_REMOVE(&s->undedupable_hashes,
+                      dedup_hash,
+                      next);
+        qcow2_put_dedup_hash(dedup_hash);
+        i++;
+        if (i == count) {
+            break;
+        }
+    }
+}
diff --git a/block/qcow2.c b/block/qcow2.c
index 8f183f1..2471d9a 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -765,6 +765,12 @@  static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
     QEMUIOVector hd_qiov;
     uint64_t bytes_done = 0;
     uint8_t *cluster_data = NULL;
+    uint8_t *dedup_cluster_data = NULL;
+    uint8_t *next_call_first_hash;
+    int dedup_cluster_data_nr;
+    int deduped_sectors_nr;
+    int skip_before_dedup_clusters_nr;
+    int next_non_dedupable_sectors_nr;
     QCowL2Meta l2meta = {
         .nb_clusters = 0,
     };
@@ -780,11 +786,66 @@  static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 
     qemu_co_mutex_lock(&s->lock);
 
+    if (s->has_dedup) {
+        /* if deduplication is on we make sure dedup_cluster_data
+         * contains a multiple of cluster size of data in order
+         * to compute the hashes
+         */
+        ret = qcow2_dedup_read_missing_and_concatenate(bs,
+                  qiov,
+                  sector_num,
+                  remaining_sectors,
+                  dedup_cluster_data,
+                  &dedup_cluster_data_nr);
+
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
+    next_non_dedupable_sectors_nr = 0;
+    skip_before_dedup_clusters_nr = 0;
+    next_call_first_hash = NULL;
     while (remaining_sectors != 0) {
 
         trace_qcow2_writev_start_part(qemu_coroutine_self());
+
+        if (s->has_dedup && next_non_dedupable_sectors_nr == 0) {
+            /* Try to deduplicate as much cluster as possible */
+            deduped_sectors_nr = qcow2_dedup(bs,
+                                     dedup_cluster_data,
+                                     dedup_cluster_data_nr,
+                                     sector_num,
+                                     remaining_sectors,
+                                     &next_non_dedupable_sectors_nr,
+                                     &skip_before_dedup_clusters_nr,
+                                     &next_call_first_hash);
+
+            remaining_sectors -= deduped_sectors_nr;
+            sector_num += deduped_sectors_nr;
+            bytes_done += deduped_sectors_nr * 512;
+
+            /* no more data to write -> exit
+             * Can be < 0 because of the presence of sectors we read in
+             * qcow2_read_missing_dedup_sectors_and_concatenate.
+             */
+            if (remaining_sectors < 0) {
+                break;
+            }
+
+            /* if we deduped something trace it */
+            if (deduped_sectors_nr) {
+                trace_qcow2_writev_done_part(qemu_coroutine_self(),
+                    deduped_sectors_nr);
+                trace_qcow2_writev_start_part(qemu_coroutine_self());
+            }
+        }
+
         index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        n_end = index_in_cluster + remaining_sectors;
+        n_end = index_in_cluster +
+                next_non_dedupable_sectors_nr < remaining_sectors ?
+                next_non_dedupable_sectors_nr : remaining_sectors;
+
         if (s->crypt_method &&
             n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
             n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
@@ -826,6 +887,15 @@  static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                 cur_nr_sectors * 512);
         }
 
+        /* Write the non duplicated clusters hashes to disk
+         * just before writing actual data.
+         */
+        if (s->has_dedup) {
+            qcow2_dedup_write_new_hashes(bs, cluster_offset,
+                                         l2meta.nb_clusters > 0 ?
+                                         l2meta.nb_clusters : 1);
+        }
+
         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
         qemu_co_mutex_unlock(&s->lock);
         trace_qcow2_writev_data(qemu_coroutine_self(),
@@ -845,6 +915,7 @@  static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 
         run_dependent_requests(s, &l2meta);
 
+        next_non_dedupable_sectors_nr -= cur_nr_sectors;
         remaining_sectors -= cur_nr_sectors;
         sector_num += cur_nr_sectors;
         bytes_done += cur_nr_sectors * 512;
@@ -859,6 +930,7 @@  fail:
 
     qemu_iovec_destroy(&hd_qiov);
     qemu_vfree(cluster_data);
+    qemu_vfree(dedup_cluster_data);
     trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
 
     return ret;
diff --git a/block/qcow2.h b/block/qcow2.h
index b4eb654..fd2657b 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -114,8 +114,10 @@  enum {
 enum {
     QCOW2_INCOMPAT_DIRTY_BITNR   = 0,
     QCOW2_INCOMPAT_DIRTY         = 1 << QCOW2_INCOMPAT_DIRTY_BITNR,
+    QCOW2_INCOMPAT_DEDUP_BITNR   = 1,
+    QCOW2_INCOMPAT_DEDUP         = 1 << QCOW2_INCOMPAT_DEDUP_BITNR,
 
-    QCOW2_INCOMPAT_MASK          = QCOW2_INCOMPAT_DIRTY,
+    QCOW2_INCOMPAT_MASK          = QCOW2_INCOMPAT_DIRTY | QCOW2_INCOMPAT_DEDUP,
 };
 
 /* Compatible feature bits */
@@ -132,6 +134,11 @@  typedef struct Qcow2Feature {
     char    name[46];
 } QEMU_PACKED Qcow2Feature;
 
+typedef struct QCowDedupHash {
+    uint8_t *hash;
+    QTAILQ_ENTRY(QCowDedupHash) next;
+} QCowDedupHash;
+
 typedef struct BDRVQcowState {
     int cluster_bits;
     int cluster_size;
@@ -148,6 +155,7 @@  typedef struct BDRVQcowState {
 
     Qcow2Cache* l2_table_cache;
     Qcow2Cache* refcount_block_cache;
+    Qcow2Cache *dedup_block_cache;
 
     uint8_t *cluster_cache;
     uint8_t *cluster_data;
@@ -160,6 +168,11 @@  typedef struct BDRVQcowState {
     int64_t free_cluster_index;
     int64_t free_byte_offset;
 
+    bool has_dedup;
+    uint64_t *dedup_table;
+    uint64_t dedup_table_offset;
+    uint32_t dedup_table_size;
+
     CoMutex lock;
 
     uint32_t crypt_method; /* current crypt method, 0 if no key yet */
@@ -181,6 +194,7 @@  typedef struct BDRVQcowState {
     size_t unknown_header_fields_size;
     void* unknown_header_fields;
     QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext;
+    QTAILQ_HEAD(, QCowDedupHash) undedupable_hashes;
 } BDRVQcowState;
 
 /* XXX: use std qcow open function ? */
@@ -333,4 +347,28 @@  int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
     void **table);
 int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
 
+/* qcow2-dedup functions */
+int qcow2_dedup_read_missing_and_concatenate(BlockDriverState *bs,
+                                             QEMUIOVector *qiov,
+                                             uint64_t sector,
+                                             int remaining_sectors,
+                                             uint8_t *dedup_cluster_data,
+                                             int *dedup_cluster_data_nr);
+
+int qcow2_dedup(BlockDriverState *bs,
+                uint8_t *dedup_cluster_data,
+                int dedup_cluster_data_nr,
+                int64_t sector_num,
+                int remaining_sectors,
+                int *next_non_dedupable_sectors_nr,
+                int *skip_before_dedup_cluster_nr,
+                uint8_t **next_call_first_hash);
+
+void qcow2_dedup_write_new_hashes(BlockDriverState *bs,
+                                  uint64_t cluster_offset,
+                                  int count);
+
+void qcow2_dedup_free_cluster(BlockDriverState *bs,
+                              uint64_t cluster_offset);
+
 #endif