diff mbox

[RFC,V6,19/33] block: Add qcow2_dedup format and image creation code.

Message ID 1360153926-9492-20-git-send-email-benoit@irqsave.net
State New
Headers show

Commit Message

Benoît Canet Feb. 6, 2013, 12:31 p.m. UTC
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
 block/qcow2.c             |  188 +++++++++++++++++++++++++++++++++++++++++----
 block/qcow2.h             |    2 +
 include/block/block_int.h |    1 +
 3 files changed, 178 insertions(+), 13 deletions(-)

Comments

Stefan Hajnoczi Feb. 7, 2013, 10:16 a.m. UTC | #1
On Wed, Feb 06, 2013 at 01:31:52PM +0100, Benoît Canet wrote:
> diff --git a/block/qcow2.c b/block/qcow2.c
> index ad202fa..9cbb2f0 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -277,6 +277,11 @@ int qcow2_mark_dirty(BlockDriverState *bs)
>      return qcow2_add_feature(bs, QCOW2_INCOMPAT_DIRTY);
>  }
>  
> +static int qcow2_activate_dedup(BlockDriverState *bs)
> +{
> +    return qcow2_add_feature(bs, QCOW2_INCOMPAT_DEDUP);
> +}

I suggest dropping this wrapper function; what "activate dedup" means
is not clear.  It turns out it simply sets the feature bit in the file
header, nothing else.

Best to set the feature bit directly to save readers from having to jump
to the definition of qcow2_activate_dedup().

> @@ -1371,11 +1381,29 @@ static int qcow2_create2(const char *filename, int64_t total_size,
>      }
>  
>      /* Okay, now that we have a valid image, let's give it the right size */
> +    s = bs->opaque;
>      ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
>      if (ret < 0) {
>          goto out;
>      }
>  
> +    if (dedup) {

BDRVQcowState *s = bs->opaque;

A local variable here would be nicer than at function scope.

> @@ -1447,24 +1501,42 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options)
>              }
>          } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
>              flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
> +        } else if (!strcmp(options->name, BLOCK_OPT_DEDUP)) {
> +            hash_algo = qcow2_get_dedup_hash_algo(options->value.s);
> +            if (hash_algo < 0) {
> +                return hash_algo;
> +            }
> +            dedup = true;
>          }
>          options++;
>      }
>  
> +    if (dedup) {
> +        version = 3;
> +    }

Lazy refcounts don't force the version.  It would be consistent to
refrain from forcing the version too.

> @@ -1809,9 +1931,49 @@ static BlockDriver bdrv_qcow2 = {
>      .bdrv_check = qcow2_check,
>  };
>  
> +static BlockDriver bdrv_qcow2_dedup = {

Missing comment explaining the need to duplicate the BlockDriver for
dedup.
Benoît Canet March 11, 2013, 3:20 p.m. UTC | #2
> > +    if (dedup) {
> > +        version = 3;
> > +    }
> 
> Lazy refcounts don't force the version.  It would be consistent to
> refrain from forcing the version too.

I don't understand. Don't the incompatible feature bits imply version = 3?
When compat=1.1 is set, the code forces the version to 3.

Benoît
Stefan Hajnoczi March 12, 2013, 9:33 a.m. UTC | #3
On Mon, Mar 11, 2013 at 04:20:03PM +0100, Benoît Canet wrote:
> > > +    if (dedup) {
> > > +        version = 3;
> > > +    }
> > 
> > Lazy refcounts don't force the version.  It would be consistent to
> > refrain from forcing the version too.
> 
> I don't understand. Don't the incompatible feature bits imply version = 3?
> When compat=1.1 is set, the code forces the version to 3.

Only compat= affects the version.  Lazy refcounts and dedup should be
orthogonal.

If you specify lazy_refcounts=on without compat=1.1 you get an error.

Same should apply to deduplication.  Otherwise the qemu-img create
behavior is inconsistent - you get an error for lazy_refcounts but no
error and silent version=3 for deduplication.

Stefan
diff mbox

Patch

diff --git a/block/qcow2.c b/block/qcow2.c
index ad202fa..9cbb2f0 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -277,6 +277,11 @@  int qcow2_mark_dirty(BlockDriverState *bs)
     return qcow2_add_feature(bs, QCOW2_INCOMPAT_DIRTY);
 }
 
+static int qcow2_activate_dedup(BlockDriverState *bs)
+{
+    return qcow2_add_feature(bs, QCOW2_INCOMPAT_DEDUP);
+}
+
 /*
  * Clears an incompatible feature bit and flushes before if necessary.
  * Only call this function when there are no pending requests, it does not
@@ -1267,7 +1272,8 @@  static int preallocate(BlockDriverState *bs)
 static int qcow2_create2(const char *filename, int64_t total_size,
                          const char *backing_file, const char *backing_format,
                          int flags, size_t cluster_size, int prealloc,
-                         QEMUOptionParameter *options, int version)
+                         QEMUOptionParameter *options, int version,
+                         bool dedup, uint8_t hash_algo)
 {
     /* Calculate cluster_bits */
     int cluster_bits;
@@ -1294,8 +1300,10 @@  static int qcow2_create2(const char *filename, int64_t total_size,
      * size for any qcow2 image.
      */
     BlockDriverState* bs;
+    BDRVQcowState *s;
     QCowHeader header;
-    uint8_t* refcount_table;
+    uint8_t *tables;
+    int size;
     int ret;
 
     ret = bdrv_create_file(filename, options);
@@ -1337,10 +1345,11 @@  static int qcow2_create2(const char *filename, int64_t total_size,
         goto out;
     }
 
-    /* Write an empty refcount table */
-    refcount_table = g_malloc0(cluster_size);
-    ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
-    g_free(refcount_table);
+    /* Write an empty refcount table + extra space for dedup table if needed */
+    size = dedup ? 2 : 1;
+    tables = g_malloc0(size * cluster_size);
+    ret = bdrv_pwrite(bs, cluster_size, tables, size * cluster_size);
+    g_free(tables);
 
     if (ret < 0) {
         goto out;
@@ -1351,7 +1360,7 @@  static int qcow2_create2(const char *filename, int64_t total_size,
     /*
      * And now open the image and make it consistent first (i.e. increase the
      * refcount of the cluster that is occupied by the header and the refcount
-     * table)
+     * table and the eventual dedup table)
      */
     BlockDriver* drv = bdrv_find_format("qcow2");
     assert(drv != NULL);
@@ -1361,7 +1370,8 @@  static int qcow2_create2(const char *filename, int64_t total_size,
         goto out;
     }
 
-    ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
+    size++; /* Add a cluster for the header */
+    ret = qcow2_alloc_clusters(bs, size * cluster_size);
     if (ret < 0) {
         goto out;
 
@@ -1371,11 +1381,29 @@  static int qcow2_create2(const char *filename, int64_t total_size,
     }
 
     /* Okay, now that we have a valid image, let's give it the right size */
+    s = bs->opaque;
     ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
     if (ret < 0) {
         goto out;
     }
 
+    if (dedup) {
+        s->has_dedup = true;
+        s->dedup_table_offset = cluster_size * 2;
+        s->dedup_table_size = cluster_size / sizeof(uint64_t);
+        s->dedup_hash_algo = hash_algo;
+
+        ret = qcow2_activate_dedup(bs);
+        if (ret < 0) {
+            goto out;
+        }
+
+        ret = qcow2_update_header(bs);
+        if (ret < 0) {
+            goto out;
+        }
+    }
+
     /* Want a backing file? There you go.*/
     if (backing_file) {
         ret = bdrv_change_backing_file(bs, backing_file, backing_format);
@@ -1401,15 +1429,41 @@  out:
     return ret;
 }
 
+static int qcow2_warn_if_version_3_is_needed(int version,
+                                             bool has_feature,
+                                             const char *feature)
+{
+    if (version < 3 && has_feature) {
+        fprintf(stderr, "%s only supported with compatibility "
+                "level 1.1 and above (use compat=1.1 or greater)\n",
+                feature);
+        return -EINVAL;
+    }
+    return 0;
+}
+
+static int8_t qcow2_get_dedup_hash_algo(char *value)
+{
+    if (!value || !strcmp(value, "sha256")) {
+        return QCOW_HASH_SHA256;
+    }
+
+    error_printf("Unsupported deduplication hash algorithm.\n");
+    return -EINVAL;
+}
+
 static int qcow2_create(const char *filename, QEMUOptionParameter *options)
 {
     const char *backing_file = NULL;
     const char *backing_fmt = NULL;
     uint64_t sectors = 0;
     int flags = 0;
+    int ret;
     size_t cluster_size = DEFAULT_CLUSTER_SIZE;
     int prealloc = 0;
     int version = 2;
+    bool dedup = false;
+    int8_t hash_algo = 0;
 
     /* Read out options */
     while (options && options->name) {
@@ -1447,24 +1501,42 @@  static int qcow2_create(const char *filename, QEMUOptionParameter *options)
             }
         } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
             flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
+        } else if (!strcmp(options->name, BLOCK_OPT_DEDUP)) {
+            hash_algo = qcow2_get_dedup_hash_algo(options->value.s);
+            if (hash_algo < 0) {
+                return hash_algo;
+            }
+            dedup = true;
         }
         options++;
     }
 
+    if (dedup) {
+        version = 3;
+    }
+
     if (backing_file && prealloc) {
         fprintf(stderr, "Backing file and preallocation cannot be used at "
             "the same time\n");
         return -EINVAL;
     }
 
-    if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
-        fprintf(stderr, "Lazy refcounts only supported with compatibility "
-                "level 1.1 and above (use compat=1.1 or greater)\n");
-        return -EINVAL;
+    ret = qcow2_warn_if_version_3_is_needed(version,
+                                            flags & BLOCK_FLAG_LAZY_REFCOUNTS,
+                                            "Lazy refcounts");
+    if (ret < 0) {
+        return ret;
+    }
+    ret = qcow2_warn_if_version_3_is_needed(version,
+                                            dedup,
+                                            "Deduplication");
+    if (ret < 0) {
+        return ret;
     }
 
     return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
-                         cluster_size, prealloc, options, version);
+                         cluster_size, prealloc, options, version,
+                         dedup, hash_algo);
 }
 
 static int qcow2_make_empty(BlockDriverState *bs)
@@ -1770,6 +1842,56 @@  static QEMUOptionParameter qcow2_create_options[] = {
     { NULL }
 };
 
+static QEMUOptionParameter qcow2_dedup_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_COMPAT_LEVEL,
+        .type = OPT_STRING,
+        .help = "Compatibility level (0.10 or 1.1)",
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FMT,
+        .type = OPT_STRING,
+        .help = "Image format of the base image"
+    },
+    {
+        .name = BLOCK_OPT_ENCRYPT,
+        .type = OPT_FLAG,
+        .help = "Encrypt the image"
+    },
+    {
+        .name = BLOCK_OPT_CLUSTER_SIZE,
+        .type = OPT_SIZE,
+        .help = "qcow2 cluster size",
+        .value = { .n = DEFAULT_DEDUP_CLUSTER_SIZE },
+    },
+    {
+        .name = BLOCK_OPT_PREALLOC,
+        .type = OPT_STRING,
+        .help = "Preallocation mode (allowed values: off, metadata)"
+    },
+    {
+        .name = BLOCK_OPT_LAZY_REFCOUNTS,
+        .type = OPT_FLAG,
+        .help = "Postpone refcount updates",
+    },
+    {
+        .name = BLOCK_OPT_DEDUP,
+        .type = OPT_STRING,
+        .help = "Deduplication",
+    },
+    { NULL }
+};
+
 static BlockDriver bdrv_qcow2 = {
     .format_name        = "qcow2",
     .instance_size      = sizeof(BDRVQcowState),
@@ -1809,9 +1931,49 @@  static BlockDriver bdrv_qcow2 = {
     .bdrv_check = qcow2_check,
 };
 
+static BlockDriver bdrv_qcow2_dedup = {
+    .format_name        = "qcow2_dedup",
+    .instance_size      = sizeof(BDRVQcowState),
+    .bdrv_probe         = qcow2_probe,
+    .bdrv_open          = qcow2_open,
+    .bdrv_close         = qcow2_close,
+    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
+    .bdrv_create        = qcow2_create,
+    .bdrv_co_is_allocated = qcow2_co_is_allocated,
+    .bdrv_set_key       = qcow2_set_key,
+    .bdrv_make_empty    = qcow2_make_empty,
+
+    .bdrv_co_readv          = qcow2_co_readv,
+    .bdrv_co_writev         = qcow2_co_writev,
+    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
+
+    .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
+    .bdrv_co_discard        = qcow2_co_discard,
+    .bdrv_truncate          = qcow2_truncate,
+    .bdrv_write_compressed  = qcow2_write_compressed,
+
+    .bdrv_snapshot_create   = qcow2_snapshot_create,
+    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
+    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
+    .bdrv_snapshot_list     = qcow2_snapshot_list,
+    .bdrv_snapshot_load_tmp     = qcow2_snapshot_load_tmp,
+    .bdrv_get_info      = qcow2_get_info,
+
+    .bdrv_save_vmstate    = qcow2_save_vmstate,
+    .bdrv_load_vmstate    = qcow2_load_vmstate,
+
+    .bdrv_change_backing_file   = qcow2_change_backing_file,
+
+    .bdrv_invalidate_cache      = qcow2_invalidate_cache,
+
+    .create_options = qcow2_dedup_create_options,
+    .bdrv_check = qcow2_check,
+};
+
 static void bdrv_qcow2_init(void)
 {
     bdrv_register(&bdrv_qcow2);
+    bdrv_register(&bdrv_qcow2_dedup);
 }
 
 block_init(bdrv_qcow2_init);
diff --git a/block/qcow2.h b/block/qcow2.h
index 77ffa0b..5b51005 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -60,7 +60,9 @@ 
 /* Must be at least 4 to cover all cases of refcount table growth */
 #define REFCOUNT_CACHE_SIZE 4
 
+#define DEDUP_CACHE_SIZE 4
 #define DEFAULT_CLUSTER_SIZE 65536
+#define DEFAULT_DEDUP_CLUSTER_SIZE 4096
 
 #define HASH_LENGTH 32
 /* indicate that the hash structure is empty and miss offset */
diff --git a/include/block/block_int.h b/include/block/block_int.h
index eaad53e..62c72fc 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -57,6 +57,7 @@ 
 #define BLOCK_OPT_COMPAT_LEVEL      "compat"
 #define BLOCK_OPT_LAZY_REFCOUNTS    "lazy_refcounts"
 #define BLOCK_OPT_ADAPTER_TYPE      "adapter_type"
+#define BLOCK_OPT_DEDUP             "dedup"
 
 typedef struct BdrvTrackedRequest BdrvTrackedRequest;