Patchwork FVD: add internal snapshot functionality

login
register
mail settings
Submitter Chunqiang Tang
Date March 13, 2011, 5:33 a.m.
Message ID <1299994438-15713-1-git-send-email-ctang@us.ibm.com>
Download mbox | patch
Permalink /patch/86593/
State New
Headers show

Comments

Chunqiang Tang - March 13, 2011, 5:33 a.m.
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.

This patch adds internal snapshot support to FVD, which is a big
step forward over QCOW's snapshot function in terms of efficiency.
FVD's snapshot implementation achieves the following ideal properties of
a snapshot function:

G1: Do no harm (or avoid being a misfeature), i.e., the added snapshot
code should not slow down the runtime performance of an image that has
no snapshots.  This implies that an image without snapshots should not
cache the reference count table in memory and should not update the
on-disk reference count table.

G2: Even better, an image with 1 snapshot runs as fast as an image
without snapshots.

G3: Even even better, an image with 1,000 snapshots runs as fast as an
image without snapshots. This basically means getting the snapshot
feature for free.

G4: An image with 1,000 snapshots consumes no more memory than an image
without snapshots. This again means getting the snapshot feature for
free.

G5: Regardless of the number of existing snapshots, creating a new
snapshot is fast, e.g., taking no more than 1 second.

G6: Regardless of the number of existing snapshots, deleting a snapshot
is fast, e.g., taking no more than 1 second.

FVD's internal snapshot achieves the ideal properties of G1-G6, by 1)
using the reference count table to only track "static" snapshots, 2) not
keeping the reference count table in memory, 3) not updating the on-disk
"static" reference count table when the VM runs, and 4) efficiently
tracking dynamically allocated blocks by piggybacking on FVD's other
features, i.e., its journal and small one-level lookup table.

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 Makefile.objs           |    2 +-
 block.c                 |    2 +-
 block/blksim.c          |   25 +-
 block/fvd-create.c      |  272 ++++++++-------
 block/fvd-debug.c       |   16 +-
 block/fvd-flush.c       |  192 -----------
 block/fvd-journal-buf.c |   52 ++--
 block/fvd-journal.c     |  159 +++++----
 block/fvd-load.c        |   43 ++-
 block/fvd-misc.c        |  202 +++++++----
 block/fvd-open.c        |  381 ++++++++++++++-------
 block/fvd-prefetch.c    |    2 +-
 block/fvd-read.c        |    8 +-
 block/fvd-snapshot.c    |  878 +++++++++++++++++++++++++++++++++++++++++++++++
 block/fvd-store.c       |  321 ++++++++++++------
 block/fvd-update.c      |  260 +++++++++++++--
 block/fvd-write.c       |  398 +++++++++++++++-------
 block/fvd.c             |   15 +-
 block/fvd.h             |  188 ++++++-----
 bswap.h                 |   24 ++
 qemu-img.c              |    2 +-
 qemu-io-auto.c          |    9 +-
 qemu-option.c           |   12 +-
 test-fvd.sh             |  115 +++++--
 test-qcow2.sh           |    4 +-
 test-vdi.sh             |    2 +-
 26 files changed, 2554 insertions(+), 1030 deletions(-)
 delete mode 100644 block/fvd-flush.c
 create mode 100644 block/fvd-snapshot.c

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 9185d3e..2054fcb 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -15,6 +15,7 @@  oslib-obj-$(CONFIG_POSIX) += oslib-posix.o
 
 block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
 block-obj-y += nbd.o block.o aio.o aes.o qemu-config.o
+block-obj-y += bitops.o bitmap.o
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
@@ -101,7 +102,6 @@  common-obj-y += msmouse.o ps2.o
 common-obj-y += qdev.o qdev-properties.o
 common-obj-y += block-migration.o
 common-obj-y += pflib.o
-common-obj-y += bitmap.o bitops.o
 
 common-obj-$(CONFIG_BRLAPI) += baum.o
 common-obj-$(CONFIG_POSIX) += migration-exec.o migration-unix.o migration-fd.o
diff --git a/block.c b/block.c
index 36e55f0..0559d83 100644
--- a/block.c
+++ b/block.c
@@ -58,7 +58,7 @@  static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors);
 
-QTAILQ_HEAD(, BlockDriverState) bdrv_states =
+static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
diff --git a/block/blksim.c b/block/blksim.c
index 16e44ee..1c72e1f 100644
--- a/block/blksim.c
+++ b/block/blksim.c
@@ -116,9 +116,10 @@  static int do_io(BlockDriverState * bs, int64_t sector_num, uint8_t * buf,
                 sector_num, nb_sectors);
     }
 
-    if ((ret=lseek(s->fd, sector_num * 512, SEEK_SET)) < 0) {
+    if (lseek64(s->fd, (off64_t)(sector_num * 512), SEEK_SET) < 0) {
         fprintf(stderr, "Error: lseek %s sector_num=%"PRId64"\n",
                 bs->filename, sector_num);
+        abort();
     }
 
     /* Buffer must be aligned for O_DIRECT. */
@@ -464,13 +465,12 @@  static void run_task_by_acb(SimAIOCB * acb)
 
         if (acb->ret == 0) {
             if (acb->qiov->niov == 1) {
-                if (blksim_read
-                    (bs, acb->sector_num, acb->qiov->iov->iov_base,
-                     acb->nb_sectors) != 0) {
+                if (blksim_read(bs, acb->sector_num, acb->qiov->iov->iov_base,
+                                acb->nb_sectors) != 0) {
                     fprintf(stderr, "Error in reading %s sector_num=%"PRId64
                             " nb_sectors=%d\n", acb->common.bs->filename,
                             acb->sector_num, acb->nb_sectors);
-                    exit(1);
+                    abort();
                 }
             } else {
                 uint8_t *buf = qemu_blockalign(acb->common.bs, acb->qiov->size);
@@ -478,7 +478,7 @@  static void run_task_by_acb(SimAIOCB * acb)
                     fprintf(stderr, "Error in reading %s sector_num=%"PRId64
                             " nb_sectors=%d\n", acb->common.bs->filename,
                             acb->sector_num, acb->nb_sectors);
-                    exit(1);
+                    abort();
                 }
                 qemu_iovec_from_buffer(acb->qiov, buf, acb->qiov->size);
                 qemu_vfree(buf);
@@ -499,7 +499,7 @@  static void run_task_by_acb(SimAIOCB * acb)
                     fprintf(stderr, "Error in writing %s sector_num=%"PRId64
                             " nb_sectors=%d\n", acb->common.bs->filename,
                             acb->sector_num, acb->nb_sectors);
-                    exit(1);
+                    abort();
                 }
             } else {
                 uint8_t *buf = qemu_blockalign(acb->common.bs,
@@ -509,7 +509,7 @@  static void run_task_by_acb(SimAIOCB * acb)
                     fprintf(stderr, "Error in writing %s sector_num=%"PRId64
                             " nb_sectors=%d\n", acb->common.bs->filename,
                             acb->sector_num, acb->nb_sectors);
-                    exit(1);
+                    abort();
                 }
                 qemu_vfree(buf);
             }
@@ -625,6 +625,7 @@  static int blksim_open(BlockDriverState * bs, const char *filename,
 {
     BDRVSimState *s = bs->opaque;
     int open_flags = O_BINARY | O_LARGEFILE;
+    int64_t len;
 
     blksim_invoked = true;
 
@@ -644,13 +645,11 @@  static int blksim_open(BlockDriverState * bs, const char *filename,
         return -errno;
     }
 
-    int64_t len = lseek(s->fd, 0, SEEK_END);
-    if (len >= 0) {
-        bs->total_sectors = len / 512;
-    } else {
-        bs->total_sectors = 0;
+    if ((len = lseek64(s->fd, (off64_t)0, SEEK_END)) < 0) {
+        return len;
     }
 
+    bs->total_sectors = len / 512;
     bs->growable = 1;
     bs->buffer_alignment = 512;
     return 0;
diff --git a/block/fvd-create.c b/block/fvd-create.c
index c8912aa..c9fac90 100644
--- a/block/fvd-create.c
+++ b/block/fvd-create.c
@@ -19,16 +19,26 @@  static inline int search_empty_blocks(int fd, uint8_t * bitmap,
                                       int32_t hole_size,
                                       int32_t block_size);
 
+static inline int calc_bitmap_size(int64_t base_img_size, int32_t block_size)
+{
+    const uint64_t blocks = DIV_ROUND_UP(base_img_size, block_size);
+    return DIV_ROUND_UP(blocks, 8); /* Convert bits to bytes. */
+}
+
+static inline uint64_t calc_table_size(uint64_t virtual_disk_size,
+                                       uint64_t chunk_size)
+{
+    uint64_t table_entries = DIV_ROUND_UP(virtual_disk_size, chunk_size);
+    return sizeof(uint32_t) * table_entries;
+}
+
 static int fvd_create(const char *filename, QEMUOptionParameter * options)
 {
     int fd, ret = 0;
-    FvdHeader *header;
     int64_t virtual_disk_size = DEF_PAGE_SIZE;
     int32_t header_size;
     const char *base_img = NULL;
     const char *base_img_fmt = NULL;
-    const char *data_file = NULL;
-    const char *data_file_fmt = NULL;
     int32_t hole_size = 0;
     int copy_on_read = false;
     int prefetch_start_delay = -1;
@@ -38,7 +48,8 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
     int64_t table_size = 0;
     int64_t journal_size = 0;
     int32_t block_size = 0;
-    int compact_image = false;
+    uint64_t chunk_size = 0;
+    int raw_layout = false;
     uint64_t max_copy_on_read = MAX_OUTSTANDING_COPY_ON_READ_DATA;
     uint32_t num_prefetch_slots = NUM_PREFETCH_SLOTS;
     uint64_t bytes_per_prefetch = BYTES_PER_PREFETCH;
@@ -49,11 +60,9 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
     uint64_t prefetch_min_write_throughput = PREFETCH_MIN_WRITE_THROUGHPUT;
     uint64_t prefetch_max_read_throughput = PREFETCH_MAX_READ_THROUGHPUT;
     uint64_t prefetch_max_write_throughput = PREFETCH_MAX_WRITE_THROUGHPUT;
+    FvdHeader _header, *header = &_header;
 
-    header_size = sizeof(FvdHeader);
-    header_size = ROUND_UP(header_size, DEF_PAGE_SIZE);
-    header = my_qemu_mallocz(header_size);
-    header->header_size = header_size;
+    memset(header, 0, sizeof(FvdHeader));
 
     /* Read out options */
     while (options && options->name) {
@@ -71,18 +80,14 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
             base_img_fmt = options->value.s;
         } else if (!strcmp(options->name, "copy_on_read")) {
             copy_on_read = options->value.n;
-        } else if (!strcmp(options->name, "data_file")) {
-            data_file = options->value.s;
-        } else if (!strcmp(options->name, "data_file_fmt")) {
-            data_file_fmt = options->value.s;
         } else if (!strcmp(options->name, "optimize_empty_block")) {
             hole_size = options->value.n;
-        } else if (!strcmp(options->name, "compact_image")) {
-            compact_image = options->value.n;
+        } else if (!strcmp(options->name, "raw_layout")) {
+            raw_layout = options->value.n;
         } else if (!strcmp(options->name, "block_size")) {
             block_size = options->value.n;
         } else if (!strcmp(options->name, "chunk_size")) {
-            header->chunk_size = options->value.n;
+            chunk_size = options->value.n;
         } else if (!strcmp(options->name, "journal_size")) {
             journal_size = options->value.n;
         } else if (!strcmp(options->name, "journal_buf_size")) {
@@ -137,6 +142,7 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
     }
 
     virtual_disk_size = ROUND_UP(virtual_disk_size, 512);
+    const bool compact_image = !raw_layout;
 
     /* Check if arguments are valid. */
     if (base_img && strlen(base_img) > 1023) {
@@ -145,33 +151,46 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
         return -EINVAL;
     }
 
-    if (base_img && hole_size > 0) {
-        if (compact_image) {
-            fprintf(stderr, "compact_image and optimize_empty_block cannot be "
-                    "enabled together. Please disable optimize_empty_block.\n");
-            return -EINVAL;
-        }
-        header->need_zero_init = true;
+    /* Set chunk size and block size. */
+    if (chunk_size <= 0) {
+        chunk_size = CHUNK_SIZE;
     } else {
-        header->need_zero_init = false;
+        chunk_size = ROUND_UP(chunk_size, 512);
     }
-
-    if (data_file) {
-        pstrcpy(header->data_file, 1024, data_file);
-        if (data_file_fmt) {
-            pstrcpy(header->data_file_fmt, 16, data_file_fmt);
-        }
+    if (block_size <= 0) {
+        block_size = chunk_size; /* By default, they are of the same size. */
+    } else {
+        block_size = ROUND_UP(block_size, 512);
+        /* Chunk size must be a multiple of block size. */
+        chunk_size = ROUND_UP(chunk_size, block_size);
+    }
+    if (!compact_image) {
+        /* Initialize chunk_size in case that later we want to convert a
+         * non-compact image into a compact image. */
+        chunk_size = block_size;
+    }
+    if (!base_img) {
+        /* Initialize block_size to simplify code in do_aio_write(). */
+        block_size = chunk_size;
     }
 
+    /* Initialize chunk_size and block_size even if it is not a compact image
+     * and has no base image, because later we might convert a non-compact
+     * image into a compact image. It guarantees that the table, the journal,
+     * and the data are all properly aligned on chunk_size boundary. */
+    header->chunk_size = chunk_size;
+    header->block_size = block_size;
+    header->header_size = sizeof(FvdHeader);
+    header_size = ROUND_UP(header->header_size, chunk_size);
+    header->header_padding_size = header_size - header->header_size;
+    header->need_zero_init = base_img && hole_size > 0;
     header->magic = FVD_MAGIC;
     header->last_open_version = header->create_version = FVD_VERSION;
     header->virtual_disk_size = virtual_disk_size;
     header->clean_shutdown = true;
 
-    if (!base_img) {
-        header->base_img_fully_prefetched = true;
-    } else {
-        /* Handle base image. */
+    /* Initialization related to base image. */
+    if (base_img) {
         int ret;
 
         bs = bdrv_new("");
@@ -202,28 +221,9 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
         base_img_size = bdrv_getlength(bs);
         base_img_size = MIN(virtual_disk_size, base_img_size);
         base_img_size = ROUND_UP(base_img_size, 512);
-
-        if (block_size <= 0) {
-            /* No block size is provided. Find the smallest block size that
-             * does not make the bitmap too big. */
-            block_size = 512;
-            while (1) {
-                int64_t blocks = (base_img_size + block_size - 1) / block_size;
-                bitmap_size = (blocks + 7) / 8;
-                if (bitmap_size <= MODERATE_BITMAP_SIZE) {
-                    break;
-                }
-                block_size *= 2;
-            }
-        } else {
-            block_size = ROUND_UP(block_size, 512);
-            int64_t blocks = (base_img_size + block_size - 1) / block_size;
-            bitmap_size = (blocks + 7) / 8;
-        }
-
-        bitmap_size = ROUND_UP(bitmap_size, DEF_PAGE_SIZE);
+        bitmap_size = calc_bitmap_size(base_img_size, block_size);
+        bitmap_size = ROUND_UP(bitmap_size, chunk_size);
         header->bitmap_size = bitmap_size;
-        header->block_size = block_size;
         header->bitmap_offset = header_size;
         header->base_img_size = base_img_size;
         header->max_outstanding_copy_on_read_data = max_copy_on_read;
@@ -241,32 +241,27 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
         header->prefetch_max_read_throughput = prefetch_max_read_throughput;
         header->prefetch_max_write_throughput = prefetch_max_write_throughput;
         header->base_img_fully_prefetched = false;
+    } else { /* no base image. */
+        header->base_img_fully_prefetched = true;
     }
 
     /* Set the table size. */
     if (compact_image) {
-        if (header->chunk_size <= 0) {
-            header->chunk_size = CHUNK_SIZE;
-        }
-        if (base_img) {
-            /* chunk_size must be a multiple of block_size. */
-            header->chunk_size = ROUND_UP(header->chunk_size, block_size);
-        } else {
-            header->chunk_size = ROUND_UP(header->chunk_size, DEF_PAGE_SIZE);
-        }
-
         if (header->storage_grow_unit <= 0) {
             header->storage_grow_unit = STORAGE_GROW_UNIT;
         }
-        if (header->storage_grow_unit < header->chunk_size) {
-            header->storage_grow_unit = header->chunk_size;
+        if (header->storage_grow_unit < chunk_size) {
+            header->storage_grow_unit = chunk_size;
         }
-        int64_t table_entries =
-            (virtual_disk_size + header->chunk_size - 1) / header->chunk_size;
-        table_size = sizeof(uint32_t) * table_entries;
-        table_size = ROUND_UP(table_size, DEF_PAGE_SIZE);
+
+        /* The table is aligned on chunk boundary and its size is a multiple
+         * of chunk_size. This allows the table to be relocated and grow if
+         * the virtual disk is resized to a larger size. */
+        table_size = calc_table_size(virtual_disk_size, chunk_size);
+        table_size = ROUND_UP(table_size, chunk_size);
+        header->table_size = table_size;
         if (table_size > 0) {
-            header->table_size = table_size;
+            header->table_offset = header_size + bitmap_size;
         }
     }
 
@@ -287,16 +282,19 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
             /* Make sure that the journal is at least large enough to record
              * all table changes in one shot, which is the extremely unlikely
              * worst case. */
-            int64_t vsize = virtual_disk_size + header->chunk_size - 1;
-            int64_t table_entries = vsize / header->chunk_size;
+            int64_t table_entries = DIV_ROUND_UP(virtual_disk_size, chunk_size);
             int64_t min_journal_size = calc_min_journal_size(table_entries);
             if (journal_size < min_journal_size) {
                 journal_size = min_journal_size;
             }
         }
-        journal_size = ROUND_UP(journal_size, DEF_PAGE_SIZE);
+
+        /* The journal is aligned on chunk boundary and its size is a
+         * multiple of chunk_size. This allows the journal to be relocated
+         * and grow if journal_size is increased by 'qemu-img update'. */
+        journal_size = ROUND_UP(journal_size, chunk_size);
         header->journal_size = journal_size;
-        header->journal_offset = header_size + bitmap_size;
+        header->journal_offset = header_size + bitmap_size + table_size;
 
         if (header->journal_buf_size <= 0) {
             header->journal_buf_size = JOURNAL_BUF_SIZE;
@@ -310,32 +308,35 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
         }
     }
 
-    if (table_size > 0) {
-        /* Table is located right before the data region. When expanding the
-         * size of an existing FVD image, the table can be expanded to borrow
-         * space from the data region, by relocating some data chunks. */
-        header->table_offset = header_size + bitmap_size + journal_size;
-    }
-
-    header->data_offset = header_size + bitmap_size + table_size +
-        MAX(0, journal_size);
-
     /* Create the image file. */
     fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
     if (fd < 0) {
         fprintf(stderr, "Failed to open %s\n", filename);
         goto fail;
     }
-    fvd_header_cpu_to_le(header);
 
-    if (qemu_write_full(fd, header, header_size) != header_size) {
+    /* Write the header. */
+    fvd_header_cpu_to_le(header);
+    if (qemu_write_full(fd, header, sizeof(FvdHeader)) != sizeof(FvdHeader)) {
         fprintf(stderr, "Failed to write the header of %s\n", filename);
         goto fail;
     }
 
+    /* Write padding zeros after the header. */
+    if (header_size > sizeof(FvdHeader)) {
+        size_t n = header_size - sizeof(FvdHeader);
+        uint8_t *p = my_qemu_mallocz(n);
+        if (qemu_write_full(fd, p, n) != n) {
+            fprintf(stderr, "Failed to write header padding of %s\n", filename);
+            goto fail;
+        }
+        my_qemu_free(p);
+    }
+
     /* Initialize the bitmap. */
+    uint8_t *bitmap = NULL;
     if (bitmap_size > 0) {
-        uint8_t *bitmap = my_qemu_mallocz(bitmap_size);
+        bitmap = my_qemu_mallocz(bitmap_size);
         if (hole_size > 0) {
             if ((ret = search_empty_blocks(fd, bitmap, bs, base_img_size / 512,
                                            hole_size, block_size))) {
@@ -344,13 +345,46 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
         }
 
         ret = qemu_write_full(fd, bitmap, bitmap_size);
-        my_qemu_free(bitmap);
         if (ret != bitmap_size) {
             fprintf(stderr, "Failed to zero out the bitmap of %s\n", filename);
             goto fail;
         }
     }
 
+    /* Initialize the table. */
+    if (table_size > 0) {
+        int i;
+        uint32_t *table = my_qemu_mallocz(table_size);
+
+        if (bitmap_size > 0 && hole_size > 0) {
+            /* bitmap has pre-set bits for empty blocks. Need to initialize
+             * the table for those empty blocks as well to ensure that an empty
+             * block will never be mapped to a place that is written
+             * before and hence with potentially non-zero content. */
+            int blocks = DIV_ROUND_UP(base_img_size, block_size);
+            uint32_t next_avail_chunk = (header_size + bitmap_size +
+                                    table_size + journal_size) / chunk_size;
+            for (i = 0; i < blocks; i++) {
+                if (test_bit(i, (unsigned long*)bitmap)) {
+                    int chunk = i * block_size / chunk_size;
+                    if (IS_EMPTY(table[chunk])) {
+                        table[chunk] = next_avail_chunk++;
+                    }
+                }
+            }
+        }
+
+        ret = qemu_write_full(fd, table, table_size);
+        my_qemu_free(table);
+        if (ret != table_size) {
+            fprintf(stderr, "Failed to write the table of %s\n.", filename);
+            goto fail;
+        }
+    }
+    if (bitmap) {
+        my_qemu_free(bitmap);
+    }
+
     /* Initialize the journal. */
     if (journal_size > 0) {
         uint8_t *empty_journal = my_qemu_malloc(journal_size);
@@ -363,23 +397,9 @@  static int fvd_create(const char *filename, QEMUOptionParameter * options)
         }
     }
 
-    /* Initialize the table. */
-    if (table_size > 0) {
-        /* Set all entries to EMPTY_TABLE (0xFFFFFFFF). */
-        uint8_t *empty_table = my_qemu_malloc(table_size);
-        memset(empty_table, 0xFF, table_size);
-        ret = qemu_write_full(fd, empty_table, table_size);
-        my_qemu_free(empty_table);
-        if (ret != table_size) {
-            fprintf(stderr, "Failed to write the table of %s\n.", filename);
-            goto fail;
-        }
-    }
-
     if (bs) {
         bdrv_close(bs);
     }
-    my_qemu_free(header);
     return 0;
 
 fail:
@@ -387,7 +407,6 @@  fail:
         bdrv_close(bs);
     }
     close(fd);
-    my_qemu_free(header);
     return -EIO;
 }
 
@@ -468,10 +487,10 @@  static void fvd_header_cpu_to_le(FvdHeader * header)
 {
     cpu_to_le32s(&header->magic);
     cpu_to_le32s(&header->header_size);
+    cpu_to_le32s(&header->header_padding_size);
     cpu_to_le32s(&header->create_version);
     cpu_to_le32s(&header->last_open_version);
     cpu_to_le32s((uint32_t *) & header->base_img_fully_prefetched);
-    cpu_to_le64s((uint64_t *) & header->data_offset);
     cpu_to_le64s((uint64_t *) & header->virtual_disk_size);
     cpu_to_le64s((uint64_t *) & header->base_img_size);
     cpu_to_le64s((uint64_t *) & header->max_outstanding_copy_on_read_data);
@@ -491,6 +510,10 @@  static void fvd_header_cpu_to_le(FvdHeader * header)
     cpu_to_le64s((uint64_t *) & header->prefetch_max_write_throughput);
     cpu_to_le64s((uint64_t *) & header->block_size);
     cpu_to_le64s((uint64_t *) & header->chunk_size);
+    cpu_to_le64s((uint64_t *) & header->refcount_offset);
+    cpu_to_le64s((uint64_t *) & header->refcount_size);
+    cpu_to_le64s((uint64_t *) & header->vm_state_offset);
+    cpu_to_le64s((uint64_t *) & header->vm_state_size);
     cpu_to_le64s((uint64_t *) & header->storage_grow_unit);
     cpu_to_le64s((uint64_t *) & header->table_offset);
     cpu_to_le32s((uint32_t *) & header->clean_shutdown);
@@ -505,10 +528,10 @@  static void fvd_header_le_to_cpu(FvdHeader * header)
 {
     le32_to_cpus(&header->magic);
     le32_to_cpus(&header->header_size);
+    le32_to_cpus(&header->header_padding_size);
     le32_to_cpus(&header->create_version);
     le32_to_cpus(&header->last_open_version);
     le32_to_cpus((uint32_t *) & header->base_img_fully_prefetched);
-    le64_to_cpus((uint64_t *) & header->data_offset);
     le64_to_cpus((uint64_t *) & header->virtual_disk_size);
     le64_to_cpus((uint64_t *) & header->base_img_size);
     le64_to_cpus((uint64_t *) & header->max_outstanding_copy_on_read_data);
@@ -528,6 +551,10 @@  static void fvd_header_le_to_cpu(FvdHeader * header)
     le64_to_cpus((uint64_t *) & header->prefetch_max_write_throughput);
     le64_to_cpus((uint64_t *) & header->block_size);
     le64_to_cpus((uint64_t *) & header->chunk_size);
+    le64_to_cpus((uint64_t *) & header->refcount_offset);
+    le64_to_cpus((uint64_t *) & header->refcount_size);
+    le64_to_cpus((uint64_t *) & header->vm_state_offset);
+    le64_to_cpus((uint64_t *) & header->vm_state_size);
     le64_to_cpus((uint64_t *) & header->storage_grow_unit);
     le64_to_cpus((uint64_t *) & header->table_offset);
     le32_to_cpus((uint32_t *) & header->clean_shutdown);
@@ -540,11 +567,11 @@  static void fvd_header_le_to_cpu(FvdHeader * header)
 
 /* This function can handle incompatibility issues between different FVD
  * versions, specifically, FvdHeader might have different sizes. */
-static int read_fvd_header(BDRVFvdState * s, FvdHeader * header)
+static int read_fvd_header(BlockDriverState * bs, FvdHeader * header)
 {
     /* FvdHeader of different FVD versions might have different sizes. Read
      * header->header_size first. */
-    if (bdrv_pread(s->fvd_metadata, 0, header, 512) != 512) {
+    if (bdrv_pread(bs->file, 0, header, 512) != 512) {
         fprintf(stderr, "Failed to read the FVD header.\n");
         return -EIO;
     }
@@ -553,7 +580,7 @@  static int read_fvd_header(BDRVFvdState * s, FvdHeader * header)
      * FVD version that created the image and this FVD version. */
     le32_to_cpus(&header->header_size);
     int common_size = MIN(header->header_size, sizeof(FvdHeader));
-    if (bdrv_pread(s->fvd_metadata, 0, header, common_size) != common_size) {
+    if (bdrv_pread(bs->file, 0, header, common_size) != common_size) {
         fprintf(stderr, "Failed to read the FVD header.\n");
         return -EIO;
     }
@@ -571,34 +598,35 @@  static int read_fvd_header(BDRVFvdState * s, FvdHeader * header)
 
 /* This function can handle incompatibility issues between different FVD
  * versions, specifically, FvdHeader might have different sizes. */
-static int update_fvd_header(BDRVFvdState * s, FvdHeader * header)
+static int update_fvd_header(BlockDriverState * bs, FvdHeader * header)
 {
     /* FvdHeader of different FVD versions might have different sizes. Only
      * write the part of FvdHeader that is commonly understandable to the
      * FVD version that created the image and this FVD version. */
     int common_size = MIN(header->header_size, sizeof(FvdHeader));
     fvd_header_cpu_to_le(header);
-    int ret = bdrv_pwrite(s->fvd_metadata, 0, header, common_size);
+    int ret = bdrv_pwrite(bs->file, 0, header, common_size);
 
     if (ret != common_size) {
         fprintf(stderr, "Failed to update the FVD header.\n");
-        ASSERT(false);
         return -EIO;
     }
 
     return 0;
 }
 
-static inline void update_clean_shutdown_flag(BDRVFvdState * s, int clean)
+static inline void update_clean_shutdown_flag(BlockDriverState * bs, int clean)
 {
+    BDRVFvdState *s = bs->opaque;
     FvdHeader header;
-    if (!read_fvd_header(s, &header)) {
+
+    if (!read_fvd_header(bs, &header)) {
         header.last_open_version = FVD_VERSION;
         header.clean_shutdown = clean;
 
-        if (!update_fvd_header(s, &header)) {
+        if (!update_fvd_header(bs, &header)) {
             QDEBUG("Set clean_shutdown to %s\n", BOOL(clean));
-            if (bdrv_flush(s->fvd_metadata)) {
+            if (bdrv_flush(bs->file)) {
                 s->metadata_err_prohibit_write = true;
             }
         }
@@ -612,9 +640,9 @@  static QEMUOptionParameter fvd_create_options[] = {
      .type = OPT_SIZE,
      .help = "Virtual disk size"},
     {
-     .name = "compact_image",
+     .name = "raw_layout",
      .type = OPT_FLAG,
-     .help = "compact_image=on|off"},
+     .help = "raw_layout=on|off"},
     {
      .name = "block_size",
      .type = OPT_SIZE,
@@ -640,14 +668,6 @@  static QEMUOptionParameter fvd_create_options[] = {
      .type = OPT_STRING,
      .help = "Image format of the backing image"},
     {
-     .name = "data_file",
-     .type = OPT_STRING,
-     .help = "File name of a data file"},
-    {
-     .name = "data_file_fmt",
-     .type = OPT_STRING,
-     .help = "Image format of the data file"},
-    {
      .name = "copy_on_read",
      .type = OPT_FLAG,
      .help = "copy_on_read=on|off"},
diff --git a/block/fvd-debug.c b/block/fvd-debug.c
index 36b4c43..4c82da3 100644
--- a/block/fvd-debug.c
+++ b/block/fvd-debug.c
@@ -51,6 +51,7 @@  static void TRACE_STORE_IN_FVD(const char *str, int64_t sector_num,
 
 #ifndef FVD_DEBUG
 #define my_qemu_malloc qemu_malloc
+#define my_qemu_realloc(p,old_size,new_size) qemu_realloc(p,new_size)
 #define my_qemu_mallocz qemu_mallocz
 #define my_qemu_blockalign qemu_blockalign
 #define my_qemu_free qemu_free
@@ -71,6 +72,10 @@  static int alloc_line;
 #define my_qemu_malloc(size) \
     ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_malloc(size)))
 
+#define my_qemu_realloc(p,old_size,new_size) \
+    ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, \
+             _my_qemu_realloc(p,old_size,new_size)))
+
 #define my_qemu_mallocz(size) \
     ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_mallocz(size)))
 
@@ -94,7 +99,6 @@  static void COPY_UUID(FvdAIOCB * to, FvdAIOCB * from)
 {
     if (from) {
         to->uuid = from->uuid;
-        FVD_DEBUG_ACB(to);
     }
 }
 
@@ -210,8 +214,6 @@  static inline void *_my_qemu_aio_get(AIOPool * pool, BlockDriverState * bs,
     acb->uuid = ++fvd_uuid;
     acb->magic = FVDAIOCB_MAGIC;
 
-    FVD_DEBUG_ACB(acb);
-
 #ifdef DEBUG_MEMORY_LEAK
     trace_alloc(&acb->tracer, -1);
 #endif
@@ -306,6 +308,14 @@  static inline void _my_qemu_vfree(void *ptr)
 #endif
 }
 
+static inline void *_my_qemu_realloc(void *p, size_t old_size, size_t new_size)
+{
+    void *q = _my_qemu_malloc(new_size);
+    memcpy(q, p, MIN(old_size, new_size));
+    _my_qemu_free(p);
+    return q;
+}
+
 static void count_pending_requests(BDRVFvdState * s)
 {
     int m = 0, n = 0, k = 0;
diff --git a/block/fvd-flush.c b/block/fvd-flush.c
deleted file mode 100644
index 6658d27..0000000
--- a/block/fvd-flush.c
+++ /dev/null
@@ -1,192 +0,0 @@ 
-/*
- * QEMU Fast Virtual Disk Format Misc Functions of BlockDriver Interface
- *
- * Copyright IBM, Corp. 2010
- *
- * Authors:
- *    Chunqiang Tang <ctang@us.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING.LIB file in the top-level directory.
- *
- */
-
-static void aio_wrapper_bh(void *opaque);
-static int bjnl_sync_flush(BlockDriverState * bs);
-static bool bjnl_clean_buf_on_aio_flush(BlockDriverState *bs,
-                              BlockDriverCompletionFunc * cb,
-                              void *opaque, BlockDriverAIOCB **p_acb);
-static BlockDriverAIOCB *fvd_aio_flush_start(BlockDriverState * bs,
-                              BlockDriverCompletionFunc * cb,
-                              void *opaque, FvdAIOCB *parent_acb);
-
-static int fvd_flush(BlockDriverState * bs)
-{
-    BDRVFvdState *s = bs->opaque;
-    int ret;
-
-    QDEBUG("fvd_flush() invoked\n");
-
-    if (s->metadata_err_prohibit_write) {
-        return -EIO;
-    }
-
-    if (!s->fvd_metadata->enable_write_cache) {
-        /* No need to flush since it uses O_DSYNC. */
-        return 0;
-    }
-
-    if (s->use_bjnl) {
-        return bjnl_sync_flush(bs);
-    }
-
-    /* Simply flush for unbuffered journal update. */
-    if ((ret = bdrv_flush(s->fvd_data))) {
-        return ret;
-    }
-    if (s->fvd_metadata == s->fvd_data) {
-        return 0;
-    }
-    return bdrv_flush(s->fvd_metadata);
-}
-
-static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
-                                       BlockDriverCompletionFunc * cb,
-                                       void *opaque)
-{
-    BDRVFvdState *s = bs->opaque;
-    BlockDriverAIOCB * pacb;
-    FvdAIOCB  *acb;
-
-    QDEBUG("fvd_aio_flush() invoked\n");
-
-    if (s->metadata_err_prohibit_write) {
-        return NULL;
-    }
-
-    if (!s->fvd_data->enable_write_cache) {
-        /* Need to flush since it uses O_DSYNC. Use a QEMUBH to invoke the
-         * callback. */
-
-        if (!(acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque))) {
-            return NULL;
-        }
-
-        acb->type = OP_WRAPPER;
-        acb->cancel_in_progress = false;
-        acb->wrapper.bh = qemu_bh_new(aio_wrapper_bh, acb);
-        qemu_bh_schedule(acb->wrapper.bh);
-        return &acb->common;
-    }
-
-    if (!s->use_bjnl) {
-        QDEBUG("FLUSH: start now for unbuffered journal update");
-        return fvd_aio_flush_start(bs, cb, opaque, NULL);
-    }
-
-    if (bjnl_clean_buf_on_aio_flush(bs, cb, opaque, &pacb)) {
-        /* Waiting for the journal buffer to be cleaned first. */
-        return pacb;
-    }
-
-    /* No buffered journal data. Start flush now. */
-    QDEBUG("FLUSH: start now as no buffered journal data");
-    return fvd_aio_flush_start(bs, cb, opaque, NULL);
-}
-
-static inline void finish_flush(FvdAIOCB * acb)
-{
-    QDEBUG("FLUSH: acb%llu-%p  finish_flush ret=%d\n",
-           acb->uuid, acb, acb->flush.ret);
-    acb->common.cb(acb->common.opaque, acb->flush.ret);
-    my_qemu_aio_release(acb);
-}
-
-static void flush_data_cb(void *opaque, int ret)
-{
-    FvdAIOCB *acb = opaque;
-
-    if (acb->cancel_in_progress) {
-        return;
-    }
-
-    QDEBUG("FLUSH: acb%llu-%p  flush_data_cb ret=%d\n", acb->uuid, acb, ret);
-
-    if (acb->flush.ret == 0) {
-        acb->flush.ret = ret;
-    }
-
-    acb->flush.data_acb = NULL;
-    acb->flush.num_finished++;
-    if (acb->flush.num_finished == 2) {
-        finish_flush(acb);
-    }
-}
-
-static void flush_metadata_cb(void *opaque, int ret)
-{
-    FvdAIOCB *acb = opaque;
-
-    if (acb->cancel_in_progress) {
-        return;
-    }
-
-    QDEBUG("FLUSH: acb%llu-%p  flush_metadata_cb ret=%d\n",
-           acb->uuid, acb, ret);
-
-    if (acb->flush.ret == 0) {
-        acb->flush.ret = ret;
-    }
-
-    acb->flush.metadata_acb = NULL;
-    acb->flush.num_finished++;
-    if (acb->flush.num_finished == 2) {
-        finish_flush(acb);
-    }
-}
-
-static BlockDriverAIOCB *fvd_aio_flush_start(BlockDriverState * bs,
-                                       BlockDriverCompletionFunc * cb,
-                                       void *opaque, FvdAIOCB *parent_acb)
-{
-    BDRVFvdState *s = bs->opaque;
-    FvdAIOCB  *acb;
-
-    if (s->fvd_data == s->fvd_metadata) {
-        if (parent_acb) {
-            QDEBUG("FLUSH: acb%llu-%p  started.\n",parent_acb->uuid,parent_acb);
-        }
-        return bdrv_aio_flush(s->fvd_metadata, cb, opaque);
-    }
-
-    acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
-    if (!acb) {
-        return NULL;
-    }
-    COPY_UUID(acb, parent_acb); /* UUID helps debugging. */
-
-    /* fvd_data and fvd_metadata are different. Need to flush both. The order
-     * is not important. If (cache != writethrough && bitmap_updated), a flush
-     * on fvd_data must have already been performed by write_journal_buf(). */
-
-    acb->type = OP_FLUSH;
-    acb->cancel_in_progress = false;
-    acb->flush.num_finished = 0;
-    acb->flush.ret = 0;
-    acb->flush.data_acb = bdrv_aio_flush(s->fvd_data, flush_data_cb, acb);
-    if (!acb->flush.data_acb) {
-        my_qemu_aio_release(acb);
-        return NULL;
-    }
-
-    acb->flush.metadata_acb = bdrv_aio_flush(s->fvd_metadata,
-                                             flush_metadata_cb, acb);
-    if (!acb->flush.metadata_acb) {
-        bdrv_aio_cancel(acb->flush.data_acb);
-        my_qemu_aio_release(acb);
-        return NULL;
-    }
-
-    QDEBUG("FLUSH: acb%llu-%p  started.\n", acb->uuid, acb);
-    return &acb->common;
-}
diff --git a/block/fvd-journal-buf.c b/block/fvd-journal-buf.c
index c6b60f9..35a1bef 100644
--- a/block/fvd-journal-buf.c
+++ b/block/fvd-journal-buf.c
@@ -119,7 +119,7 @@  static inline void bjnl_handle_aio_flush(FvdAIOCB *acb)
     if (!s->metadata_err_prohibit_write) {
         /* Buffered data have been written to journal. Now start flush. */
         QDEBUG("JOURNAL: bjnl_start_flush for acb%llu-%p\n", acb->uuid, acb);
-        acb->jcb.hd_acb = fvd_aio_flush_start(bs, bjnl_aio_flush_cb, acb, acb);
+        acb->jcb.hd_acb = bdrv_aio_flush(bs->file, bjnl_aio_flush_cb, acb);
         if (acb->jcb.hd_acb) {
             return;
         }
@@ -177,7 +177,7 @@  static int bjnl_write_buf_start(FvdAIOCB *acb)
 
     PRINT_JRECORDS(acb->jcb.iov.iov_base, acb->jcb.iov.iov_len);
 
-    acb->jcb.hd_acb = bdrv_aio_writev(s->fvd_metadata,
+    acb->jcb.hd_acb = bdrv_aio_writev(bs->file,
                                       s->journal_offset + journal_sec,
                                       &acb->jcb.qiov, nb_sectors,
                                       bjnl_write_buf_cb, acb);
@@ -215,7 +215,6 @@  static void bjnl_flush_data_before_update_bitmap_cb(void *opaque, int ret)
 static inline int bjnl_write_buf(FvdAIOCB *acb)
 {
     BlockDriverState *bs = acb->common.bs;
-    BDRVFvdState *s = bs->opaque;
 
     QDEBUG("JOURNAL: bjnl_write_buf acb%llu-%p\n", acb->uuid, acb);
 
@@ -231,11 +230,11 @@  static inline int bjnl_write_buf(FvdAIOCB *acb)
         return bjnl_write_buf_start(acb);
     }
 
-    /* If bitmap_updated, fvd_data need be flushed first before bitmap changes
+    /* If bitmap_updated, data need be flushed first before bitmap changes
      * can be committed. Otherwise, a host crashes after bitmap metadata are
      * updated but before the corresponding data are persisted on disk, the VM
      * will get corrupted data, as correct data may be in the base image. */
-    acb->jcb.hd_acb = bdrv_aio_flush(s->fvd_data,
+    acb->jcb.hd_acb = bdrv_aio_flush(bs->file,
                                      bjnl_flush_data_before_update_bitmap_cb,
                                      acb);
     if (acb->jcb.hd_acb) {
@@ -355,7 +354,7 @@  use_current_buf:
     record_size = ROUND_UP(record_size, 512);
     s->bjnl.buf_size = MAX(record_size, s->bjnl.def_buf_size);
     s->bjnl.buf_contains_bitmap_update = update_bitmap;
-    s->bjnl.buf = my_qemu_blockalign(s->fvd_metadata, s->bjnl.buf_size);
+    s->bjnl.buf = my_qemu_blockalign(bs->file, s->bjnl.buf_size);
 
     return s->bjnl.buf;
 }
@@ -461,8 +460,8 @@  static int bjnl_sync_flush(BlockDriverState * bs)
     if (bitmap_updated) {
         /* Need a flush to ensure the correct semantics of copy-on-write in
          * the event of a host crash. */
-        QDEBUG("JOURNAL: bjnl_sync_flush bitmap_updated flush_fvd_data\n");
-        if ((ret = bdrv_flush(s->fvd_data))) {
+        QDEBUG("JOURNAL: bjnl_sync_flush bitmap_updated flush_data\n");
+        if ((ret = bdrv_flush(bs->file))) {
             goto cleanup;
         }
     }
@@ -481,7 +480,7 @@  static int bjnl_sync_flush(BlockDriverState * bs)
     s->next_journal_sector += nb_sectors;
 
     /* Copy all metadata updates into one buffer. */
-    p = buf = my_qemu_blockalign(s->fvd_metadata, buf_size);
+    p = buf = my_qemu_blockalign(bs->file, buf_size);
     acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
     while (acb) {
         if (acb->type == OP_BJNL_BUF_WRITE) {
@@ -503,7 +502,7 @@  static int bjnl_sync_flush(BlockDriverState * bs)
 
     /* Write all metadata updates synchronously. */
     QDEBUG("JOURNAL: bjnl_sync_flush write_buffer\n");
-    if ((ret=bdrv_write(s->fvd_metadata, s->journal_offset + journal_sec,
+    if ((ret=bdrv_write(bs->file, s->journal_offset + journal_sec,
                         buf, nb_sectors)) < 0) {
         goto cleanup;
     }
@@ -511,12 +510,7 @@  static int bjnl_sync_flush(BlockDriverState * bs)
 done:
     /* Flush finally. */
     QDEBUG("JOURNAL: bjnl_sync_flush do final flush\n");
-    if (s->fvd_data != s->fvd_metadata) {
-        if ((ret = bdrv_flush(s->fvd_data)) != 0) {
-            goto cleanup;
-        }
-    }
-    ret = bdrv_flush(s->fvd_metadata);
+    ret = bdrv_flush(bs->file);
 
 cleanup:
     if (buf) {
@@ -557,24 +551,28 @@  static void print_jrecords(const uint8_t *sector, size_t len)
     while (sector < end) {
         uint32_t *type = (uint32_t *) sector;   /* Journal record type. */
         while ((uint8_t *) type < (sector + 512)) {
-            if (le32_to_cpu(*type) == BITMAP_JRECORD) {
-                uint32_t *nb_sectors = type + 1;
-                int64_t *sector_num = (int64_t *) (type + 2);
+            const uint32_t ctype = le32_to_cpup(type);
+            if (ctype == BITMAP_JRECORD) {
+                uint64_t *epoch = (uint64_t*) (type + 1); /* On-disk order. */
+                uint32_t *nb_sectors = (uint32_t*) (epoch + 1);
+                int64_t *sector_num = (int64_t *) (nb_sectors + 1);
 
-                QDEBUG("JOURNAL: write BITMAP_JRECORD sector_num=%" PRId64
-                       " nb_sectors=%u\n", le64_to_cpu(*sector_num),
-                       le32_to_cpu(*nb_sectors));
+                QDEBUG("JOURNAL: write BITMAP_JRECORD epoch=%"PRIu64
+                       " sector_num=%"PRId64" nb_sectors=%u\n",
+                       le64_to_cpupu(epoch),
+                       le64_to_cpupu((uint64_t*)sector_num),
+                       le32_to_cpup(nb_sectors));
 
                 /* First field of the next journal record. */
                 type = (uint32_t *) (sector_num + 1);
-            } else if (le32_to_cpu(*type) == TABLE_JRECORD) {
+            } else if (ctype == TABLE_JRECORD) {
                 uint64_t *epoch = (uint64_t *) (type + 1);
                 uint32_t *count = (uint32_t *) (epoch + 1);
                 uint32_t *offset = count + 1;
                 uint32_t *content = offset + 1;
-                const uint32_t chunk = le32_to_cpu(*offset);
-                const uint64_t epo = le64_to_cpu(*epoch);
-                const uint32_t n = le32_to_cpu(*count);
+                const uint32_t chunk = le32_to_cpup(offset);
+                const uint64_t epo = le64_to_cpupu(epoch);
+                const uint32_t n = le32_to_cpup(count);
                 uint32_t i;
 
                 QDEBUG("JOURNAL: write TABLE_JRECORD epoch=%" PRIu64
@@ -587,7 +585,7 @@  static void print_jrecords(const uint8_t *sector, size_t len)
                 type = content + n;     /* First field of the next record. */
             } else {
                 /* End of valid records in this journal sector. */
-                ASSERT(le32_to_cpu(*type) == EMPTY_JRECORD);
+                ASSERT(ctype == EMPTY_JRECORD);
                 break;
             }
         }
diff --git a/block/fvd-journal.c b/block/fvd-journal.c
index 11796b0..c01af05 100644
--- a/block/fvd-journal.c
+++ b/block/fvd-journal.c
@@ -19,6 +19,7 @@ 
  * bitmap_update and table_update.
  *   Format of a bitmap_update record:
  *         BITMAP_JRECORD (uint32_t)
+ *         journal_epoch (uint64_t)
  *         num_dirty_sectors (uint32_t)
  *         dirty_sector_begin (int64_t)
  *   Format of a table_update record:
@@ -48,8 +49,8 @@ 
 #define BITMAP_JRECORD               ((uint32_t)0xEF2AB8ED)
 #define TABLE_JRECORD                ((uint32_t)0xB4E6F7AC)
 #define EMPTY_JRECORD                ((uint32_t)0xA5A5A5A5)
-#define BITMAP_JRECORD_SIZE          (2 * sizeof(uint32_t) + sizeof(int64_t))
-#define TABLE_JRECORD_HDR_SIZE       (3 * sizeof(uint32_t) + sizeof(uint64_t))
+#define BITMAP_JRECORD_SIZE          (2*sizeof(uint32_t) + 2*sizeof(int64_t))
+#define TABLE_JRECORD_HDR_SIZE       (3*sizeof(uint32_t) + sizeof(uint64_t))
 #define TABLE_JRECORDS_PER_SECTOR \
                 ((512 - TABLE_JRECORD_HDR_SIZE)/sizeof(uint32_t))
 
@@ -120,7 +121,7 @@  static int init_journal(int read_only, BlockDriverState * bs,
         return 0;
     }
 
-    if (!read_only && !IN_QEMU_TOOL && s->fvd_metadata->enable_write_cache
+    if (!read_only && !IN_QEMU_TOOL && bs->file->enable_write_cache
         && header->journal_buf_size > 0) {
         s->use_bjnl = true;
         QTAILQ_INIT(&s->bjnl.queued_bufs);
@@ -142,9 +143,9 @@  static int init_journal(int read_only, BlockDriverState * bs,
     QDEBUG("Recover from the journal as the VM was not shut down gracefully "
            "last time.\n");
 
-    uint8_t *journal = my_qemu_blockalign(s->fvd_metadata,
+    uint8_t *journal = my_qemu_blockalign(bs->file,
                                           s->journal_size * 512);
-    int ret = bdrv_read(s->fvd_metadata, s->journal_offset,
+    int ret = bdrv_read(bs->file, s->journal_offset,
                         journal, s->journal_size);
     if (ret < 0) {
         my_qemu_vfree(journal);
@@ -168,27 +169,32 @@  static int init_journal(int read_only, BlockDriverState * bs,
     while (sector < journal_end) {
         uint32_t *type = (uint32_t *) sector;   /* Journal record type. */
         while ((uint8_t *) type < (sector + 512)) {
-            if (le32_to_cpu(*type) == BITMAP_JRECORD) {
-                uint32_t *nb_sectors = type + 1;
-                int64_t *sector_num = (int64_t *) (type + 2);
-                if (s->stale_bitmap) {
-                    update_both_bitmaps(s, le64_to_cpu(*sector_num),
-                                        le32_to_cpu(*nb_sectors));
+            const uint32_t ctype = le32_to_cpup(type);
+            if (ctype == BITMAP_JRECORD) {
+                uint64_t *epoch = (uint64_t*)(type + 1);
+                uint32_t *nb_sectors = (uint32_t*)(epoch + 1);
+                int64_t *sector_num = (int64_t *) (nb_sectors + 1);
+                const uint64_t epo = le64_to_cpupu(epoch);
+
+                if (s->stale_bitmap && epo > header->stable_journal_epoch) {
+                    update_both_bitmaps(s, le64_to_cpupu((uint64_t*)sector_num),
+                                        le32_to_cpup(nb_sectors));
                     QDEBUG("JOURNAL: recover BITMAP_JRECORD sector_num=%"
                            PRId64 " nb_sectors=%u\n",
-                           le64_to_cpu(*sector_num), le32_to_cpu(*nb_sectors));
+                           le64_to_cpupu((uint64_t*)sector_num),
+                           le32_to_cpup(nb_sectors));
                 }
 
                 /* First field of the next journal record. */
-                type = (uint32_t *) (sector_num + 1);
-            } else if (le32_to_cpu(*type) == TABLE_JRECORD) {
+                type = (uint32_t *)(((uint8_t*)type) + BITMAP_JRECORD_SIZE);
+            } else if (ctype == TABLE_JRECORD) {
                 uint64_t *epoch = (uint64_t *) (type + 1);
                 uint32_t *count = (uint32_t *) (epoch + 1);
                 uint32_t *offset = count + 1;
                 uint32_t *content = offset + 1;
-                const uint32_t chunk = le32_to_cpu(*offset);
-                const uint64_t epo = le64_to_cpu(*epoch);
-                const uint32_t n = le32_to_cpu(*count);
+                const uint32_t chunk = le32_to_cpup(offset);
+                const uint64_t epo = le64_to_cpupu(epoch);
+                const uint32_t n = le32_to_cpup(count);
                 uint32_t i;
                 QDEBUG("JOURNAL: recover TABLE_JRECORD epoch=%" PRIu64
                        " chunk_start=%u " "nb_chunks=%u\n", epo, chunk, n);
@@ -202,9 +208,9 @@  static int init_journal(int read_only, BlockDriverState * bs,
                         chunk_epoch[chunk + i] = epo;
                         s->table[chunk + i] = content[i];
 
-                        /* The dirty bit was not cleaned when the table entry
-                         * was saved in the journal. */
-                        CLEAN_DIRTY2(s->table[chunk + i]);
+                        /* The dirty bit and the shared bit were not cleaned
+                         * when the table entry was saved in the journal. */
+                        CLEAN_DIRTY_AND_SHARED(s->table[chunk + i]);
                         QDEBUG("\tAccept mapping chunk %u to %u\n",
                                chunk + i, READ_TABLE(content[i]));
                     } else {
@@ -218,7 +224,7 @@  static int init_journal(int read_only, BlockDriverState * bs,
                 }
             } else {
                 /* End of valid records in this journal sector. */
-                ASSERT(le32_to_cpu(*type) == EMPTY_JRECORD);
+                ASSERT(ctype == EMPTY_JRECORD);
                 break;
             }
         }
@@ -298,7 +304,7 @@  static void write_metadata_to_journal_cb(void *opaque, int ret)
             const uint32_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
                 / s->chunk_size;
             for (i = first_chunk; i <= last_chunk; i++) {
-                CLEAN_DIRTY2(s->table[i]);
+                CLEAN_DIRTY(s->table[i]); /* Not dirty anymore. */
             }
         }
 
@@ -352,7 +358,7 @@  static inline uint8_t * alloc_journal_records(FvdAIOCB *acb,
         return NULL;
     }
 
-    uint8_t *buf = my_qemu_blockalign(s->fvd_metadata, 512 * nb_sectors);
+    uint8_t *buf = my_qemu_blockalign(bs->file, 512 * nb_sectors);
     if (buf_size % 512 != 0) {
         *((uint32_t*)(buf + buf_size)) = EMPTY_JRECORD; /* Mark buffer end. */
     }
@@ -379,14 +385,17 @@  static uint8_t * create_journal_records(FvdAIOCB * acb,
         }
 
         uint32_t *type = (uint32_t *)buf; /*BITMAP_JRECORD*/
-        uint32_t *nb_sectors = type + 1;
-        int64_t *sector_num = (int64_t *) (type + 2);
-        *type = cpu_to_le32(BITMAP_JRECORD);
-        *nb_sectors = cpu_to_le32((uint32_t) acb->nb_sectors);
-        *sector_num = cpu_to_le64(acb->sector_num);
+        uint64_t *epoch = (uint64_t*)(type + 1);
+        uint32_t *nb_sectors = (uint32_t*)(epoch + 1);
+        int64_t *sector_num = (int64_t *) (nb_sectors + 1);
+
+        cpu_to_le32w(type, BITMAP_JRECORD);
+        cpu_to_le32w(nb_sectors, acb->nb_sectors);
+        cpu_to_le64wu((uint64_t*)sector_num, acb->sector_num);
+        cpu_to_le64wu(epoch, s->journal_epoch);
+
         QDEBUG("JOURNAL: record BITMAP_JRECORD sector_num=%" PRId64
                " nb_sectors=%u\n", acb->sector_num, acb->nb_sectors);
-
     } else if (!update_bitmap) {
         /* Only update the table. */
 
@@ -419,20 +428,19 @@  static uint8_t * create_journal_records(FvdAIOCB * acb,
             uint32_t *offset = count + 1;
             uint32_t *content = offset + 1;
 
-            *type = cpu_to_le32(TABLE_JRECORD);
-            *offset = cpu_to_le32(chunk);
-            *epoch = cpu_to_le64(s->journal_epoch);
-            s->journal_epoch++;
+            cpu_to_le32w(type, TABLE_JRECORD);
+            cpu_to_le32w(offset, chunk);
+            cpu_to_le64wu(epoch, s->journal_epoch);
             if (num_chunks <= TABLE_JRECORDS_PER_SECTOR) {
                 /* This is the last journal sector. */
-                *count = cpu_to_le32(num_chunks);
+                cpu_to_le32w(count, num_chunks);
                 memcpy(content, &s->table[chunk],
                        sizeof(uint32_t) * num_chunks);
                 PRINT_TABLE_JRECORD(type);
                 break;
             }
 
-            *count = cpu_to_le32(TABLE_JRECORDS_PER_SECTOR);
+            cpu_to_le32w(count, TABLE_JRECORDS_PER_SECTOR);
             memcpy(content, &s->table[chunk],
                    sizeof(uint32_t) * TABLE_JRECORDS_PER_SECTOR);
             chunk += TABLE_JRECORDS_PER_SECTOR;
@@ -490,14 +498,13 @@  static uint8_t * create_journal_records(FvdAIOCB * acb,
             uint32_t *offset = count + 1;
             uint32_t *content = offset + 1;
 
-            *type = cpu_to_le32(TABLE_JRECORD);
-            *offset = cpu_to_le32(chunk);
-            *epoch = cpu_to_le64(s->journal_epoch);
-            s->journal_epoch++;
+            cpu_to_le32w(type, TABLE_JRECORD);
+            cpu_to_le32w(offset, chunk);
+            cpu_to_le64wu(epoch, s->journal_epoch);
 
             if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
                 /* This is the last journal sector. */
-                *count = cpu_to_le32(num_chunks);
+                cpu_to_le32w(count, num_chunks);
                 memcpy(content, &s->table[chunk],
                        sizeof(uint32_t) * num_chunks);
                 PRINT_TABLE_JRECORD(type);
@@ -505,17 +512,21 @@  static uint8_t * create_journal_records(FvdAIOCB * acb,
                 /* A BITMAP_JRECORD follows a TABLE_JRECORD so that they are
                  * updated in one atomic operatoin. */
                 type = content + num_chunks;    /* BITMAP_JRECORD. */
-                uint32_t *p_nb_sectors = type + 1;
-                int64_t *p_sector_num = (int64_t *) (type + 2);
-                *type = cpu_to_le32(BITMAP_JRECORD);
-                *p_nb_sectors = cpu_to_le32(nb_sectors);
-                *p_sector_num = cpu_to_le64(sector_num);
+                uint64_t *p_epoch = (uint64_t*) (type + 1);
+                uint32_t *p_nb_sectors = (uint32_t*) (p_epoch + 1);
+                int64_t *p_sector_num = (int64_t *) (p_nb_sectors + 1);
+
+                cpu_to_le32w(type, BITMAP_JRECORD);
+                cpu_to_le64wu(p_epoch, s->journal_epoch);
+                cpu_to_le32w(p_nb_sectors, nb_sectors);
+                cpu_to_le64wu((uint64_t*)p_sector_num, sector_num);
+
                 QDEBUG("JOURNAL: record BITMAP_JRECORD sector_num=%" PRId64
                        " nb_sectors=%u\n", sector_num, nb_sectors);
                 break;
             }
 
-            *count = cpu_to_le32(MIXED_JRECORDS_PER_SECTOR);
+            cpu_to_le32w(count, MIXED_JRECORDS_PER_SECTOR);
             memcpy(content, &s->table[chunk],
                    sizeof(uint32_t) * MIXED_JRECORDS_PER_SECTOR);
             PRINT_TABLE_JRECORD(type);
@@ -523,16 +534,20 @@  static uint8_t * create_journal_records(FvdAIOCB * acb,
             /* A BITMAP_JRECORD follows a TABLE_JRECORD so that they are
              * updated in one atomic operatoin. */
             type = content + MIXED_JRECORDS_PER_SECTOR; /* BITMAP_JRECORD */
-            uint32_t *p_nb_sectors = type + 1;
-            int64_t *p_sector_num = (int64_t *) (type + 2);
-            *type = cpu_to_le32(BITMAP_JRECORD);
-            *p_nb_sectors = cpu_to_le32(nb_sectors);
-            *p_sector_num = cpu_to_le64(sector_num);
+            uint64_t *p_epoch = (uint64_t*) (type + 1);
+            uint32_t *p_nb_sectors = (uint32_t*) (p_epoch + 1);
+            int64_t *p_sector_num = (int64_t *) (p_nb_sectors + 1);
+
+            cpu_to_le32w(type, BITMAP_JRECORD);
+            cpu_to_le64wu(p_epoch, s->journal_epoch);
+            cpu_to_le32w(p_nb_sectors, nb_sectors);
+            cpu_to_le64wu((uint64_t*)p_sector_num, sector_num);
+
             QDEBUG("JOURNAL: record BITMAP_JRECORD sector_num=%" PRId64
                    " nb_sectors=%u\n", sector_num, nb_sectors);
 
             /* Prepare for the next journal sector. */
-            type = (uint32_t *) (p_sector_num + 1);
+            type = (uint32_t *) (((uint8_t*)type) + BITMAP_JRECORD_SIZE);
             chunk += MIXED_JRECORDS_PER_SECTOR;
             sector_num = chunk * s->chunk_size;
             num_chunks -= MIXED_JRECORDS_PER_SECTOR;
@@ -546,6 +561,8 @@  static uint8_t * create_journal_records(FvdAIOCB * acb,
         }
     }
 
+    s->journal_epoch++;
+
     if (p_journal_sec) {
         *p_journal_sec = journal_sec;
     }
@@ -611,9 +628,8 @@  static int flush_metadata_to_disk(BlockDriverState * bs,
     /* Clean DIRTY_TABLE bit and write the table to disk. */
     if (s->table) {
         int i;
-        int table_entries = ROUND_UP(s->virtual_disk_size,
-                                     s->chunk_size * 512) / (s->chunk_size *
-                                                             512);
+        int table_entries = DIV_ROUND_UP(s->virtual_disk_size,
+                                         s->chunk_size * 512);
         for (i = 0; i < table_entries; i++) {
             CLEAN_DIRTY(s->table[i]);
         }
@@ -621,7 +637,7 @@  static int flush_metadata_to_disk(BlockDriverState * bs,
         int nb = (int)(s->table_size / 512);
         QDEBUG("JOURNAL: flush table (%d sectors) to disk\n", nb);
 
-        if (bdrv_write(s->fvd_metadata, s->table_offset, (uint8_t *) s->table,
+        if (bdrv_write(bs->file, s->table_offset, (uint8_t *) s->table,
                        nb) < 0) {
             goto fail;
         }
@@ -629,22 +645,15 @@  static int flush_metadata_to_disk(BlockDriverState * bs,
 
     /* Write fresh_bitmap to disk. */
     if (s->fresh_bitmap) {
-        /* Ensure copy-on-read and prefetching data are stable. */
-        if (bdrv_flush(s->fvd_data)) {
+        /* Ensure data are stable before updating bitmap. */
+        if (bdrv_flush(bs->file)) {
             goto fail;
         }
 
-        if (s->fvd_data != s->fvd_metadata && s->table) {
-            /* Ensure table is stable before updating bitmap. */
-            if (bdrv_flush(s->fvd_metadata)) {
-                goto fail;
-            }
-        }
-
         int nb = (int)(s->bitmap_size / 512);
         QDEBUG("JOURNAL: flush bitmap (%d sectors) to disk\n", nb);
 
-        if (bdrv_write(s->fvd_metadata, s->bitmap_offset,
+        if (bdrv_write(bs->file, s->bitmap_offset,
                        s->fresh_bitmap, nb) < 0) {
             goto fail;
         }
@@ -656,7 +665,7 @@  static int flush_metadata_to_disk(BlockDriverState * bs,
     if (update_journal_epoch || update_base_img_prefetched) {
         /* Update the header. */
         FvdHeader header;
-        if (read_fvd_header(s, &header)) {
+        if (read_fvd_header(bs, &header)) {
             goto fail;
         }
         if (update_base_img_prefetched) {
@@ -665,13 +674,13 @@  static int flush_metadata_to_disk(BlockDriverState * bs,
         if (update_journal_epoch) {
             header.stable_journal_epoch = s->journal_epoch++;
         }
-        if (update_fvd_header(s, &header)) {
+        if (update_fvd_header(bs, &header)) {
             goto fail;
         }
     }
 
     /* Perform a final flush to ensure all metadata are stable. */
-    if (!bdrv_flush(s->fvd_metadata)) {
+    if (!bdrv_flush(bs->file)) {
         return 0;
     }
 
@@ -690,7 +699,7 @@  static void flush_metadata_to_disk_on_exit(BlockDriverState * bs)
 {
     BDRVFvdState *s = bs->opaque;
 
-    if (bs->read_only || !s->fvd_metadata) {
+    if (bs->read_only || !bs->file) {
         return;
     }
 
@@ -698,7 +707,7 @@  static void flush_metadata_to_disk_on_exit(BlockDriverState * bs)
      * so that it has to rely on journal for recovery. */
     if (s->journal_size <= 0 || !emulate_host_crash) {
         if (!flush_metadata_to_disk(bs, true, false) && !s->dirty_image) {
-            update_clean_shutdown_flag(s, true);
+            update_clean_shutdown_flag(bs, true);
         }
     }
 }
@@ -717,9 +726,9 @@  static void print_table_jrecord(uint32_t * type)
     uint32_t *p_offset = p_count + 1;
     uint32_t *content = p_offset + 1;
 
-    uint64_t epoch = le64_to_cpu(*p_epoch);
-    uint32_t count = le32_to_cpu(*p_count);
-    uint32_t offset = le32_to_cpu(*p_offset);
+    uint64_t epoch = le64_to_cpupu(p_epoch);
+    uint32_t count = le32_to_cpup(p_count);
+    uint32_t offset = le32_to_cpup(p_offset);
 
     QDEBUG("JOURNAL: record TABLE_JRECORD epoch=%" PRIu64
            " chunk_start=%u " "nb_chunks=%u\n", epoch, offset, count);
@@ -746,7 +755,7 @@  static void ujnl_write_metadata_to_journal_now(FvdAIOCB * acb,
     acb->jcb.iov.iov_base = buf;
     acb->jcb.iov.iov_len = 512 * nb_journal_sectors;
     qemu_iovec_init_external(&acb->jcb.qiov, &acb->jcb.iov, 1);
-    acb->jcb.hd_acb = bdrv_aio_writev(s->fvd_metadata,
+    acb->jcb.hd_acb = bdrv_aio_writev(bs->file,
                                       s->journal_offset + journal_sec,
                                       &acb->jcb.qiov, nb_journal_sectors,
                                       write_metadata_to_journal_cb, acb);
diff --git a/block/fvd-load.c b/block/fvd-load.c
index 9789cc5..011a990 100644
--- a/block/fvd-load.c
+++ b/block/fvd-load.c
@@ -20,7 +20,7 @@  static inline FvdAIOCB *init_load_acb(FvdAIOCB * parent_acb,
                     BlockDriverState * bs, int64_t sector_num,
                     QEMUIOVector * orig_qiov, int nb_sectors,
                     BlockDriverCompletionFunc * cb, void *opaque);
-static int load_create_child_requests(bool count_only, BDRVFvdState *s,
+static int load_create_child_requests(bool count_only, BlockDriverState *bs,
                     QEMUIOVector * orig_qiov, int64_t sector_num,
                     int nb_sectors, int *p_nziov, int *p_niov, int *p_nqiov,
                     FvdAIOCB *acb,  QEMUIOVector *q, struct iovec *v);
@@ -34,7 +34,7 @@  static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
 
     if (!s->table) {
         /* Load directly since it is not a compact image. */
-        return bdrv_aio_readv(s->fvd_data, s->data_offset + sector_num,
+        return bdrv_aio_readv(bs->file, s->data_offset + sector_num,
                               orig_qiov, nb_sectors, cb, opaque);
     } else {
         return load_data_from_compact_image(parent_acb, bs, sector_num,
@@ -57,7 +57,7 @@  static BlockDriverAIOCB *load_data_from_compact_image(FvdAIOCB * parent_acb,
 
     /* Count the number of qiov and iov needed to cover the continuous regions
      * of the compact image. */
-    load_create_child_requests(true/*count_only*/, s, orig_qiov, sector_num,
+    load_create_child_requests(true/*count_only*/, bs, orig_qiov, sector_num,
                           nb_sectors, &nziov, &niov, &nqiov, NULL, NULL, NULL);
 
     if (nqiov + nziov == 1) {
@@ -91,8 +91,8 @@  static BlockDriverAIOCB *load_data_from_compact_image(FvdAIOCB * parent_acb,
                        "load_directly_as_one_continuous_region\n",
                        parent_acb->uuid, parent_acb);
             }
-            return bdrv_aio_readv(s->fvd_data, s->data_offset + start_sec,
-                                  orig_qiov, nb_sectors, cb, opaque);
+            return bdrv_aio_readv(bs->file, start_sec, orig_qiov,
+                                  nb_sectors, cb, opaque);
         }
     }
 
@@ -110,7 +110,7 @@  static BlockDriverAIOCB *load_data_from_compact_image(FvdAIOCB * parent_acb,
     QEMUIOVector *q = (QEMUIOVector *) (acb->load.children + nqiov);
     struct iovec *v = (struct iovec *)(q + nqiov);
 
-    if (!load_create_child_requests(false/*count_only*/, s, orig_qiov,
+    if (!load_create_child_requests(false/*count_only*/, bs, orig_qiov,
                                     sector_num, nb_sectors, NULL, NULL,
                                     &nqiov, acb, q, v)) {
         return &acb->common;
@@ -215,8 +215,9 @@  static void fvd_aio_cancel_load_compact(FvdAIOCB * acb)
 static inline int load_create_one_child(bool count_only, bool empty,
                     QEMUIOVector * orig_qiov, int *iov_index, size_t *iov_left,
                     uint8_t **iov_buf, int64_t start_sec, int sectors_in_region,
-                    int *p_niov, int *p_nziov, int *p_nqiov, BDRVFvdState *s,
-                    FvdAIOCB *acb, QEMUIOVector **q, struct iovec **v)
+                    int *p_niov, int *p_nziov, int *p_nqiov,
+                    BlockDriverState *bs, FvdAIOCB *acb,
+                    QEMUIOVector **q, struct iovec **v)
 {
     int niov;
 
@@ -250,8 +251,8 @@  static inline int load_create_one_child(bool count_only, bool empty,
            " nb_sectors=%d niov=%d\n", acb->uuid, acb, *p_nqiov,
            start_sec, sectors_in_region, niov);
     acb->load.children[*p_nqiov].hd_acb =
-        bdrv_aio_readv(s->fvd_data, s->data_offset + start_sec, *q,
-                       sectors_in_region, load_data_from_compact_image_cb,
+        bdrv_aio_readv(bs->file, start_sec, *q, sectors_in_region,
+                       load_data_from_compact_image_cb,
                        &acb->load.children[*p_nqiov]);
     if (!acb->load.children[*p_nqiov].hd_acb) {
         return -1;
@@ -264,10 +265,11 @@  static inline int load_create_one_child(bool count_only, bool empty,
     return 0;
 }
 
-static int load_create_child_requests(bool count_only, BDRVFvdState *s,
+static int load_create_child_requests(bool count_only, BlockDriverState *bs,
     QEMUIOVector * orig_qiov, int64_t sector_num, int nb_sectors, int *p_nziov,
     int *p_niov, int *p_nqiov, FvdAIOCB *acb,  QEMUIOVector *q, struct iovec *v)
 {
+    BDRVFvdState *s = bs->opaque;
     const uint32_t first_chunk = sector_num / s->chunk_size;
     const uint32_t last_chunk = (sector_num + nb_sectors - 1) / s->chunk_size;
     int iov_index = 0;
@@ -276,7 +278,7 @@  static int load_create_child_requests(bool count_only, BDRVFvdState *s,
     int nziov = 0;      /* Number of empty regions. */
     int nqiov = 0;
     int niov = 0;
-    int64_t prev = READ_TABLE2(s->table[first_chunk]);
+    int64_t prev = READ_TABLE(s->table[first_chunk]);
     int64_t start_sec = -1;
     int sectors_in_region;
     int32_t chunk;
@@ -335,7 +337,7 @@  static int load_create_child_requests(bool count_only, BDRVFvdState *s,
                 if (load_create_one_child(count_only, false/*!empty*/,
                                     orig_qiov, &iov_index, &iov_left,
                                     &iov_buf, start_sec, sectors_in_region,
-                                    &niov, &nziov, &nqiov, s,
+                                    &niov, &nziov, &nqiov, bs,
                                     acb, &q, &v)) {
                     goto fail;
                 }
@@ -353,7 +355,7 @@  static int load_create_child_requests(bool count_only, BDRVFvdState *s,
     }
 
     for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
-        uint32_t current = READ_TABLE2(s->table[chunk]);
+        uint32_t current = READ_TABLE(s->table[chunk]);
         int64_t data_size;
 
         /* Check if the chunk spans over s->avail_storage. */
@@ -377,7 +379,7 @@  static int load_create_child_requests(bool count_only, BDRVFvdState *s,
                                             orig_qiov, &iov_index, &iov_left,
                                             &iov_buf, start_sec,
                                             sectors_in_region, &niov, &nziov,
-                                            &nqiov, s, acb, &q, &v);
+                                            &nqiov, bs, acb, &q, &v);
 
                         /* Start the first region of this split chunk. */
                         start_sec = current * s->chunk_size;
@@ -394,7 +396,7 @@  static int load_create_child_requests(bool count_only, BDRVFvdState *s,
                                     false/*!empty*/, orig_qiov, &iov_index,
                                     &iov_left, &iov_buf, start_sec,
                                     sectors_in_region, &niov, &nziov, &nqiov,
-                                    s, acb, &q, &v)) {
+                                    bs, acb, &q, &v)) {
                                 goto fail;
                             }
 
@@ -409,7 +411,7 @@  static int load_create_child_requests(bool count_only, BDRVFvdState *s,
                     if (load_create_one_child(count_only, false/*!empty*/,
                             orig_qiov, &iov_index, &iov_left, &iov_buf,
                             start_sec, sectors_in_region, &niov, &nziov,
-                            &nqiov, s, acb, &q, &v)) {
+                            &nqiov, bs, acb, &q, &v)) {
                         goto fail;
                     }
 
@@ -436,7 +438,8 @@  static int load_create_child_requests(bool count_only, BDRVFvdState *s,
             /* Terminate the previous region. */
             if (load_create_one_child(count_only, IS_EMPTY(prev), orig_qiov,
                     &iov_index, &iov_left, &iov_buf, start_sec,
-                    sectors_in_region, &niov, &nziov, &nqiov, s, acb, &q, &v)) {
+                    sectors_in_region, &niov, &nziov, &nqiov,
+                    bs, acb, &q, &v)) {
                 goto fail;
             }
 
@@ -475,8 +478,8 @@  static int load_create_child_requests(bool count_only, BDRVFvdState *s,
                " nb_sectors=%d niov=%d\n", acb->uuid, acb, nqiov, start_sec,
                sectors_in_region, niov);
         acb->load.children[nqiov].hd_acb =
-            bdrv_aio_readv(s->fvd_data, s->data_offset + start_sec, q,
-                           sectors_in_region, load_data_from_compact_image_cb,
+            bdrv_aio_readv(bs->file, start_sec, q, sectors_in_region,
+                           load_data_from_compact_image_cb,
                            &acb->load.children[nqiov]);
         if (!acb->load.children[nqiov].hd_acb) {
             goto fail;
diff --git a/block/fvd-misc.c b/block/fvd-misc.c
index 6315218..1505434 100644
--- a/block/fvd-misc.c
+++ b/block/fvd-misc.c
@@ -11,7 +11,7 @@ 
  *
  */
 
-static int read_fvd_header(BDRVFvdState * s, FvdHeader * header);
+static int read_fvd_header(BlockDriverState * bs, FvdHeader * header);
 static void fvd_aio_cancel_bjnl_buf_write(FvdAIOCB * acb);
 static void fvd_aio_cancel_bjnl_flush(FvdAIOCB * acb);
 static void fvd_aio_cancel_read(FvdAIOCB * acb);
@@ -21,17 +21,11 @@  static void fvd_aio_cancel_load_compact(FvdAIOCB * acb);
 static void fvd_aio_cancel_store_compact(FvdAIOCB * acb);
 static void fvd_aio_cancel_wrapper(FvdAIOCB * acb);
 static void flush_metadata_to_disk_on_exit (BlockDriverState *bs);
-
-static void fvd_aio_cancel_flush(FvdAIOCB * acb)
-{
-    if (acb->flush.data_acb) {
-        bdrv_aio_cancel(acb->flush.data_acb);
-    }
-    if (acb->flush.metadata_acb) {
-        bdrv_aio_cancel(acb->flush.metadata_acb);
-    }
-    my_qemu_aio_release(acb);
-}
+static int bjnl_sync_flush(BlockDriverState * bs);
+static int fvd_check_refcount(BlockDriverState *bs, BdrvCheckResult *res);
+static bool bjnl_clean_buf_on_aio_flush(BlockDriverState *bs,
+                              BlockDriverCompletionFunc * cb,
+                              void *opaque, BlockDriverAIOCB **p_acb);
 
 static void fvd_aio_cancel(BlockDriverAIOCB * blockacb)
 {
@@ -65,10 +59,6 @@  static void fvd_aio_cancel(BlockDriverAIOCB * blockacb)
         fvd_aio_cancel_wrapper(acb);
         break;
 
-    case OP_FLUSH:
-        fvd_aio_cancel_flush(acb);
-        break;
-
     case OP_BJNL_BUF_WRITE:
         fvd_aio_cancel_bjnl_buf_write(acb);
         break;
@@ -136,26 +126,15 @@  static void fvd_close(BlockDriverState * bs)
         s->table = NULL;
     }
 
-    if (s->fvd_metadata) {
-        if (s->fvd_metadata != s->fvd_data) {
-            bdrv_delete(s->fvd_metadata);
-        }
-        s->fvd_metadata = NULL;
-    }
-    if (s->fvd_data) {
-        bdrv_delete(s->fvd_data);
-        s->fvd_data = NULL;
+    if (s->free_chunks) {
+        my_qemu_free(s->free_chunks);
+        s->free_chunks = NULL;
     }
 
     if (s->add_storage_cmd) {
         my_qemu_free(s->add_storage_cmd);
         s->add_storage_cmd = NULL;
     }
-
-    if (s->leaked_chunks) {
-        my_qemu_free(s->leaked_chunks);
-        s->leaked_chunks = NULL;
-    }
 #ifdef FVD_DEBUG
     dump_resource_summary(s);
 #endif
@@ -188,7 +167,8 @@  static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
          * This also covers the case of no base image. */
 
         if (!s->table) {
-            return bdrv_is_allocated(s->fvd_data, s->data_offset + sector_num,
+            return bdrv_is_allocated(bs->file,
+                                     s->data_offset + sector_num,
                                      nb_sectors, pnum);
         }
 
@@ -248,41 +228,48 @@  static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
 
 static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi)
 {
-    BDRVFvdState *s = bs->opaque;
     FvdHeader header;
 
-    if (read_fvd_header(s, &header) < 0) {
+    if (read_fvd_header(bs, &header) < 0) {
         return -1;
     }
 
     printf("========= Begin of FVD specific information ==================\n");
     printf("magic\t\t\t\t\t\t%0X\n", header.magic);
     printf("header_size\t\t\t\t\t%d\n", header.header_size);
+    printf("header_padding_size\t\t\t\t%d\n", header.header_padding_size);
     printf("create_version\t\t\t\t\t%d\n", header.create_version);
     printf("last_open_version\t\t\t\t%d\n", header.last_open_version);
-    printf("virtual_disk_size (bytes)\t\t\t%" PRId64 "\n",
+    printf("virtual_disk_size (bytes)\t\t\t%"PRId64"\n",
            header.virtual_disk_size);
-    printf("disk_metadata_size (bytes)\t\t\t%" PRId64 "\n", header.data_offset);
-    if (header.data_file[0]) {
-        printf("data_file\t\t\t\t\t%s\n", header.data_file);
-    }
-    if (header.data_file_fmt[0]) {
-        printf("data_file_fmt\t\t\t\t\t%s\n", header.data_file_fmt);
-    }
 
-    if (header.table_offset > 0) {
-        printf("table_size (bytes)\t\t\t\t%" PRId64 "\n", header.table_size);
-        printf("avail_storage (bytes)\t\t\t\t%" PRId64 "\n",
-               s->avail_storage * 512);
-        printf("chunk_size (bytes)\t\t\t\t%" PRId64 "\n", header.chunk_size);
-        printf("used_chunks (bytes)\t\t\t\t%" PRId64 "\n",
-               s->used_storage * 512);
-        printf("storage_grow_unit (bytes)\t\t\t%" PRId64 "\n",
+    if (header.table_offset == 0) {
+        /* Not a compact image. */
+        uint64_t metadata_size = header.header_size + header.header_padding_size
+            + header.bitmap_size + header.journal_size;
+        printf("disk_metadata_size (bytes)\t\t\t%"PRId64"\n", metadata_size);
+    } else {
+        printf("chunk_size (bytes)\t\t\t\t%"PRId64"\n", header.chunk_size);
+        printf("storage_grow_unit (bytes)\t\t\t%"PRId64"\n",
                header.storage_grow_unit);
-        printf("table_offset (bytes)\t\t\t\t%" PRId64 "\n",
+        printf("table_offset (bytes)\t\t\t\t%"PRId64"\n",
                header.table_offset);
-        printf("table_size (bytes)\t\t\t\t%" PRId64 "\n", s->table_size);
-        printf("chunks_relocated\t\t\t\t%s\n", BOOL(s->chunks_relocated));
+        printf("table_size (bytes)\t\t\t\t%"PRId64"\n", header.table_size);
+
+        printf("num_snapshots\t\t\t\t\t%u\n", header.num_snapshots);
+        if (header.refcount_offset > 0) {
+            printf("refcount_offset (bytes)\t\t\t\t%"PRId64"\n",
+                   header.refcount_offset);
+            printf("refcount_size (bytes)\t\t\t\t%"PRId64"\n",
+                   header.refcount_size);
+        }
+
+        if (header.vm_state_offset > 0) {
+            printf("vm_state_offset (bytes)\t\t\t\t%"PRId64"\n",
+                   header.vm_state_offset);
+            printf("vm_state_size (bytes)\t\t\t\t%"PRId64"\n",
+                   header.vm_state_size);
+        }
 
         if (header.add_storage_cmd[0] != 0) {
             printf("add_storage_cmd\t\t\t\t\t%s\n", header.add_storage_cmd);
@@ -291,13 +278,13 @@  static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi)
 
     printf("clean_shutdown\t\t\t\t\t%s\n", BOOL(header.clean_shutdown));
     if (header.journal_size > 0) {
-        printf("journal_offset\t\t\t\t\t%" PRId64 "\n", header.journal_offset);
-        printf("journal_size\t\t\t\t\t%" PRId64 "\n", header.journal_size);
-        printf("stable_journal_epoch\t\t\t\t%" PRId64 "\n",
+        printf("journal_offset\t\t\t\t\t%"PRId64"\n", header.journal_offset);
+        printf("journal_size\t\t\t\t\t%"PRId64"\n", header.journal_size);
+        printf("stable_journal_epoch\t\t\t\t%"PRId64"\n",
                header.stable_journal_epoch);
-        printf("journal_buf_size (bytes)\t\t\t%" PRId64 "\n",
+        printf("journal_buf_size (bytes)\t\t\t%"PRId64"\n",
                header.journal_buf_size);
-        printf("journal_clean_buf_period (ms)\t\t\t%" PRId64 "\n",
+        printf("journal_clean_buf_period (ms)\t\t\t%"PRId64"\n",
                header.journal_clean_buf_period);
     }
 
@@ -308,17 +295,17 @@  static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi)
         if (header.base_img_fmt[0]) {
             printf("base_img_fmt\t\t\t\t\t%s\n", header.base_img_fmt);
         }
-        printf("base_img_size (bytes)\t\t\t\t%" PRId64 "\n",
+        printf("base_img_size (bytes)\t\t\t\t%"PRId64"\n",
                header.base_img_size);
-        printf("bitmap_offset (bytes)\t\t\t\t%" PRId64 "\n",
+        printf("bitmap_offset (bytes)\t\t\t\t%"PRId64"\n",
                header.bitmap_offset);
-        printf("bitmap_size (bytes)\t\t\t\t%" PRId64 "\n", header.bitmap_size);
-        printf("block_size\t\t\t\t\t%" PRId64 "\n", header.block_size);
+        printf("bitmap_size (bytes)\t\t\t\t%"PRId64"\n", header.bitmap_size);
+        printf("block_size\t\t\t\t\t%"PRId64"\n", header.block_size);
         printf("copy_on_read\t\t\t\t\t%s\n", BOOL(header.copy_on_read));
-        printf("max_outstanding_copy_on_read_data (bytes)\t%" PRId64 "\n",
+        printf("max_outstanding_copy_on_read_data (bytes)\t%"PRId64"\n",
                header.max_outstanding_copy_on_read_data);
         printf("need_zero_init\t\t\t\t\t%s\n", BOOL(header.need_zero_init));
-        printf("prefetch_start_delay (sec)\t\t\t%" PRId64 "\n",
+        printf("prefetch_start_delay (sec)\t\t\t%"PRId64"\n",
                header.prefetch_start_delay);
         printf("num_prefetch_slots\t\t\t\t%d\n", header.num_prefetch_slots);
         printf("bytes_per_prefetch\t\t\t\t%" PRIu64 "\n",
@@ -350,10 +337,87 @@  static int fvd_has_zero_init(BlockDriverState * bs)
 {
     BDRVFvdState *s = bs->opaque;
 
-    /* For a non-compact image, chunks_relocated is always false. For a
-     * compact image with chunks_relocated=true, it can no longer guarantee
-     * zero init even if the file system does that. This is because a partialy
-     * written chunk X may be relocated to a location previously used by
-     * another chunk Y and some garbage data are left there by Y. */
-    return s->chunks_relocated ? 0 : bdrv_has_zero_init(s->fvd_data);
+    if (!s->table) { /* Not a compact image. */
+        return bdrv_has_zero_init(bs->file);
+    }
+
+    if (!s->compact_image_guarantee_zero_init) {
+        /* Some chunks might have been reused for different virtual block
+         * addresses. When a chunk is reused, the garbage data left in
+         * previous use cannot guarantee zero_init even if the underlying file
+         * system provides zero_init. */
+        return 0;
+    }
+
+    return bdrv_has_zero_init(bs->file);
+}
+
+static int fvd_flush(BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    QDEBUG("fvd_flush() invoked\n");
+    /* Once metadata_err_prohibit_write is set, fail the flush with -EIO. */
+    if (s->metadata_err_prohibit_write) {
+        return -EIO;
+    }
+
+    if (!bs->file->enable_write_cache) {
+        /* No need to flush since it uses O_DSYNC. */
+        return 0;
+    }
+
+    if (s->use_bjnl) {
+        /* Buffered journal: flush its pending updates synchronously. */
+        return bjnl_sync_flush(bs);
+    } else {
+        /* Simple case: unbuffered journal, so just flush the image file. */
+        return bdrv_flush(bs->file);
+    }
+}
+
+static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
+                                       BlockDriverCompletionFunc * cb,
+                                       void *opaque)
+{
+    BDRVFvdState *s = bs->opaque;
+    BlockDriverAIOCB * pacb;
+    FvdAIOCB  *acb;
+
+    QDEBUG("fvd_aio_flush() invoked\n");
+    /* Once metadata_err_prohibit_write is set, fail by returning NULL. */
+    if (s->metadata_err_prohibit_write) {
+        return NULL;
+    }
+
+    if (!bs->file->enable_write_cache) {
+        /* No need to flush since it uses O_DSYNC. Use a QEMUBH to invoke the
+         * callback. */
+
+        if (!(acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque))) {
+            return NULL;
+        }
+
+        acb->type = OP_WRAPPER;
+        acb->cancel_in_progress = false;
+        acb->wrapper.bh = qemu_bh_new(aio_wrapper_bh, acb);
+        qemu_bh_schedule(acb->wrapper.bh);
+        return &acb->common;
+    }
+
+    if (s->use_bjnl) {
+        if (bjnl_clean_buf_on_aio_flush(bs, cb, opaque, &pacb)) {
+            /* Return now while waiting for the journal buffer to be cleaned. */
+            return pacb;
+        }
+    }
+
+    /* No buffered journal data to wait for; flush the file right away. */
+    QDEBUG("FLUSH: start now as no buffered journal data");
+    return bdrv_aio_flush(bs->file, cb, opaque);
+}
+
+static int fvd_check(BlockDriverState *bs, BdrvCheckResult *result)
+{
+    return fvd_check_refcount(bs, result); /* delegates entirely */
 }
diff --git a/block/fvd-open.c b/block/fvd-open.c
index 8caf8d3..1ea750f 100644
--- a/block/fvd-open.c
+++ b/block/fvd-open.c
@@ -12,15 +12,15 @@ 
  */
 
 static void init_prefetch_timer(BlockDriverState * bs, BDRVFvdState * s);
-static int init_data_file(BDRVFvdState * s, FvdHeader * header, int flags);
+static int init_data(BlockDriverState * bs, FvdHeader * header, int flags);
 static int init_bitmap(BlockDriverState * bs, BDRVFvdState * s,
                        FvdHeader * header, const char *const filename);
-static int load_table(BDRVFvdState * s, FvdHeader * header,
+static int load_table(BlockDriverState * bs, FvdHeader * header,
                       const char *const filename);
 static int init_journal(int read_only, BlockDriverState * bs,
                         FvdHeader * header);
-static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
-                              const char *const filename);
+static int init_compact_image(bool read_only, BlockDriverState * bs,
+                              FvdHeader * header, const char *const filename);
 
 static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
 {
@@ -30,6 +30,9 @@  static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
     BlockDriver *drv;
     int i;
 
+    /* Zero memory as fvd_open() may be invoked multiple times by FVD itself. */
+    memset(s, 0, sizeof(BDRVFvdState));
+
     const char *protocol = strchr(filename, ':');
     if (protocol) {
         drv = bdrv_find_protocol(filename);
@@ -43,8 +46,8 @@  static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
         }
     }
 
-    s->fvd_metadata = bdrv_new("");
-    ret = bdrv_open(s->fvd_metadata, filename, flags, drv);
+    bs->file = bdrv_new("");
+    ret = bdrv_open(bs->file, filename, flags, drv);
     if (ret < 0) {
         fprintf(stderr, "Failed to open %s\n", filename);
         return ret;
@@ -63,7 +66,7 @@  static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
     s->total_copy_on_read_data = s->total_prefetch_data = 0;
 #endif
 
-    if (bdrv_pread(s->fvd_metadata, 0, &header, sizeof(header)) !=
+    if (bdrv_pread(bs->file, 0, &header, sizeof(header)) !=
         sizeof(header)) {
         fprintf(stderr, "Failed to read the header of %s\n", filename);
         ret = -EIO;
@@ -97,11 +100,10 @@  static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
     }
 
     /* Initialize the fields of BDRVFvdState. */
-    s->chunks_relocated = header.chunks_relocated;
     s->dirty_image = false;
     s->metadata_err_prohibit_write = false;
     s->block_size = header.block_size / 512;
-    s->bitmap_size = header.bitmap_size;
+
     s->prefetch_timer = NULL;
     s->sectors_per_prefetch = (header.bytes_per_prefetch + 511) / 512;
     s->prefetch_throttle_time = header.prefetch_throttle_time;
@@ -148,7 +150,7 @@  static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
     s->base_img_sectors = header.base_img_size / 512;
     bs->total_sectors = s->virtual_disk_size / 512;
 
-    if ((ret = init_data_file(s, &header, flags))) {
+    if ((ret = init_data(bs, &header, flags))) {
         goto fail;
     }
 
@@ -156,7 +158,7 @@  static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
         goto fail;
     }
 
-    if ((ret = load_table(s, &header, filename))) {
+    if ((ret = load_table(bs, &header, filename))) {
         goto fail;
     }
 
@@ -166,13 +168,13 @@  static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
 
     /* This must be done after init_journal() because it may use metadata
      * recovered from the journal. */
-    if ((ret = init_compact_image(s, &header, filename))) {
+    if ((ret = init_compact_image(read_only, bs, &header, filename))) {
         goto fail;
     }
 
     if (!read_only) {
         /* This flag will be cleaned when the image is shut down gracefully. */
-        update_clean_shutdown_flag(s, false);
+        update_clean_shutdown_flag(bs, false);
         init_prefetch_timer(bs, s);
     }
 
@@ -201,20 +203,74 @@  fail:
     return ret;
 }
 
-static int load_table(BDRVFvdState * s, FvdHeader * header,
+static void check_free_chunks_size(uint32_t chunk,
+                                    unsigned long **free_chunks,
+                                    uint32_t *max_chunk)
+{
+    ASSERT(*max_chunk % BITS_PER_LONG == 0);
+    if (chunk > *max_chunk) {
+        /* Grow the free_chunks bitmap and zero the newly added tail.
+         * Add 8192 extra chunks to avoid frequent realloc. */
+        uint32_t new_max = ROUND_UP(chunk + 8192, BITS_PER_LONG);
+
+        *free_chunks = my_qemu_realloc(*free_chunks, *max_chunk / BITS_PER_BYTE,
+                                       new_max / BITS_PER_BYTE);
+        /* Clear only the bytes past the old end of the bitmap. */
+        uint8_t *tail = ((uint8_t*)*free_chunks) + *max_chunk / BITS_PER_BYTE;
+        memset(tail, 0, (new_max - *max_chunk) / BITS_PER_BYTE);
+        *max_chunk = new_max;
+    }
+}
+
+static inline void mark_used_chunks(uint64_t offset, uint64_t size,
+                                    uint64_t chunk_size,
+                                    unsigned long **free_chunks,
+                                    uint32_t *max_chunk)
+{
+    uint32_t chunk, begin = offset / chunk_size;
+    uint32_t end = begin + size / chunk_size;
+
+    ASSERT(offset % chunk_size == 0 && size % chunk_size == 0 &&
+           *max_chunk % BITS_PER_LONG == 0);
+    /* Grow the bitmap first if end exceeds its current capacity. */
+    check_free_chunks_size(end, free_chunks, max_chunk);
+    /* Set the bits of chunks [begin, end); each must still be clear. */
+    for (chunk = begin; chunk < end; chunk++) {
+        ASSERT(!test_bit(chunk, *free_chunks));
+        set_bit(chunk, *free_chunks);
+    }
+}
+
+static inline void mark_used_chunk(uint32_t chunk,
+                                   uint64_t chunk_size, /* unused here */
+                                   unsigned long **free_chunks,
+                                   uint32_t *max_chunk,
+                                   bool allow_duplicate)
+{
+    check_free_chunks_size(chunk + 1, free_chunks, max_chunk);
+    ASSERT(allow_duplicate || !test_bit(chunk, *free_chunks));
+    set_bit(chunk, *free_chunks); /* a set bit marks the chunk as used */
+}
+
+static int load_table(BlockDriverState * bs, FvdHeader * header,
                       const char *const filename)
 {
+    BDRVFvdState *s = bs->opaque;
+
     if (header->table_offset <= 0) {
         return 0;       /* Not a compact image and no table. */
     }
 
     /* Initialize the table. */
     s->table_offset = header->table_offset / 512;
-    s->table_size = header->table_size;
+    s->table_size = calc_table_size(header->virtual_disk_size,
+                                    header->chunk_size);
+    s->table_size = ROUND_UP(s->table_size, 512);
+    ASSERT(s->table_size <= header->table_size);
     s->chunk_size = header->chunk_size / 512;
-    s->table = my_qemu_blockalign(s->fvd_metadata, s->table_size);
+    s->table = my_qemu_blockalign(bs->file, s->table_size);
 
-    if (bdrv_pread(s->fvd_metadata, header->table_offset, s->table,
+    if (bdrv_pread(bs->file, header->table_offset, s->table,
                    (int)s->table_size) != (int)s->table_size) {
         fprintf(stderr, "Failed to read the table of %s\n", filename);
         return -EIO;
@@ -223,12 +279,14 @@  static int load_table(BDRVFvdState * s, FvdHeader * header,
     return 0;
 }
 
-static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
-                              const char *const filename)
+static int init_compact_image(bool read_only, BlockDriverState * bs,
+                              FvdHeader * header, const char *const filename)
 {
-    s->leaked_chunks = NULL;
-    s->num_leaked_chunks = 0;
-    s->next_avail_leaked_chunk = 0;
+    BDRVFvdState *s = bs->opaque;
+    int ret = 0;
+    uint32_t i;
+    uint16_t *refcount = NULL;
+    unsigned long *free_chunks = NULL;
 
     if (header->table_offset <= 0) {
         /* Not a compact image. */
@@ -236,77 +294,16 @@  static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
         return 0;
     }
 
-    /* Scan the table to find the max used chunk and leaked chunks. */
-    uint32_t i;
-    uint32_t max_chunk = 0;
-    uint32_t table_entries = ROUND_UP(header->virtual_disk_size,
-                                      header->chunk_size) / header->chunk_size;
-    uint8_t *used_chunks = my_qemu_mallocz(table_entries);
-    for (i = 0; i < table_entries; i++) {
-        if (!IS_EMPTY(s->table[i])) {
-            uint32_t id = READ_TABLE(s->table[i]);
-            if (id >= max_chunk) {
-                max_chunk = id + 1;
-            }
-            if (used_chunks[id]) {
-                fprintf(stderr, "ERROR: corrupted image with multiple "
-                        "virtual chunks mapped to physical chunk %u\n", id);
-                my_qemu_free(used_chunks);
-                return -EIO;
-            }
-            used_chunks[id] = true;
-        }
-    }
-
-    /* Count the number of leaked chunks. */
-    uint32_t num_leaked_chunks = 0;
-    for (i = 0; i < max_chunk; i++) {
-        if (!used_chunks[i]) {
-            num_leaked_chunks++;
-        }
-    }
-    QDEBUG("leaked_chunks=%u max_chunk=%u\n", num_leaked_chunks, max_chunk);
-
-    /* Record leaked chunks, which will be used later. */
-    if (num_leaked_chunks > 0) {
-        s->num_leaked_chunks = num_leaked_chunks;
-        s->leaked_chunks = my_qemu_malloc(sizeof(uint32_t) * num_leaked_chunks);
-        num_leaked_chunks = 0;
-        for (i = 0; i < max_chunk; i++) {
-            if (!used_chunks[i]) {
-                s->leaked_chunks[num_leaked_chunks++] = i;
-                QDEBUG("Recover leaked physical chunk %u\n", i);
-            }
-        }
-    }
-    s->used_storage = max_chunk * s->chunk_size;
-    s->storage_grow_unit = header->storage_grow_unit / 512;
-    my_qemu_free(used_chunks);
-
     /* Check if the image is directly stored on a raw device, including
      * logical volume. If so, figure out the size of the device. */
     struct stat stat_buf;
-    if (stat(filename, &stat_buf) != 0) {
+    if ((ret = stat(filename, &stat_buf)) < 0) {
         fprintf(stderr, "Failed to stat() %s\n", filename);
-        return -EIO;
-    }
-
-    /* Check how much storage space is already allocated. */
-    int64_t size = bdrv_getlength(s->fvd_data);
-    if (size < 0) {
-        fprintf(stderr, "Failed in bdrv_getlength(%s)\n", filename);
-        return -EIO;
+        goto done;
     }
 
+    /* Initialize the command to grow storage space. */
     if (S_ISBLK(stat_buf.st_mode) || S_ISCHR(stat_buf.st_mode)) {
-        const int64_t min_size = (s->data_offset + s->used_storage) * 512;
-        if (size < min_size) {
-            fprintf(stderr, "The size of device %s is not even big enough to "
-                    "store already allocated data.\n", filename);
-            return -EIO;
-        }
-
-        /* Initialize the command to grow storage space. */
         char cmd[2048];
         if (header->add_storage_cmd[0] == 0) {
             s->add_storage_cmd = NULL;
@@ -326,15 +323,15 @@  static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
                  *     3. On Ubuntu, /bin/sh is linked to /bin/dash, which
                  *     does not support ">&" for stdout and stderr
                  *     redirection. */
-                snprintf(cmd, sizeof(cmd) - 1, "/sbin/lvextend -L+%" PRIu64
-                         "B %s >/dev/null 2>/dev/null",
-                         header->storage_grow_unit,
-                         header->data_file[0] ? header->data_file : filename);
+                snprintf(cmd, sizeof(cmd) - 1, "/sbin/lvextend -L+%lluB %s "
+                         ">/dev/null 2>/dev/null",
+                         (unsigned long long)header->storage_grow_unit,
+                         filename);
             } else {
-                snprintf(cmd, sizeof(cmd) - 1, "%s %" PRIu64
-                         " %s >/dev/null 2>/dev/null",
-                         header->add_storage_cmd, header->storage_grow_unit,
-                         header->data_file[0] ? header->data_file : filename);
+                snprintf(cmd, sizeof(cmd) - 1, "%s %llu %s "
+                         ">/dev/null 2>/dev/null", header->add_storage_cmd,
+                         (unsigned long long)header->storage_grow_unit,
+                         filename);
             }
             int len = strlen(cmd);
             s->add_storage_cmd = my_qemu_malloc(len + 1);
@@ -342,48 +339,169 @@  static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
         }
     }
 
-    s->avail_storage = size / 512 - s->data_offset;
-    s->fvd_data->growable = true;
+    /* Check how much storage space is already allocated. */
+    int64_t size = bdrv_getlength(bs->file);
+    if (size < 0) {
+        fprintf(stderr, "Failed in bdrv_getlength(%s)\n", filename);
+        ret = size;
+        goto done;
+    }
+
+    s->avail_storage = size / 512;
+    bs->file->growable = true;
     s->data_region_prepared = true;
+    s->storage_grow_unit = header->storage_grow_unit / 512;
 
-    return 0;
-}
+    const uint64_t min_metadata_size = header->header_size +
+        header->header_padding_size + header->bitmap_size +
+        header->table_size + header->journal_size;
+    if (size == min_metadata_size) {
+        s->compact_image_guarantee_zero_init = true; /* See fvd_has_zero_init.*/
+    }
 
-static int init_data_file(BDRVFvdState * s, FvdHeader * header, int flags)
-{
-    int ret;
+    /* Identify free chunks that can be used to store new data.
+     * First estimate the maximum number of chunks already allocated.
+     * The estimation need not be accurate. */
+    uint32_t max_chunk = DIV_ROUND_UP(size, header->chunk_size);
+    max_chunk = ROUND_UP(max_chunk, BITS_PER_LONG);
 
-    if (header->data_file[0]) {
-        /* Open a separate data file. */
-        s->data_offset = 0;
-        s->fvd_data = bdrv_new("");
-        if (!s->fvd_data) {
-            fprintf(stderr, "Failed to create a new block device driver.\n");
-            return -EIO;
+    /* Create a bitmap, where 1 means a chunk is in use and 0 means free. */
+    free_chunks = my_qemu_mallocz(max_chunk / BITS_PER_BYTE);
+
+    /* Mark chunks occupied by header. */
+    mark_used_chunks(0, header->header_size + header->header_padding_size,
+                     header->chunk_size, &free_chunks, &max_chunk);
+
+    /* Mark chunks occupied by bitmap. */
+    if (header->bitmap_offset > 0) {
+        mark_used_chunks(header->bitmap_offset, header->bitmap_size,
+                         header->chunk_size, &free_chunks, &max_chunk);
+    }
+
+    /* Mark chunks occupied by table. */
+    mark_used_chunks(header->table_offset, header->table_size,
+                     header->chunk_size, &free_chunks, &max_chunk);
+
+    /* Mark chunks occupied by refcount. */
+    if (header->refcount_offset > 0) {
+        mark_used_chunks(header->refcount_offset, header->refcount_size,
+                         header->chunk_size, &free_chunks, &max_chunk);
+    }
+
+    /* Mark chunks occupied by the snapshot list. */
+    if (header->snapshot_offset > 0) {
+        mark_used_chunks(header->snapshot_offset, header->snapshot_size,
+                         header->chunk_size, &free_chunks, &max_chunk);
+    }
+
+    /* Mark chunks occupied by journal. */
+    if (header->journal_offset > 0) {
+        mark_used_chunks(header->journal_offset, header->journal_size,
+                         header->chunk_size, &free_chunks, &max_chunk);
+    }
+
+    /* Mark chunks used by data in snapshots, as shown in refcount. */
+    if (header->refcount_offset > 0) {
+        ASSERT(header->block_size == header->chunk_size &&
+               header->refcount_size % header->chunk_size == 0);
+        refcount = my_qemu_blockalign(bs->file, header->refcount_size);
+        ret = bdrv_read(bs->file, header->refcount_offset / 512,
+                            (uint8_t*)refcount, header->refcount_size / 512);
+        if (ret < 0) {
+            goto done;
         }
 
-        if (header->data_file_fmt[0] == 0) {
-            ret = bdrv_open(s->fvd_data, header->data_file, flags, NULL);
-        } else {
-            BlockDriver *data_drv = bdrv_find_format(header->data_file_fmt);
-            if (!data_drv) {
-                fprintf(stderr, "Failed to find driver for image format "
-                        "'%s' of data file %s\n",
-                        header->data_file_fmt, header->data_file);
-                return -EINVAL;
+        for (i = 0; i < header->refcount_size / sizeof(uint16_t); i++) {
+            cpu_to_le16s(refcount + i);
+            if (refcount[i] != 0) {
+                ASSERT(!s->fresh_bitmap ||
+                    !fresh_bitmap_show_sector_in_base_img(i *
+                                    header->block_size / 512, s));
+                mark_used_chunk(i, header->chunk_size, &free_chunks,
+                                &max_chunk, false);
             }
-            ret = bdrv_open(s->fvd_data, header->data_file, flags, data_drv);
         }
-        if (ret != 0) {
-            fprintf(stderr, "Failed to open data file %s\n", header->data_file);
-            return -EIO;
+    }
+
+    /* Go through every chunk in the table. */
+    uint32_t refcount_chunks = header->refcount_size / sizeof(uint16_t);
+    const int n = DIV_ROUND_UP(header->virtual_disk_size, header->chunk_size);
+    for (i = 0; i < n; i++) {
+        if (!IS_EMPTY(s->table[i])) {
+            uint32_t chunk = READ_TABLE(s->table[i]);
+
+            /* Mark chunk used for image data. */
+            mark_used_chunk(chunk, header->chunk_size, &free_chunks,
+                            &max_chunk, true);
+
+            /* Check if the chunk is shared with any snapshot. Writing to a
+             * shared chunk needs to perform copy-on-write. */
+            if (chunk < refcount_chunks && refcount[chunk] > 0) {
+                chunk |= SHARED_TABLE; /* Mark the entry as shared. */
+                s->share_chunk_with_snapshot = true;
+            }
+
+            WRITE_TABLE(s->table[i], chunk);
         }
+    }
+
+    /* Identify chunks shared with snapshots. */
+    if (header->refcount_offset > 0) {
     } else {
-        s->data_offset = header->data_offset / 512;     /* In sectors. */
-        s->fvd_data = s->fvd_metadata;
+        for (i = 0; i < n; i++) {
+        }
+    }
+
+    uint32_t first_free = find_first_zero_bit(free_chunks, max_chunk);
+    if (first_free >= max_chunk) {
+        /* All chunks are used. */
+        s->used_storage = max_chunk * header->chunk_size / 512; /* in sectors */
+        goto done;
+    }
+
+    uint32_t last_used = find_last_bit(free_chunks, max_chunk);
+    ASSERT(last_used < max_chunk);
+    if (last_used < first_free) {
+        /* All chunks before first_free are used and no chunks after
+         * first_free are used. */
+        ASSERT(last_used + 1 == first_free);
+        s->used_storage = first_free * header->chunk_size / 512;
+        goto done;
+    }
+
+    /* Use free_chunks to track chunks in [0..last_used]. Use s->used_storage
+     * to track the frontier of unused chunks. */
+    last_used = ROUND_UP(last_used + 1, BITS_PER_LONG);
+    s->used_storage = last_used * header->chunk_size / 512;
+    s->free_chunks = my_qemu_realloc(free_chunks, max_chunk / BITS_PER_BYTE,
+                                     last_used / BITS_PER_BYTE);
+    s->free_chunks_size = last_used;
+    s->next_free_chunk = first_free;
+
+done:
+    if (!s->free_chunks && free_chunks) {
+        my_qemu_free(free_chunks);
+    }
+    if (refcount) {
+        my_qemu_vfree(refcount);
+    }
+
+    return ret;
+}
+
+static int init_data(BlockDriverState * bs, FvdHeader * header, int flags)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (header->table_offset == 0) {
+        /* Where data start in a non-compact image. */
+        uint64_t metadata_size = header->header_size +
+            header->header_padding_size + header->bitmap_size +
+            header->table_size + header->journal_size;
+        s->data_offset = metadata_size / 512;     /* In sectors. */
     }
 
-    if (header->need_zero_init && !bdrv_has_zero_init(s->fvd_data)) {
+    if (header->need_zero_init && !bdrv_has_zero_init(bs->file)) {
         if (IN_QEMU_TOOL) {
             /* Only give a warning to allow 'qemu-img update' to modify
              * need_zero_init if the user manually zero-init the device. */
@@ -414,14 +532,21 @@  static int init_bitmap(BlockDriverState * bs, BDRVFvdState * s,
             bs->backing_file[0] = 0;
         }
     } else {
+        /* The actual bitmap size can be smaller than the space reserved by
+         * header.bitmap_size. */
+        int bitmap_size = calc_bitmap_size(header->base_img_size,
+                                           header->block_size);
+        s->bitmap_size = ROUND_UP(bitmap_size, 512);
+        ASSERT(s->bitmap_size <= header->bitmap_size);
+
         ASSERT(header->base_img[0] != 0);
         pstrcpy(bs->backing_file, 1024, header->base_img);
         pstrcpy(bs->backing_format, 16, header->base_img_fmt);
 
         /* This will be enabled in init_prefetch() after a timer expires. */
         s->prefetch_state = PREFETCH_STATE_DISABLED;
-        s->stale_bitmap = my_qemu_blockalign(s->fvd_metadata, s->bitmap_size);
-        if (bdrv_pread(s->fvd_metadata, header->bitmap_offset,
+        s->stale_bitmap = my_qemu_blockalign(bs->file, s->bitmap_size);
+        if (bdrv_pread(bs->file, header->bitmap_offset,
                        s->stale_bitmap, s->bitmap_size) != s->bitmap_size) {
             fprintf(stderr, "Failed to read the bitmap of %s.\n", filename);
             return -EIO;
@@ -431,7 +556,7 @@  static int init_bitmap(BlockDriverState * bs, BDRVFvdState * s,
                                 s->prefetch_start_delay > 0)) {
             /* Use two bitmaps only if copy_on_read or prefetching is enabled.
              * See Section 3.3.4 of the FVD-cow paper. */
-            s->fresh_bitmap = my_qemu_blockalign(s->fvd_metadata,
+            s->fresh_bitmap = my_qemu_blockalign(bs->file,
                                                  s->bitmap_size);
             memcpy(s->fresh_bitmap, s->stale_bitmap, s->bitmap_size);
         } else {
@@ -445,7 +570,7 @@  static int init_bitmap(BlockDriverState * bs, BDRVFvdState * s,
 static void init_prefetch_timer(BlockDriverState * bs, BDRVFvdState * s)
 {
     if (IN_QEMU_TOOL) {
-        return;
+        return; /* No prefetching when running in a qemu tool. */
     }
 
     if (s->prefetch_state == PREFETCH_STATE_FINISHED ||
diff --git a/block/fvd-prefetch.c b/block/fvd-prefetch.c
index b8be98c..7e83d39 100644
--- a/block/fvd-prefetch.c
+++ b/block/fvd-prefetch.c
@@ -25,7 +25,7 @@  void fvd_init_prefetch(void *opaque)
     QDEBUG("Start prefetching\n");
 
     if (!s->data_region_prepared) {
-        init_data_region(s);
+        init_data_region(bs);
     }
 
     s->prefetch_acb = my_qemu_malloc(sizeof(FvdAIOCB *)*s->num_prefetch_slots);
diff --git a/block/fvd-read.c b/block/fvd-read.c
index b18fdf2..5f58927 100644
--- a/block/fvd-read.c
+++ b/block/fvd-read.c
@@ -19,10 +19,6 @@  static inline void calc_read_region(BDRVFvdState * s, int64_t sector_num,
                     int64_t * p_last_sec_in_fvd,
                     int64_t * p_first_sec_in_backing,
                     int64_t * p_last_sec_in_backing);
-static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
-                    BlockDriverState * bs, int64_t sector_num,
-                    QEMUIOVector * orig_qiov, int nb_sectors,
-                    BlockDriverCompletionFunc * cb, void *opaque);
 
 static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
                                        int64_t sector_num, QEMUIOVector * qiov,
@@ -34,7 +30,7 @@  static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
     TRACE_REQUEST(false, sector_num, nb_sectors);
 
     if (!s->data_region_prepared) {
-        init_data_region(s);
+        init_data_region(bs);
     }
 
     if (s->prefetch_state == PREFETCH_STATE_FINISHED
@@ -156,7 +152,7 @@  static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
         int buf_size = acb->read.read_fvd.iov.iov_len +
             ROUND_UP(bitmap_bytes, 512);
         acb->read.read_fvd.iov.iov_base =
-            my_qemu_blockalign(s->fvd_data, buf_size);
+            my_qemu_blockalign(bs->file, buf_size);
         uint8_t *saved_bitmap = ((uint8_t *) acb->read.read_fvd.iov.iov_base) +
             acb->read.read_fvd.iov.iov_len;
         memcpy(saved_bitmap, s->fresh_bitmap + b, bitmap_bytes);
diff --git a/block/fvd-snapshot.c b/block/fvd-snapshot.c
new file mode 100644
index 0000000..123b9ca
--- /dev/null
+++ b/block/fvd-snapshot.c
@@ -0,0 +1,878 @@ 
+/*
+ * QEMU Fast Virtual Disk Format Snapshot
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*
+ * Adjust the reference count of every chunk in one contiguous region.
+ *
+ * refcount      - in-memory refcount array, one uint16_t entry per chunk.
+ * refcount_size - size of that array in bytes.
+ * chunk_size    - chunk size in bytes.
+ * offset, size  - start and length of the region in bytes; both are asserted
+ *                 to be multiples of chunk_size.
+ * change        - value added to each counter (callers in this file pass
+ *                 +1 or -1).
+ *
+ * After each update the counter is asserted not to read 0xFFFF.
+ */
+static inline void update_refcount_region(uint16_t *refcount,
+                            uint64_t refcount_size, uint64_t chunk_size,
+                            uint64_t offset, uint64_t size, int change)
+{
+    const uint64_t first = offset / chunk_size;
+    const uint64_t limit = first + size / chunk_size;
+    uint64_t c = first;
+
+    ASSERT(offset % chunk_size == 0 && size % chunk_size == 0 &&
+           refcount_size % chunk_size == 0 && chunk_size % 512 == 0 &&
+           limit * sizeof(uint16_t) <= refcount_size);
+
+    while (c < limit) {
+        refcount[c] += change;
+        ASSERT(refcount[c] != (uint16_t)0xFFFF);
+        c++;
+    }
+}
+
+/*
+ * Adjust the reference count of every chunk referenced by a chunk mapping
+ * table.
+ *
+ * refcount      - in-memory refcount array, one uint16_t entry per chunk.
+ * refcount_size - size of that array in bytes.
+ * table         - chunk mapping table with n entries.
+ * n             - number of entries in table.
+ * change        - value added to the counter of each referenced chunk
+ *                 (callers in this file pass +1 or -1).
+ *
+ * Empty table entries are skipped. The range check is applied only to
+ * non-empty entries, so that an empty entry is never treated as an
+ * out-of-range chunk index.
+ */
+static inline void update_refcount_table(uint16_t *refcount,
+                            uint64_t refcount_size, uint32_t *table,
+                            uint32_t n, int change)
+{
+    uint32_t i;
+
+    for (i = 0; i < n; i++) {
+        uint32_t chunk = READ_TABLE(table[i]);
+
+        if (IS_EMPTY(chunk)) {
+            continue;
+        }
+
+        ASSERT(chunk < refcount_size / sizeof(uint16_t));
+        refcount[chunk] += change;
+        ASSERT(refcount[chunk] != (uint16_t)0xFFFF);
+    }
+}
+
+/*
+ * Debug helper: log every non-zero entry of the refcount array through
+ * QDEBUG. The body is empty unless ENABLE_QDEBUG is defined.
+ *
+ * refcount      - in-memory refcount array of uint16_t entries.
+ * refcount_size - size of that array in bytes.
+ */
+static inline void PRINT_REFCOUNT(uint16_t *refcount, uint64_t refcount_size)
+{
+#ifdef ENABLE_QDEBUG
+    int idx, n_entries = refcount_size / sizeof(uint16_t);
+
+    QDEBUG("Print refcount:\n");
+    for (idx = 0; idx < n_entries; idx++) {
+        if (refcount[idx] == 0) {
+            continue;
+        }
+        QDEBUG("\trefcount[%d]=%u\n", idx, refcount[idx]);
+    }
+#endif
+}
+
+/*
+ * Look up a snapshot by its ID string.
+ *
+ * sns - array of n snapshot records.
+ * id  - ID string to match exactly against each record's id_str.
+ *
+ * Returns the index of the first matching record, or -ENOENT if none
+ * matches.
+ */
+static inline int find_snapshot_by_id(FvdSnapshot *sns, int n, const char *id)
+{
+    int idx = 0;
+
+    while (idx < n) {
+        if (strcmp(sns[idx].id_str, id) == 0) {
+            return idx;
+        }
+        idx++;
+    }
+    return -ENOENT;
+}
+
+/*
+ * Look up a snapshot by ID first and, failing that, by name.
+ *
+ * sns - array of n snapshot records.
+ * id  - string compared against id_str of every record, then against name
+ *       of every record if no ID matched.
+ *
+ * Returns the index of the matching record, or -ENOENT if neither an ID nor
+ * a name matches.
+ */
+static int find_snapshot_by_id_or_name(FvdSnapshot *sns, int n, const char *id)
+{
+    int idx = find_snapshot_by_id(sns, n, id);
+
+    if (idx >= 0) {
+        return idx;
+    }
+
+    /* No ID matched; fall back to matching the snapshot name. */
+    for (idx = 0; idx < n; idx++) {
+        if (strcmp(sns[idx].name, id) == 0) {
+            return idx;
+        }
+    }
+
+    return -ENOENT;
+}
+
+/*
+ * Convert the integer fields of one snapshot record from little-endian to
+ * host byte order, in place. The id_str and name fields are not touched.
+ */
+static inline void snapshot_le_to_cpu(FvdSnapshot *sn)
+{
+    /* 32-bit timestamp fields. */
+    le32_to_cpus(&sn->date_sec);
+    le32_to_cpus(&sn->date_nsec);
+
+    /* VM clock and VM state fields. */
+    le64_to_cpus(&sn->vm_clock_nsec);
+    le64_to_cpus(&sn->vm_state_actual_size);
+    le64_to_cpus(&sn->vm_state_offset);
+    le64_to_cpus(&sn->vm_state_space_size);
+
+    /* Location and size of the snapshot's bitmap and table. */
+    le64_to_cpus(&sn->bitmap_offset);
+    le64_to_cpus(&sn->bitmap_size);
+    le64_to_cpus(&sn->table_offset);
+    le64_to_cpus(&sn->table_size);
+}
+
+/*
+ * Convert the integer fields of one snapshot record from host byte order to
+ * little-endian, in place. The id_str and name fields are not touched.
+ */
+static inline void snapshot_cpu_to_le(FvdSnapshot *sn)
+{
+    /* 32-bit timestamp fields. */
+    cpu_to_le32s(&sn->date_sec);
+    cpu_to_le32s(&sn->date_nsec);
+
+    /* VM clock and VM state fields. */
+    cpu_to_le64s(&sn->vm_clock_nsec);
+    cpu_to_le64s(&sn->vm_state_actual_size);
+    cpu_to_le64s(&sn->vm_state_offset);
+    cpu_to_le64s(&sn->vm_state_space_size);
+
+    /* Location and size of the snapshot's bitmap and table. */
+    cpu_to_le64s(&sn->bitmap_offset);
+    cpu_to_le64s(&sn->bitmap_size);
+    cpu_to_le64s(&sn->table_offset);
+    cpu_to_le64s(&sn->table_size);
+}
+
+/*
+ * Read the image header and the on-disk snapshot list, then locate the
+ * snapshot whose ID or name equals snapshot_id.
+ *
+ * bs          - the FVD image.
+ * h           - filled in by read_fvd_header().
+ * sns         - on success, *sns receives the snapshot list allocated with
+ *               my_qemu_malloc(); the caller releases it with
+ *               my_qemu_free(). The records are left exactly as read from
+ *               disk (no byte-order conversion is done here). On failure
+ *               *sns is not modified.
+ * snapshot_id - ID or name of the wanted snapshot.
+ *
+ * Returns the index of the snapshot in *sns on success, -ENOENT if the image
+ * has no snapshots or none matches, or the negative error code of the failed
+ * header/list read.
+ */
+static int find_snapshot(BlockDriverState *bs, FvdHeader *h,
+                         FvdSnapshot **sns, const char *snapshot_id)
+{
+    int ret, size;
+    FvdSnapshot *p;
+
+    if ((ret = read_fvd_header(bs, h)) < 0) {
+        return ret;
+    }
+
+    if (h->num_snapshots <= 0) {
+        return -ENOENT;
+    }
+
+    /* Read snapshots from disk. */
+    size = h->num_snapshots * sizeof(FvdSnapshot);
+    p = my_qemu_malloc(size);
+    ret = bdrv_pread(bs->file, h->snapshot_offset, (uint8_t*)p, size);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = find_snapshot_by_id_or_name(p, h->num_snapshots, snapshot_id);
+    if (ret >= 0) {
+        *sns = p;
+        return ret;
+    }
+
+fail:
+    /* Release the local list buffer; the caller's pointer was never set. */
+    my_qemu_free(p);
+    return ret;
+}
+
+/*
+ * Load the on-disk refcount table into a caller-provided buffer and convert
+ * every entry from little-endian to host byte order.
+ *
+ * bs       - the FVD image.
+ * h        - header giving refcount_offset and refcount_size (both asserted
+ *            to be non-zero).
+ * refcount - buffer of at least h->refcount_size bytes.
+ *
+ * Returns 0 on success. On a read error the negative error code is returned
+ * and the buffer has ALREADY been released with my_qemu_vfree(); the caller
+ * must not use or free it again.
+ */
+static int read_refcount(BlockDriverState *bs, FvdHeader *h,
+                         uint16_t *refcount)
+{
+    int ret, idx;
+
+    ASSERT(h->refcount_offset > 0 && h->refcount_size > 0);
+    ret = bdrv_read(bs->file, h->refcount_offset / 512,
+                    (uint8_t*)refcount, h->refcount_size / 512);
+    if (ret >= 0) {
+        idx = 0;
+        while (idx < h->refcount_size / sizeof(uint16_t)) {
+            le16_to_cpus(refcount + idx);
+            idx++;
+        }
+
+        PRINT_REFCOUNT(refcount, h->refcount_size);
+        return 0;
+    }
+
+    /* Read failed: the buffer is released here, not by the caller. */
+    my_qemu_vfree(refcount);
+    return ret;
+}
+
+/*
+ * Take an internal snapshot of the image.
+ *
+ * The image is first brought into compact form with block_size equal to
+ * chunk_size (converting it if needed). The current bitmap and chunk mapping
+ * table are then copied into newly allocated chunks, a record describing
+ * them is appended to the snapshot list, a new refcount table is written at
+ * a new location, and finally the header is rewritten to commit the change.
+ *
+ * bs - the FVD image.
+ * si - description of the snapshot. If si->id_str is empty on entry, an ID
+ *      one larger than the largest existing numeric ID is generated and
+ *      stored into si->id_str.
+ *
+ * Returns the result of update_header_and_reopen_img() on success. Failures
+ * before any disk space is allocated return the failing step's error code or
+ * -ENOENT for a header that fails the sanity checks; later failures return
+ * -EIO.
+ */
+static int fvd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *si)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdSnapshot *sns = NULL, *sn;
+    FvdHeader h;
+    int ret;
+    uint64_t size;
+    uint64_t new_refcount_offset, new_refcount_size;
+    /* Host-order copies of the new snapshot's locations. They stay usable
+     * after the snapshot record itself is converted to little-endian. */
+    uint64_t bitmap_offset = 0, bitmap_size = 0;
+    uint64_t table_offset, table_size;
+    uint32_t chunk, k, n_chunks;
+    uint32_t max_chunk = 0; /* The max chunk referred in the refcount table. */
+    uint16_t *refcount = NULL;
+
+    /* Convert the image if needed, since snapshot only works on a compact
+     * image with block_size=chunk_size. Otherwise, if block_size<chunk_size,
+     * the contents of blocks in one chunk C of a snapshot S may scatter in
+     * chunks of multiple previous snapshots, with each snapshot contributing
+     * only the content of one block of chunk C to snapshot S. This
+     * essentially forces snapshot S to track chunk content at the block
+     * level, i.e., artificially implementing block_size=chunk_size
+     * internally. This is not a general limitation of FVD.
+     * block_size=chunk_size is needed only after snapshots are taken. A
+     * high-performance FVD image (like those in a Cloud) may never take any
+     * snapshots. */
+    if (!s->table) {
+        /* Not a compact image. Do conversion. */
+        if ((ret = convert_to_compact_image(bs)) < 0) {
+            return ret;
+        }
+    } else if (s->block_size != s->chunk_size) {
+        /* Change chunk_size. */
+        if ((ret = reduce_chunk_size(bs, s->block_size * 512)) < 0) {
+            return ret;
+        }
+    } else if ((ret = flush_metadata_to_disk(bs, true, false)) < 0) {
+        return ret;
+    }
+    ASSERT(s->table && s->block_size == s->chunk_size);
+
+    if ((ret = read_fvd_header(bs, &h)) < 0) {
+        return ret;
+    }
+
+    /* Sanity check. */
+    if (h.num_snapshots > 65535) {
+        fprintf(stderr, "Cannot take more than 65535 snapshots.\n");
+        return -ENOENT;
+    }
+    if (h.num_snapshots > 0) {
+        if (h.snapshot_offset == 0) {
+            return -ENOENT;
+        }
+        size = h.num_snapshots * sizeof(FvdSnapshot);
+        size = ROUND_UP(size, 512);
+        if (size > h.snapshot_size) {
+            return -ENOENT;
+        }
+    }
+    if (h.snapshot_offset > 0 && h.num_snapshots == 0) {
+        return -ENOENT;
+    }
+
+    /* Allocate memory for the new snapshot list. */
+    size = (1 + h.num_snapshots) * sizeof(FvdSnapshot);
+    size = ROUND_UP(size, 512);
+    sns = my_qemu_blockalign(bs->file, size);
+    if (h.snapshot_offset > 0) {
+        /* Read in the old snapshot list. */
+        size = ROUND_UP(h.num_snapshots * sizeof(FvdSnapshot), 512);
+        ret = bdrv_read(bs->file, h.snapshot_offset / 512,
+                        (uint8_t*)sns, size / 512);
+        if (ret < 0) {
+            /* Nothing else has been allocated yet; report the read error. */
+            my_qemu_vfree(sns);
+            return ret;
+        }
+    }
+
+    if (si->id_str[0] == 0) {
+        /* Generate an ID for the new snapshot, larger than the IDs of
+         * previous snapshots. */
+        uint64_t i, max_id = 0;
+        for (i = 0; i < h.num_snapshots; i++) {
+            uint64_t id = strtoull(sns[i].id_str, NULL, 10);
+            if (id > max_id) {
+                max_id = id;
+            }
+        }
+        snprintf(si->id_str, sizeof(si->id_str), "%llu",
+                 (unsigned long long)(max_id + 1));
+    }
+
+    /* Initialize the new snapshot. All integer fields are kept in host byte
+     * order here and converted once, by snapshot_cpu_to_le(), just before the
+     * list is written. The string copies are bounded by the destination
+     * buffers in the snapshot record. */
+    sn = &sns[h.num_snapshots];
+    memset(sn, 0, sizeof(*sn));
+    pstrcpy(sn->id_str, sizeof(sn->id_str), si->id_str);
+    pstrcpy(sn->name, sizeof(sn->name), si->name);
+    sn->vm_state_actual_size = (uint64_t)si->vm_state_size;
+    sn->date_sec = si->date_sec;
+    sn->date_nsec = si->date_nsec;
+    sn->vm_clock_nsec = si->vm_clock_nsec;
+    sn->vm_state_offset = 0; // XXX
+    sn->vm_state_space_size = 0; // XXX
+
+    /* Allocate disk space for the new snapshot's bitmap. */
+    if (s->fresh_bitmap) {
+        int sz = calc_bitmap_size(h.base_img_size, h.block_size);
+        n_chunks = DIV_ROUND_UP(sz, h.chunk_size);
+        if (IS_EMPTY(chunk = alloc_chunks(bs, n_chunks))) {
+            goto fail;
+        }
+        bitmap_offset = chunk * h.chunk_size;
+        bitmap_size = n_chunks * h.chunk_size;
+        if (chunk + n_chunks > max_chunk) {
+            max_chunk = chunk + n_chunks;
+        }
+
+        /* Make a copy of the bitmap on disk. */
+        sz = ROUND_UP(sz, 512);
+        uint8_t *bitmap = my_qemu_blockalign(bs->file, sz);
+        memcpy(bitmap, s->fresh_bitmap, sz);
+        ret = bdrv_write(bs->file, bitmap_offset / 512,
+                         bitmap, sz / 512);
+        my_qemu_vfree(bitmap);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+    /* With no bitmap, bitmap_offset and bitmap_size stay 0. */
+
+    /* Allocate disk space for the new snapshot's chunk mapping table. */
+    size = calc_table_size(h.virtual_disk_size, h.chunk_size);
+    n_chunks = DIV_ROUND_UP(size, h.chunk_size);
+    if (IS_EMPTY(chunk = alloc_chunks(bs, n_chunks))) {
+        goto fail;
+    }
+    table_offset = chunk * h.chunk_size;
+    table_size = n_chunks * h.chunk_size;
+    if (chunk + n_chunks > max_chunk) {
+        max_chunk = chunk + n_chunks;
+    }
+
+    /* Make a copy of the table on disk. */
+    n_chunks = size / sizeof(uint32_t);
+    size = ROUND_UP(size, 512);
+    uint8_t *table = my_qemu_blockalign(bs->file, size);
+    memcpy(table, s->table, n_chunks * sizeof(uint32_t));
+    ret = bdrv_write(bs->file, table_offset / 512, table, size / 512);
+    my_qemu_vfree(table);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* If the old snapshot list does not have enough disk space to accommodate
+     * the new snapshot, re-allocate disk space for the snapshot list. */
+    size = (1 + h.num_snapshots) * sizeof(FvdSnapshot);
+    if (h.snapshot_size < size) {
+        n_chunks = DIV_ROUND_UP(size, h.chunk_size);
+        if (IS_EMPTY(chunk = alloc_chunks(bs, n_chunks))) {
+            goto fail;
+        }
+
+        h.snapshot_offset = chunk * h.chunk_size;
+        h.snapshot_size = n_chunks * h.chunk_size;
+
+        if (chunk + n_chunks > max_chunk) {
+            max_chunk = chunk + n_chunks;
+        }
+    }
+
+    /* Complete the new record and convert it to the on-disk (little-endian)
+     * layout, matching the snapshot_le_to_cpu() done by the readers. From
+     * here on only the host-order local copies are used. */
+    sn->bitmap_offset = bitmap_offset;
+    sn->bitmap_size = bitmap_size;
+    sn->table_offset = table_offset;
+    sn->table_size = table_size;
+    snapshot_cpu_to_le(sn);
+
+    /* Write out the new snapshot list. */
+    ret = bdrv_write(bs->file, h.snapshot_offset / 512, (uint8_t*)sns,
+                     DIV_ROUND_UP(size, 512));
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Scan the chunk mapping table and update max_chunk.
+     * NOTE(review): empty table entries are not skipped here, unlike in
+     * update_refcount_table(); confirm that READ_TABLE() of an empty entry
+     * cannot inflate max_chunk. */
+    n_chunks = DIV_ROUND_UP(h.virtual_disk_size, h.chunk_size);
+    for (k = 0; k < n_chunks; k++) {
+        chunk = READ_TABLE(s->table[k]);
+        if (chunk + 1 > max_chunk) {
+            max_chunk = chunk + 1;
+        }
+    }
+
+    /* Allocate memory for the refcount table. */
+    new_refcount_size = ROUND_UP(max_chunk * sizeof(uint16_t),
+                                 h.chunk_size);
+    new_refcount_size = MAX(h.refcount_size, new_refcount_size);
+    refcount = my_qemu_blockalign(bs->file, new_refcount_size);
+
+    /* Read in the old refcount table. */
+    if (h.refcount_offset > 0 && (ret = read_refcount(bs, &h, refcount)) < 0) {
+        /* read_refcount() has already released the buffer on failure. */
+        refcount = NULL;
+        goto fail;
+    }
+
+    /* Zero the new tail section of refcount. */
+    if (new_refcount_size > h.refcount_size) {
+        memset(((uint8_t*)refcount) + h.refcount_size, 0,
+               new_refcount_size - h.refcount_size);
+    }
+
+    /* Build the content of the new refcount table. It counts spaces
+     * consumed by snapshots, but counts neither the space for the refcount
+     * table itself nor the space used by the currently running image. */
+    if (bitmap_offset > 0) {
+        /* Add disk space used by the new snapshot's bitmap. */
+        update_refcount_region(refcount, new_refcount_size, h.chunk_size,
+                               bitmap_offset, bitmap_size, 1);
+    }
+
+    /* Add disk space used by the new snapshot's chunk allocation table. */
+    update_refcount_region(refcount, new_refcount_size, h.chunk_size,
+                           table_offset, table_size, 1);
+
+    /* Add disk space used by the new snapshot's data. */
+    n_chunks = DIV_ROUND_UP(h.virtual_disk_size, h.chunk_size);
+    update_refcount_table(refcount, new_refcount_size, s->table, n_chunks, 1);
+
+    /* Allocate disk space for the new refcount table. */
+    n_chunks = new_refcount_size / h.chunk_size;
+    if (IS_EMPTY(chunk = alloc_chunks(bs, n_chunks))) {
+        goto fail;
+    }
+    new_refcount_offset = chunk * h.chunk_size;
+
+    /* Write out the new refcount table. It does not modify the on-disk old
+     * refcount table in place in order to guarantee image integrity in the
+     * event of host crash during refcount update. */
+    PRINT_REFCOUNT(refcount, new_refcount_size);
+    for (k = 0; k < new_refcount_size / sizeof(uint16_t); k++) {
+        cpu_to_le16s(refcount + k);
+    }
+    ret = bdrv_write(bs->file, new_refcount_offset / 512,
+                    (uint8_t*)refcount, new_refcount_size / 512);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Now update the image's header fields and commit the change. Because
+     * the modified header fields are in the first 512 bytes of FvdHeader,
+     * the changes are made atomically. If the host crashes while changing
+     * FvdHeader, after it reboots, it either gets the image before taking the
+     * snapshot, or gets an image with the snapshot already taken
+     * successfully. The image's state is never corrupted with partial
+     * updates. */
+    h.refcount_offset = new_refcount_offset;
+    h.refcount_size = new_refcount_size;
+    h.num_snapshots++;
+
+    my_qemu_vfree(refcount);
+    my_qemu_vfree(sns);
+
+    return update_header_and_reopen_img(bs, &h);
+
+fail:
+    /* Release whatever buffers are still owned by this function. */
+    if (refcount) {
+        my_qemu_vfree(refcount);
+    }
+    if (sns) {
+        my_qemu_vfree(sns);
+    }
+    return -EIO;
+}
+
+/*
+ * Revert the image to a previously taken snapshot.
+ *
+ * The snapshot's bitmap (if it has one) and chunk mapping table are copied
+ * into newly allocated chunks, the header is pointed at the copies and at
+ * the snapshot's VM state, and the header is then rewritten to commit the
+ * change.
+ *
+ * bs          - the FVD image.
+ * snapshot_id - ID or name of the snapshot to revert to.
+ *
+ * Returns the result of update_header_and_reopen_img() on success, the
+ * error from find_snapshot() if the snapshot cannot be located, -EIO if
+ * disk space cannot be allocated, or the negative error code of a failed
+ * read/write.
+ */
+static int fvd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
+{
+    FvdSnapshot *sns = NULL, *sn;
+    int size, ret = 0;
+    FvdHeader h;
+    uint32_t chunk, n_chunks;
+    uint8_t *bitmap = NULL, *table = NULL;
+
+    if ((ret = find_snapshot(bs, &h, &sns, snapshot_id)) < 0) {
+        return ret;
+    }
+
+    sn = &sns[ret];
+    snapshot_le_to_cpu(sn);
+
+    /* Restore the bitmap. */
+    if (sn->bitmap_offset > 0) {
+        /* Allocate disk space for the bitmap. */
+        size = calc_bitmap_size(h.base_img_size, h.block_size);
+        n_chunks = DIV_ROUND_UP(size, h.chunk_size);
+        if (IS_EMPTY(chunk = alloc_chunks(bs, n_chunks))) {
+            ret = -EIO;
+            goto done;
+        }
+        h.bitmap_offset = chunk * h.chunk_size;
+        h.bitmap_size = n_chunks * h.chunk_size;
+
+        /* Load the bitmap. The buffer is aligned for bs->file because that
+         * is the device the read and write below go to. */
+        size = ROUND_UP(size, 512);
+        ASSERT(sn->bitmap_size >= size);
+        bitmap = my_qemu_blockalign(bs->file, size);
+        ret = bdrv_read(bs->file, sn->bitmap_offset / 512, bitmap, size / 512);
+        if (ret < 0) {
+            goto done;
+        }
+
+        /* Copy the bitmap. */
+        ret = bdrv_write(bs->file, h.bitmap_offset / 512, bitmap, size / 512);
+        if (ret < 0) {
+            goto done;
+        }
+        my_qemu_vfree(bitmap);
+        bitmap = NULL;
+    }
+
+    /* Restore the table. First allocate disk space for the table. */
+    size = calc_table_size(h.virtual_disk_size, h.chunk_size);
+    n_chunks = DIV_ROUND_UP(size, h.chunk_size);
+    if (IS_EMPTY(chunk = alloc_chunks(bs, n_chunks))) {
+        ret = -EIO;
+        goto done;
+    }
+    h.table_offset = chunk * h.chunk_size;
+    h.table_size = n_chunks * h.chunk_size;
+
+    /* Load the table. */
+    size = ROUND_UP(size, 512);
+    ASSERT(sn->table_size >= size);
+    table = my_qemu_blockalign(bs->file, size);
+    ret = bdrv_read(bs->file, sn->table_offset / 512, table, size / 512);
+    if (ret < 0) {
+        goto done;
+    }
+
+    /* Copy the table. */
+    ret = bdrv_write(bs->file, h.table_offset / 512, table, size / 512);
+    if (ret < 0) {
+        goto done;
+    }
+
+    /* Take what is still needed from the snapshot record before the list
+     * holding it is released. */
+    h.vm_state_offset = sn->vm_state_offset;
+    h.vm_state_size = sn->vm_state_space_size;
+
+    my_qemu_vfree(table);
+    my_qemu_free(sns);
+
+    return update_header_and_reopen_img(bs, &h);
+
+done:
+    /* Error path: release every buffer still owned by this function. */
+    if (table) {
+        my_qemu_vfree(table);
+    }
+    if (bitmap) {
+        my_qemu_vfree(bitmap);
+    }
+    if (sns) {
+        my_qemu_free(sns);
+    }
+    return ret;
+}
+
+static int fvd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+    FvdSnapshot *sns = NULL, *sn;
+    int sidx, size, ret = 0;
+    FvdHeader h;
+    uint32_t k, chunk, n_chunks, *table = NULL;
+    uint16_t *refcount = NULL;
+
+    if ((sidx = find_snapshot(bs, &h, &sns, snapshot_id)) < 0) {
+        return sidx;
+    }
+
+    h.num_snapshots--;
+    ASSERT(h.num_snapshots != (uint16_t)0xFFFF);
+    if (h.num_snapshots == 0) {
+        my_qemu_free(sns);
+
+        /* Once the following fields are reset, sns related spaces will be
+         * freed automatically when the image is re-opened, include spaces for
+         * snapshot list, refcount, vm_state, data chunks in the snapshot. */
+        h.snapshot_offset = h.snapshot_size = 0;
+        h.refcount_offset = h.refcount_size = 0;
+        h.vm_state_offset = h.vm_state_size = 0;
+
+        /* Re-open the image so that the freed space can be reused. */
+        return update_header_and_reopen_img(bs, &h);
+    }
+
+    sn = &sns[sidx];
+    snapshot_le_to_cpu(sn);
+
+    /* Sanity check. */
+    if (h.refcount_offset == 0 || h.refcount_size == 0) {
+        ret = -ENOENT;
+        goto fail;
+    }
+
+    /* Allocate disk space for the new snapshot list. The old snapshot list on
+     * disk is not updated in place in order to gaurantee image integrity in
+     * the event of host crash while half way through the update. */
+    size = h.num_snapshots * sizeof(FvdSnapshot);
+    n_chunks = DIV_ROUND_UP(size, h.chunk_size);
+    if (IS_EMPTY(chunk = alloc_chunks(bs, n_chunks))) {
+        goto fail;
+    }
+    h.snapshot_offset = chunk * h.chunk_size;
+    h.snapshot_size = n_chunks * h.chunk_size;
+
+    /* Read in the old refcount table. */
+    refcount = my_qemu_blockalign(bs->file, h.refcount_size);
+    if ((ret = read_refcount(bs, &h, refcount)) < 0) {
+        goto fail;
+    }
+
+    /* Free disk space used by the snapshot's bitmap. */
+    if (sn->bitmap_offset > 0) {
+        update_refcount_region(refcount, h.refcount_size, h.chunk_size,
+                               sn->bitmap_offset, sn->bitmap_size, -1);
+    }
+
+    /* Free disk space used by the snapshot's chunk allocation table. */
+    update_refcount_region(refcount, h.refcount_size, h.chunk_size,
+                           sn->table_offset, sn->table_size, -1);
+
+    if (sn->vm_state_offset > 0) {
+        /* Free disk space used by the snapshot's vm_state. */
+        update_refcount_region(refcount, h.refcount_size, h.chunk_size,
+                               sn->vm_state_offset,
+                               sn->vm_state_space_size, -1);
+    }
+
+    /* Free disk space used by data chunks in the snapshot. First load
+     * the snapshot's table. */
+    size = calc_table_size(h.virtual_disk_size, h.chunk_size);
+    size = ROUND_UP(size, 512);
+    ASSERT(sn->table_size >= size);
+    table = my_qemu_blockalign(bs, size);
+    ret = bdrv_read(bs->file, sn->table_offset / 512,
+                    (uint8_t*)table, size / 512);
+    if (ret < 0) {
+        goto fail;
+    }
+    n_chunks = DIV_ROUND_UP(h.virtual_disk_size, h.chunk_size);
+    update_refcount_table(refcount, h.refcount_size, table, n_chunks, -1);
+
+    /* Update the snapshot list and write it to disk at the new location. */
+    if (sidx < h.num_snapshots) {
+        memmove(sn, sn + 1, (h.num_snapshots - sidx) * sizeof(FvdSnapshot));
+    }
+    ret = bdrv_pwrite(bs->file, h.snapshot_offset, (uint8_t*)sns,
+                      h.num_snapshots * sizeof(FvdSnapshot));
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Allocate disk space for the new refcount table. */
+    if (IS_EMPTY(chunk = alloc_chunks(bs, h.refcount_size / h.chunk_size))) {
+        goto fail;
+    }
+    h.refcount_offset = chunk * h.chunk_size;
+
+    /* Write the new refcount table at the new location. The disk space for the
+     * refcount table itself is not counted in refcount. */
+    PRINT_REFCOUNT(refcount, h.refcount_size);
+    for (k = 0; k < h.refcount_size / sizeof(uint16_t); k++) {
+        cpu_to_le16s(refcount + k);
+    }
+    ret = bdrv_write(bs->file, h.refcount_offset / 512,
+                    (uint8_t*)refcount, h.refcount_size / 512);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    my_qemu_free(sns);
+    my_qemu_vfree(table);
+    my_qemu_vfree(refcount);
+
+    /* Now write FvdHeader to commit the change. Because the modified header
+     * fields are in the first 512 bytes of FvdHeader, the changes are made
+     * atomically. If the host crashes while changing FvdHeader, after it
+     * reboots, it either gets the image before deleting the snapshot, or gets
+     * an image with the snapshot already deleted successfully. The image's
+     * state is never corrupted with partial updates. The image is reopened
+     * so that the space freed can be used. */
+    return update_header_and_reopen_img(bs, &h);
+
+fail:
+    if (sns) {
+        my_qemu_free(sns);
+    }
+    if (table) {
+        my_qemu_vfree(table);
+    }
+    if (refcount) {
+        my_qemu_vfree(refcount);
+    }
+    return ret;
+}
+
+/* List the image's snapshots. Returns the number of snapshots and stores a
+ * newly allocated QEMUSnapshotInfo array in *psn_tab (NULL if none).
+ * A negative value is returned if the header or the snapshot list cannot be
+ * read; *psn_tab is set to NULL only in the latter case. */
+static int fvd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
+{
+    FvdHeader h;
+    FvdSnapshot *recs;
+    QEMUSnapshotInfo *info = NULL;
+    int ret, k, list_bytes;
+
+    if ((ret = read_fvd_header(bs, &h)) < 0) {
+        return ret;
+    }
+    if (h.num_snapshots <= 0) {
+        *psn_tab = NULL;
+        return 0;
+    }
+
+    /* Fetch the on-disk snapshot records, then convert each to host order. */
+    list_bytes = h.num_snapshots * sizeof(FvdSnapshot);
+    recs = my_qemu_malloc(list_bytes);
+    ret = bdrv_pread(bs->file, h.snapshot_offset, (uint8_t*)recs, list_bytes);
+    if (ret >= 0) {
+        info = my_qemu_mallocz(h.num_snapshots * sizeof(QEMUSnapshotInfo));
+        for (k = 0; k < h.num_snapshots; k++) {
+            pstrcpy(info[k].id_str, sizeof(info[k].id_str), recs[k].id_str);
+            pstrcpy(info[k].name, sizeof(info[k].name), recs[k].name);
+            info[k].vm_state_size = le64_to_cpu(recs[k].vm_state_actual_size);
+            info[k].date_sec = le32_to_cpu(recs[k].date_sec);
+            info[k].date_nsec = le32_to_cpu(recs[k].date_nsec);
+            info[k].vm_clock_nsec = le64_to_cpu(recs[k].vm_clock_nsec);
+        }
+        ret = h.num_snapshots;
+    }
+
+    my_qemu_free(recs);
+    *psn_tab = info;
+    return ret;
+}
+
+/* Load the bitmap and the chunk table saved with the snapshot selected by
+ * 'snapshot_name' into the in-memory state of 'bs'. Returns 0 on success,
+ * -ENOENT if the image has no such snapshot, or the negative value of the
+ * read that failed. */
+static int fvd_snapshot_load_tmp(BlockDriverState *bs,
+                                 const char *snapshot_name)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdSnapshot *sns = NULL, *sn;
+    int idx, size, ret = 0;
+    FvdHeader h;
+
+    if ((ret = read_fvd_header(bs, &h)) < 0) {
+        return ret;
+    }
+    if (h.num_snapshots <= 0) {
+        return -ENOENT;
+    }
+
+    /* Read snapshots from disk. */
+    size = h.num_snapshots * sizeof(FvdSnapshot);
+    sns = my_qemu_malloc(size);
+    ret = bdrv_pread(bs->file, h.snapshot_offset, (uint8_t*)sns, size);
+    if (ret < 0) {
+        goto done;
+    }
+
+    /* bdrv_pread() reports success as a byte count. Reset it so that 0, not
+     * that count, is returned when neither bitmap nor table is loaded. */
+    ret = 0;
+
+    idx = find_snapshot_by_id_or_name(sns, h.num_snapshots, snapshot_name);
+    if (idx < 0) {
+        ret = -ENOENT;
+        goto done;
+    }
+    sn = &sns[idx];
+    snapshot_le_to_cpu(sn);
+
+    /* Load the bitmap, if this image keeps one in memory. */
+    if (s->fresh_bitmap) {
+        size = calc_bitmap_size(h.base_img_size, h.block_size);
+        size = DIV_ROUND_UP(size, 512);
+        ret = bdrv_read(bs->file, sn->bitmap_offset / 512,
+                        s->fresh_bitmap, size);
+    }
+
+    /* Load the table of a compact image, unless the bitmap read failed. */
+    if (ret >= 0 && s->table) {
+        size = calc_table_size(h.virtual_disk_size, h.chunk_size);
+        size = DIV_ROUND_UP(size, 512);
+        ret = bdrv_read(bs->file, sn->table_offset / 512,
+                        (uint8_t*)s->table, size);
+    }
+
+done:
+    my_qemu_free(sns);
+    return ret;
+}
+
+/* Check the snapshot metadata against the refcount table. For every
+ * snapshot, the space used by its bitmap, chunk table, vm_state and data
+ * chunks is passed to the refcount update helpers with a delta of -1; any
+ * refcount entry that is still positive afterwards is counted in
+ * res->corruptions. Returns 0 when the check reached a verdict, or a negative
+ * value, after incrementing res->check_errors, when metadata cannot be read. */
+static int fvd_check_refcount(BlockDriverState *bs, BdrvCheckResult *res)
+{
+    int i, ret, size;
+    FvdSnapshot *sns = NULL, *sn;
+    FvdHeader h;
+    uint16_t *refcount = NULL;
+    uint32_t *table = NULL;
+    uint32_t n_chunks;
+
+    if ((ret = read_fvd_header(bs, &h)) < 0) {
+        goto fail;
+    }
+
+    if (h.num_snapshots == 0) {
+        /* No snapshots: each snapshot-related header field must be zero, and
+         * each non-zero field counts as exactly one corruption.
+         * NOTE(review): refcount_size used to be tested twice here; the
+         * second test may have been meant for another field -- confirm. */
+        res->corruptions += (h.snapshot_offset != 0) + (h.snapshot_size != 0) +
+                            (h.refcount_offset != 0) + (h.refcount_size != 0);
+
+        return 0;
+    }
+
+    /* Read snapshots from disk. */
+    size = h.num_snapshots * sizeof(FvdSnapshot);
+    if (h.snapshot_size < size) {
+        res->corruptions++;
+        goto done;
+    }
+
+    sns = my_qemu_malloc(size);
+    ret = bdrv_pread(bs->file, h.snapshot_offset, (uint8_t*)sns, size);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Read refcount. */
+    refcount = my_qemu_blockalign(bs->file, h.refcount_size);
+    if ((ret = read_refcount(bs, &h, refcount)) < 0) {
+        goto fail;
+    }
+
+    /* Go through all snapshots. */
+    size = calc_table_size(h.virtual_disk_size, h.chunk_size);
+    size = ROUND_UP(size, 512);
+    n_chunks = DIV_ROUND_UP(h.virtual_disk_size, h.chunk_size);
+    table = my_qemu_blockalign(bs, size);
+
+    for (i = 0; i < h.num_snapshots; i++) {
+        sn = sns + i;
+        snapshot_le_to_cpu(sn);
+
+        if (sn->table_offset == 0 || sn->table_size < size) {
+            res->corruptions++;
+            goto done;
+        }
+
+        /* Count disk space used by the snapshot's bitmap. */
+        if (sn->bitmap_offset > 0) {
+            update_refcount_region(refcount, h.refcount_size, h.chunk_size,
+                                   sn->bitmap_offset, sn->bitmap_size, -1);
+        }
+
+        /* Count disk space used by the snapshot's chunk allocation table. */
+        update_refcount_region(refcount, h.refcount_size, h.chunk_size,
+                               sn->table_offset, sn->table_size, -1);
+
+        if (sn->vm_state_offset > 0) {
+            /* Count disk space used by the snapshot's vm_state. */
+            update_refcount_region(refcount, h.refcount_size, h.chunk_size,
+                                   sn->vm_state_offset,
+                                   sn->vm_state_space_size, -1);
+        }
+
+        /* Count disk space used by data chunks in the snapshot. First load
+         * the snapshot's table. */
+        ret = bdrv_read(bs->file, sn->table_offset / 512,
+                        (uint8_t*)table, size / 512);
+        if (ret < 0) {
+            goto fail;
+        }
+        update_refcount_table(refcount, h.refcount_size, table, n_chunks, -1);
+    }
+
+    /* A refcount left positive is not accounted for by any snapshot. */
+    for (i = 0; i < h.refcount_size / sizeof(uint16_t); i++) {
+        if (refcount[i] > 0) {
+            res->corruptions++;
+        }
+    }
+
+done:
+    ret = 0;
+    goto cleanup;
+
+fail:
+    res->check_errors++;
+
+cleanup:
+    /* 'done' and 'fail' can both be reached before the buffers exist, so
+     * every pointer is tested before it is released. */
+    if (sns) {
+        my_qemu_free(sns);
+    }
+    if (table) {
+        my_qemu_vfree(table);
+    }
+    if (refcount) {
+        my_qemu_vfree(refcount);
+    }
+    return ret;
+}
diff --git a/block/fvd-store.c b/block/fvd-store.c
index ec23fd7..8649a94 100644
--- a/block/fvd-store.c
+++ b/block/fvd-store.c
@@ -11,18 +11,19 @@ 
  *
  */
 
-static uint32_t allocate_chunk(BlockDriverState * bs);
-static inline FvdAIOCB *init_store_acb(int soft_write,
-                QEMUIOVector * orig_qiov, BlockDriverState * bs,
-                int64_t sector_num, int nb_sectors, FvdAIOCB * parent_acb,
-                BlockDriverCompletionFunc * cb, void *opaque);
-static BlockDriverAIOCB *store_data_in_compact_image(int soft_write,
+static uint32_t alloc_chunk(BlockDriverState * bs);
+static inline FvdAIOCB *init_store_acb(int copy_on_read,
+                uint32_t *shared_table_copy, QEMUIOVector * orig_qiov,
+                BlockDriverState * bs, int64_t sector_num, int nb_sectors,
+                FvdAIOCB * parent_acb, BlockDriverCompletionFunc * cb,
+                void *opaque);
+static BlockDriverAIOCB *store_data_in_compact_image(int copy_on_read,
             struct FvdAIOCB *parent_acb, BlockDriverState * bs,
             int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
             BlockDriverCompletionFunc * cb, void *opaque);
 static void store_data_in_compact_image_cb(void *opaque, int ret);
 
-static inline BlockDriverAIOCB *store_data(int soft_write,
+static inline BlockDriverAIOCB *store_data(int copy_on_read,
                 FvdAIOCB * parent_acb, BlockDriverState * bs,
                 int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
                 BlockDriverCompletionFunc * cb, void *opaque)
@@ -33,19 +34,19 @@  static inline BlockDriverAIOCB *store_data(int soft_write,
 
     if (!s->table) {
         /* Write directly since it is not a compact image. */
-        return bdrv_aio_writev(s->fvd_data, s->data_offset + sector_num,
+        return bdrv_aio_writev(bs->file, s->data_offset + sector_num,
                                orig_qiov, nb_sectors, cb, opaque);
     } else {
-        return store_data_in_compact_image(soft_write, parent_acb, bs,
+        return store_data_in_compact_image(copy_on_read, parent_acb, bs,
                                            sector_num, orig_qiov, nb_sectors,
                                            cb, opaque);
     }
 }
 
-/* Store data in the compact image. The argument 'soft_write' means
+/* Store data in the compact image. The argument 'copy_on_read' means
  * the store was caused by copy-on-read or prefetching, which need not
  * update metadata immediately. */
-static BlockDriverAIOCB *store_data_in_compact_image(int soft_write,
+static BlockDriverAIOCB *store_data_in_compact_image(int copy_on_read,
                                                      FvdAIOCB * parent_acb,
                                                      BlockDriverState * bs,
                                                      int64_t sector_num,
@@ -56,16 +57,22 @@  static BlockDriverAIOCB *store_data_in_compact_image(int soft_write,
 {
     BDRVFvdState *s = bs->opaque;
     FvdAIOCB *acb;
+    int64_t start_sec;
+    uint32_t prev, chunk, *stab;
+    size_t iov_left;
+    uint8_t *iov_buf;
+    int i, nb, iov_index, nqiov, niov;
     const uint32_t first_chunk = sector_num / s->chunk_size;
     const uint32_t last_chunk = (sector_num + nb_sectors - 1) / s->chunk_size;
-    int table_dirty = false;
-    uint32_t chunk;
-    int64_t start_sec;
+    bool table_dirty = false, table_shared = false;
 
     /* Check if storag space is allocated. */
     for (chunk = first_chunk; chunk <= last_chunk; chunk++) {
+        /* It cannot be dirty and shared with snapshot at the same time. */
+        ASSERT(!IS_DIRTY(s->table[chunk]) || !IS_SHARED(s->table[chunk]));
+
         if (IS_EMPTY(s->table[chunk])) {
-            uint32_t id = allocate_chunk(bs);
+            uint32_t id = alloc_chunk(bs);
             if (IS_EMPTY(id)) {
                 return NULL;
             }
@@ -95,20 +102,53 @@  static BlockDriverAIOCB *store_data_in_compact_image(int soft_write,
              * unnecessary locking due to ``false sharing'' of a chunk by two
              * writes. */
             table_dirty = true;
+        } else if (IS_SHARED(s->table[chunk])) {
+            table_shared = true;
+            /* Should never do copy-on-read on a chunk shared with snapshot.*/
+            ASSERT(!copy_on_read);
         }
     }
 
-    if (!(acb = init_store_acb(soft_write, orig_qiov, bs, sector_num,
+    /* Initialize 'stab', the fragment of s->table that we care about. */
+    if (!table_shared) {
+        stab = &s->table[first_chunk];
+    } else {
+        /* Cannot do in-place updates for table entries of shared chunks
+         * before the new data are written to disk. Otherwise, it would return
+         * incorrect data if those new table entries are used by read
+         * operations. Make a copy of the table fragment into 'stab' and
+         * update 'stab' for now. 'stab' will be merged back to the master
+         * copy after the data are written to disk. */
+        size_t size = (last_chunk - first_chunk + 1) * sizeof(uint32_t);
+        stab = my_qemu_malloc(size);
+        memcpy(stab, &s->table[first_chunk], size);
+
+        /* Allocate new disk spaces for chunks shared with snapshot. Data
+         * will be written to the new spaces rather than the shared chunks. */
+        for (chunk = first_chunk; chunk <= last_chunk; chunk++) {
+            if (IS_SHARED(s->table[chunk])) {
+                uint32_t id = alloc_chunk(bs);
+                if (IS_EMPTY(id)) {
+                    my_qemu_free(stab);
+                    return NULL;
+                }
+
+                QDEBUG ("STORE: map chunk %u to %u\n", chunk, id);
+                id |= (DIRTY_TABLE | SHARED_TABLE);
+                WRITE_TABLE(stab[chunk - first_chunk], id);
+            }
+        }
+    }
+
+    if (!(acb = init_store_acb(copy_on_read, table_shared ? stab : NULL,
+                               orig_qiov, bs, sector_num,
                                nb_sectors, parent_acb, cb, opaque))) {
+        if (table_shared) {
+            my_qemu_free(stab);
+        }
         return NULL;
     }
 
-    const bool update_table = (!soft_write && table_dirty);
-    size_t iov_left;
-    uint8_t *iov_buf;
-    int nb, iov_index, nqiov, niov;
-    uint32_t prev;
-
     if (first_chunk == last_chunk) {
         goto handle_one_continuous_region;
     }
@@ -120,13 +160,13 @@  static BlockDriverAIOCB *store_data_in_compact_image(int soft_write,
     iov_index = 0;
     nqiov = 0;
     niov = 0;
-    prev = READ_TABLE(s->table[first_chunk]);
+    prev = READ_TABLE(stab[0]);
 
     /* Data in the first chunk. */
     nb = s->chunk_size - (sector_num % s->chunk_size);
 
     for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
-        uint32_t current = READ_TABLE(s->table[chunk]);
+        uint32_t current = READ_TABLE(stab[chunk - first_chunk]);
         int64_t data_size;
         if (chunk < last_chunk) {
             data_size = s->chunk_size;
@@ -153,20 +193,22 @@  static BlockDriverAIOCB *store_data_in_compact_image(int soft_write,
 handle_one_continuous_region:
         /* A simple case. All data can be written out in one qiov and no new
          * chunks are allocated. */
-        start_sec = READ_TABLE(s->table[first_chunk]) * s->chunk_size +
+        start_sec = READ_TABLE(stab[0]) * s->chunk_size +
             (sector_num % s->chunk_size);
 
-        acb->store.update_table = update_table;
         acb->store.num_children = 1;
         acb->store.one_child.hd_acb =
-            bdrv_aio_writev(s->fvd_data, s->data_offset + start_sec, orig_qiov,
-                            nb_sectors, store_data_in_compact_image_cb,
+            bdrv_aio_writev(bs->file, start_sec, orig_qiov, nb_sectors,
+                            store_data_in_compact_image_cb,
                             &acb->store.one_child);
         if (acb->store.one_child.hd_acb) {
             acb->store.one_child.acb = acb;
             return &acb->common;
         } else {
             my_qemu_aio_release(acb);
+            if (table_shared) {
+                my_qemu_free(stab);
+            }
             return NULL;
         }
     }
@@ -178,7 +220,6 @@  handle_one_continuous_region:
     ASSERT(iov_index == orig_qiov->niov - 1 && iov_left == 0);
 
     /* Need to submit multiple requests to the lower layer. */
-    acb->store.update_table = update_table;
     acb->store.num_children = nqiov;
 
     if (!parent_acb) {
@@ -195,13 +236,12 @@  handle_one_continuous_region:
     QEMUIOVector *q = (QEMUIOVector *) (acb->store.children + nqiov);
     struct iovec *v = (struct iovec *)(q + nqiov);
 
-    start_sec = READ_TABLE(s->table[first_chunk]) * s->chunk_size +
-        (sector_num % s->chunk_size);
+    prev = READ_TABLE(stab[0]);
+    start_sec = prev * s->chunk_size + (sector_num % s->chunk_size);
     nqiov = 0;
     iov_index = 0;
     iov_left = orig_qiov->iov[0].iov_len;
     iov_buf = orig_qiov->iov[0].iov_base;
-    prev = READ_TABLE(s->table[first_chunk]);
 
     /* Data in the first chunk. */
     if (first_chunk == last_chunk) {
@@ -211,7 +251,7 @@  handle_one_continuous_region:
     }
 
     for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
-        uint32_t current = READ_TABLE(s->table[chunk]);
+        uint32_t current = READ_TABLE(stab[chunk - first_chunk]);
         int64_t data_size;
         if (chunk < last_chunk) {
             data_size = s->chunk_size;
@@ -233,8 +273,8 @@  handle_one_continuous_region:
                    " nb_sectors=%zu niov=%d\n", acb->uuid, acb, nqiov,
                    start_sec, q->size / 512, q->niov);
             acb->store.children[nqiov].hd_acb =
-                bdrv_aio_writev(s->fvd_data, s->data_offset + start_sec, q,
-                                q->size / 512, store_data_in_compact_image_cb,
+                bdrv_aio_writev(bs->file, start_sec, q, q->size / 512,
+                                store_data_in_compact_image_cb,
                                 &acb->store.children[nqiov]);
             if (!acb->store.children[nqiov].hd_acb) {
                 goto fail;
@@ -259,17 +299,20 @@  handle_one_continuous_region:
            " nb_sectors=%zu niov=%d\n", acb->uuid, acb, nqiov, start_sec,
            q->size / 512, q->niov);
     acb->store.children[nqiov].hd_acb =
-        bdrv_aio_writev(s->fvd_data, s->data_offset + start_sec, q,
-                        q->size / 512, store_data_in_compact_image_cb,
+        bdrv_aio_writev(bs->file, start_sec, q, q->size / 512,
+                        store_data_in_compact_image_cb,
                         &acb->store.children[nqiov]);
     if (acb->store.children[nqiov].hd_acb) {
         acb->store.children[nqiov].acb = acb;
         return &acb->common;
     }
 
-    int i;
 fail:
     QDEBUG("STORE: acb%llu-%p  failed\n", acb->uuid, acb);
+
+    if (table_shared) {
+        my_qemu_free(stab);
+    }
     for (i = 0; i < nqiov; i++) {
         bdrv_aio_cancel(acb->store.children[i].hd_acb);
     }
@@ -278,63 +321,127 @@  fail:
     return NULL;
 }
 
-static uint32_t allocate_chunk(BlockDriverState * bs)
+/* Tell whether 'num_chunks' more chunks can be allocated. Always true when
+ * s->add_storage_cmd is unset; otherwise, if the chunks do not fit, run that
+ * command once and re-check against the new length of bs->file. */
+static inline bool has_storage_space(BlockDriverState *bs, int num_chunks)
+{
+    BDRVFvdState *s = bs->opaque;
+    uint64_t wanted = num_chunks * s->chunk_size;
+    int64_t file_bytes;
+
+    /* Without add_storage_cmd the underlying file system grows as needed. */
+    if (!s->add_storage_cmd || s->used_storage + wanted <= s->avail_storage) {
+        return true;
+    }
+
+    /* Ask for more storage; a failing command is reported but not fatal. */
+    if (system(s->add_storage_cmd)) {
+        fprintf(stderr, "Error in executing %s\n", s->add_storage_cmd);
+    }
+
+    /* Re-measure the file to learn how much storage is available now. */
+    file_bytes = bdrv_getlength(bs->file);
+    if (file_bytes < 0) {
+        fprintf(stderr, "Error in bdrv_getlength(%s)\n", bs->filename);
+        return false;
+    }
+
+    s->avail_storage = file_bytes / 512;
+    if (s->used_storage + wanted > s->avail_storage) {
+        fprintf(stderr, "Could not allocate more storage space.\n");
+        return false;
+    }
+    QDEBUG("Increased storage to %" PRId64 " bytes.\n", file_bytes);
+    return true;
+}
+/* Allocate one physical chunk. Returns EMPTY_TABLE on failure. */
+static uint32_t alloc_chunk(BlockDriverState * bs)
 {
     BDRVFvdState *s = bs->opaque;
     uint32_t physical_chunk;
 
-    /* Reuse a previously leaked chunk if possible. */
-    if (s->next_avail_leaked_chunk < s->num_leaked_chunks) {
-        physical_chunk = s->leaked_chunks[s->next_avail_leaked_chunk++];
-        QDEBUG("Reuse leaked physical chunk %u\n", physical_chunk);
-        if (s->next_avail_leaked_chunk == s->num_leaked_chunks) {
-            /* All leaked chunks have been used. */
-            my_qemu_free(s->leaked_chunks);
-            s->leaked_chunks = NULL;
-            s->num_leaked_chunks = s->next_avail_leaked_chunk = 0;
-            QDEBUG("All leaked physical chunks reused\n");
-        }
-        if (!s->chunks_relocated) {
-            s->chunks_relocated = true;
-            /* Update the header. */
-            FvdHeader header;
-            if (read_fvd_header(s, &header)) {
-                s->metadata_err_prohibit_write = true;
-            } else {
-                header.chunks_relocated = true;
-                if (update_fvd_header(s, &header)
-                    || bdrv_flush(s->fvd_metadata)) {
-                    s->metadata_err_prohibit_write = true;
-                }
-            }
+    /* Check s->free_chunks first: a zero bit marks a reusable chunk. */
+    if (s->free_chunks) {
+        physical_chunk = find_next_zero_bit(s->free_chunks,
+                                            s->free_chunks_size,
+                                            s->next_free_chunk);
+
+        if (physical_chunk < s->free_chunks_size) {
+            /* Found one free chunk. Set its bit to mark it as used. */
+            QDEBUG("Reuse leaked physical chunk %u\n", physical_chunk);
+            set_bit(physical_chunk, s->free_chunks);
+            s->next_free_chunk = physical_chunk + 1;
+            return physical_chunk;
         }
-        return physical_chunk;
+
+        /* No zero bit found from next_free_chunk on: drop the bitmap. */
+        my_qemu_free(s->free_chunks);
+        s->free_chunks = NULL;
+        QDEBUG("All leaked physical chunks reused\n");
     }
 
-    /* Grow storage space if needed. */
-    if (s->add_storage_cmd &&
-        s->used_storage + s->chunk_size > s->avail_storage) {
-        if (system(s->add_storage_cmd)) {
-            fprintf(stderr, "Error in executing %s\n", s->add_storage_cmd);
-        }
+    if (!has_storage_space(bs, 1)) {
+        return EMPTY_TABLE;
+    }
 
-        /* Check how much storage is available now. */
-        int64_t size = bdrv_getlength(s->fvd_data);
-        if (size < 0) {
-            fprintf(stderr, "Error in bdrv_getlength(%s)\n", bs->filename);
-            return EMPTY_TABLE;
+    physical_chunk = s->used_storage / s->chunk_size;
+    s->used_storage += s->chunk_size;
+    return physical_chunk;
+}
+/* Allocate num_chunks contiguous chunks. Returns EMPTY_TABLE on failure. */
+static uint32_t alloc_chunks(BlockDriverState * bs, int num_chunks)
+{
+    BDRVFvdState *s = bs->opaque;
+    uint32_t physical_chunk;
+
+    if (num_chunks == 1) {
+        return alloc_chunk(bs);
+    }
+
+    /* Check s->free_chunks first to find num_chunks contiguous free chunks. */
+    if (s->free_chunks) {
+        int i;
+        uint32_t next, current = s->next_free_chunk;
+
+retry:
+        current = find_next_zero_bit(s->free_chunks,
+                                     s->free_chunks_size,
+                                     current);
+        for (i = 1; i < num_chunks && current < s->free_chunks_size; i++) {
+            next = find_next_zero_bit(s->free_chunks,
+                                      s->free_chunks_size,
+                                      current + 1);
+            if (next != current + 1) {
+                current = next;
+                goto retry;
+            }
+            current = next;
+        }
-        s->avail_storage = size / 512 - s->data_offset;
-        if (s->used_storage + s->chunk_size > s->avail_storage) {
-            fprintf(stderr, "Could not allocate more storage space.\n");
-            return EMPTY_TABLE;
+        if (current < s->free_chunks_size) {
+            /* Found contiguous space; 'current' is its last chunk. */
+            for (i = 0; i < num_chunks; i++) {
+                set_bit(current - i, s->free_chunks);
+            }
+
+            /* First of the contiguous chunks. */
+            physical_chunk = current + 1 - num_chunks;
+            QDEBUG("Reuse %d leaked physical chunk starting at %u\n",
+                   num_chunks, physical_chunk);
+            return physical_chunk;
         }
+    }
 
-        QDEBUG("Increased storage to %" PRId64 " bytes.\n", size);
+    if (!has_storage_space(bs, num_chunks)) {
+        return EMPTY_TABLE;
     }
 
     physical_chunk = s->used_storage / s->chunk_size;
-    s->used_storage += s->chunk_size;
+    s->used_storage += s->chunk_size * num_chunks;
+
+    QDEBUG("Allocate %d new physical chunks starting at %u\n",
+           num_chunks, physical_chunk);
+
     return physical_chunk;
 }
 
@@ -375,11 +482,37 @@  static void store_data_in_compact_image_cb(void *opaque, int ret)
     if (acb->store.ret) {       /* error */
         QDEBUG("STORE: acb%llu-%p  store_last_child_finished_with_error "
                "ret=%d\n", acb->uuid, acb, acb->store.ret);
+
+        if (acb->store.shared_table_copy) {
+            my_qemu_free(acb->store.shared_table_copy);
+        }
         acb->common.cb(acb->common.opaque, acb->store.ret);
         my_qemu_aio_release(acb);
         return;
     }
 
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+    uint32_t chunk;
+    const uint32_t first_chunk = acb->sector_num / s->chunk_size;
+    const uint32_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+                                / s->chunk_size;
+
+    /* For copy-on-write on data shared with snapshot, merge shared_table_copy
+     * back to s->table after the data are written. */
+    if (acb->store.shared_table_copy) {
+        for (chunk = first_chunk; chunk <= last_chunk; chunk ++) {
+            uint32_t entry = acb->store.shared_table_copy[chunk - first_chunk];
+            if (IS_SHARED(entry)) {
+                ASSERT(IS_SHARED(s->table[chunk]) && IS_DIRTY(entry));
+                s->table[chunk] = entry;
+                CLEAN_SHARED(s->table[chunk]); /* But keep the DIRTY flag. */
+            }
+        }
+        my_qemu_free(acb->store.shared_table_copy);
+        acb->store.shared_table_copy = NULL;
+    }
+
     /* Update the frontier of sectors already written (i.e.,avail_storage).
      * This affects load_data_from_compact_image(). A load from unwritten
      * sectors in allocated chunks should return an array of zeros.  Also
@@ -388,13 +521,7 @@  static void store_data_in_compact_image_cb(void *opaque, int ret)
      * table entries to the journal. If those table entries are no longer
      * dirty, depending on the behavior of parent_acb, it might be able to
      * skip a journal update. */
-    BlockDriverState *bs = acb->common.bs;
-    BDRVFvdState *s = bs->opaque;
-    const uint32_t first_chunk = acb->sector_num / s->chunk_size;
-    const uint32_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
-                                / s->chunk_size;
     bool update_table = false;
-    uint32_t chunk;
     for (chunk = first_chunk; chunk <= last_chunk; chunk ++) {
         int64_t end;
         if (chunk == last_chunk) {
@@ -415,7 +542,8 @@  static void store_data_in_compact_image_cb(void *opaque, int ret)
         }
     }
 
-    if (!acb->store.update_table) {
+    if (acb->store.copy_on_read || !update_table) {
+        /* No need to update the table on disk. Invoke callback. */
         QDEBUG("STORE: acb%llu-%p  "
                "store_last_child_finished_without_table_update\n",
                acb->uuid, acb);
@@ -433,21 +561,16 @@  static void store_data_in_compact_image_cb(void *opaque, int ret)
         acb->store.parent_acb->write.update_table = update_table;
         acb->common.cb(acb->common.opaque, acb->store.ret);
         my_qemu_aio_release(acb);
-    } else if (update_table) {
+    } else {
         QDEBUG("STORE: acb%llu-%p  "
                "store_last_child_finished_and_start_table_update\n",
                acb->uuid, acb);
         write_metadata_to_journal(acb, false);
-    } else {
-        QDEBUG("STORE: acb%llu-%p  "
-               "store_last_child_finished_without_table_update\n",
-               acb->uuid, acb);
-        acb->common.cb(acb->common.opaque, acb->store.ret);
-        my_qemu_aio_release(acb);
     }
 }
 
-static inline FvdAIOCB *init_store_acb(int soft_write,
+static inline FvdAIOCB *init_store_acb(int copy_on_read,
+                                       uint32_t *shared_table_copy,
                                        QEMUIOVector * orig_qiov,
                                        BlockDriverState * bs,
                                        int64_t sector_num, int nb_sectors,
@@ -463,7 +586,8 @@  static inline FvdAIOCB *init_store_acb(int soft_write,
     acb->cancel_in_progress = false;
     acb->sector_num = sector_num;
     acb->nb_sectors = nb_sectors;
-    acb->store.soft_write = soft_write;
+    acb->store.copy_on_read = copy_on_read;
+    acb->store.shared_table_copy = shared_table_copy;
     acb->store.orig_qiov = orig_qiov;
     acb->store.parent_acb = parent_acb;
     acb->store.finished_children = 0;
@@ -480,6 +604,9 @@  static inline FvdAIOCB *init_store_acb(int soft_write,
 
 static void fvd_aio_cancel_store_compact(FvdAIOCB * acb)
 {
+    if (acb->store.shared_table_copy) {
+        my_qemu_free(acb->store.shared_table_copy);
+    }
     if (acb->store.children) {
         int i;
         for (i = 0; i < acb->store.num_children; i++) {
diff --git a/block/fvd-update.c b/block/fvd-update.c
index 4ef4969..e7fc5ad 100644
--- a/block/fvd-update.c
+++ b/block/fvd-update.c
@@ -11,13 +11,19 @@ 
  *
  */
 
+static int convert_to_compact_image(BlockDriverState *bs);
+static int reduce_chunk_size(BlockDriverState *bs, uint64_t new_chunk_size);
+
 static int fvd_update(BlockDriverState * bs, QEMUOptionParameter * options)
 {
     BDRVFvdState *s = bs->opaque;
     FvdHeader header;
+    bool update_header = true;
     int ret;
 
-    read_fvd_header(s, &header);
+    if ((ret = read_fvd_header(bs, &header) < 0)) {
+        return ret;
+    }
 
     while (options && options->name) {
         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
@@ -34,7 +40,7 @@  static int fvd_update(BlockDriverState * bs, QEMUOptionParameter * options)
             header.virtual_disk_size = options->value.n;
             printf("Image resized to %" PRId64 " bytes.\n", options->value.n);
         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
-            if (strlen(options->value.s) > 1023) {
+            if (strlen(options->value.s) >= 1024) {
                 fprintf(stderr, "Error: the new base image name is longer "
                         "than 1023, which is not allowed.\n");
                 return -EINVAL;
@@ -42,16 +48,39 @@  static int fvd_update(BlockDriverState * bs, QEMUOptionParameter * options)
             memset(header.base_img, 0, 1024);
             pstrcpy(header.base_img, 1024, options->value.s);
             printf("Backing file updated to '%s'.\n", options->value.s);
-        } else if (!strcmp(options->name, "data_file")) {
-            if (strlen(options->value.s) > 1023) {
-                fprintf(stderr, "Error: the new data file name is longer "
-                        "than 1023, which is not allowed.\n");
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
+            if (strlen(options->value.s) >= 16) {
+                fprintf(stderr, "Error: the new base image format is longer "
+                        "than 15, which is not allowed.\n");
                 return -EINVAL;
             }
-
-            memset(header.data_file, 0, 1024);
-            pstrcpy(header.data_file, 1024, options->value.s);
-            printf("Data file updated to '%s'.\n", options->value.s);
+            memset(header.base_img_fmt, 0, 16);
+            pstrcpy(header.base_img_fmt, 16, options->value.s);
+            printf("Backing file format updated to '%s'.\n", options->value.s);
+        } else if (!strcmp(options->name, "raw_layout")) {
+            if (options->value.n) {
+                if (header.table_offset > 0) {
+                    fprintf(stderr, "Cannot change a compact image to a non-"
+                            "compact image with raw layout. "
+                            "Use qemu-img convert instead.\n");
+                    ret = -EINVAL;
+                } else {
+                    fprintf(stderr, "It is already a non-compact image.\n");
+                    ret = -EINVAL;
+                }
+            } else if (header.table_offset > 0) {
+                fprintf(stderr, "It is already a compact image.\n");
+                ret = -EINVAL;
+            } else if ((ret = convert_to_compact_image(bs)) < 0) {
+                fprintf(stderr, "Conversion to compact image failed\n");
+            }
+            update_header = false;
+        } else if (!strcmp(options->name, "chunk_size")) {
+            ret = reduce_chunk_size(bs, options->value.n);
+            if (ret == 0) {
+                printf("Chunk size changed to %"PRId64"\n", options->value.n);
+            }
+            update_header = false;
         } else if (!strcmp(options->name, "need_zero_init")) {
             header.need_zero_init = options->value.n;
             if (header.need_zero_init) {
@@ -93,7 +122,7 @@  static int fvd_update(BlockDriverState * bs, QEMUOptionParameter * options)
                    ".\n", header.max_outstanding_copy_on_read_data);
         } else if (!strcmp(options->name, "init_data_region")) {
             if (options->value.n && !s->data_region_prepared) {
-                init_data_region(s);
+                init_data_region(bs);
             }
         } else if (!strcmp(options->name, "prefetch_start_delay")) {
             if (options->value.n <= 0) {
@@ -182,13 +211,194 @@  static int fvd_update(BlockDriverState * bs, QEMUOptionParameter * options)
         options++;
     }
 
-    if ((ret = update_fvd_header(s, &header))) {
-        return ret;
+    if (update_header) {
+        if ((ret = update_fvd_header(bs, &header))) {
+            return ret;
+        }
+        ret = bdrv_flush(bs->file);
     }
-    ret = bdrv_flush(s->fvd_metadata);
+
     return ret;
 }
 
+static int update_header_and_reopen_img(BlockDriverState *bs,
+                                        FvdHeader *new_header)
+{
+    BDRVFvdState *s = bs->opaque;
+    int ret;
+
+    /* Flush metadata; only this failure is returned, later ones abort(). */
+    if ((ret = flush_metadata_to_disk(bs, true, false)) < 0) {
+        return ret;
+    }
+
+    /* Close the image but bs->file is kept open. */
+    fvd_close(bs);
+
+    /* Update stable_journal_epoch in the new header to ensure that the
+     * journal content will be ignored after the image is reopened later. */
+    new_header->stable_journal_epoch = s->journal_epoch++;
+
+    /* Write the new header. */
+    if (update_fvd_header(bs, new_header) < 0 || bdrv_flush(bs->file) < 0) {
+        /* No way to recover since fvd_close() is already invoked. */
+        abort();
+    }
+
+    /* Close bs->file in preparation for reopening the image. */
+    bdrv_close(bs->file);
+
+    if (fvd_open(bs, bs->filename, bs->open_flags) < 0) {
+        abort(); /* No way to recover since fvd_close() is already invoked. */
+    }
+
+    return 0;
+}
+
+/* Turn a raw-layout image into a compact one: write a chunk table after the
+ * data region and reopen with the new header. Returns 0 or negative error. */
+static int convert_to_compact_image(BlockDriverState *bs)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdHeader h;
+    int ret, i = 0;
+
+    if ((ret = read_fvd_header(bs, &h)) < 0) {
+        return ret;
+    }
+
+    const uint64_t metadata_size = h.header_size + h.header_padding_size +
+        h.bitmap_size + h.journal_size;
+
+    /* h.chunk_size and h.block_size are already set by fvd_create(). */
+    if (h.table_offset > 0 || h.chunk_size <= 0 ||
+        h.chunk_size != h.block_size || metadata_size % h.chunk_size != 0) {
+        return -EINVAL;
+    }
+
+    /* Initialize the table. */
+    h.table_size = calc_table_size(h.virtual_disk_size, h.chunk_size);
+    h.table_size = ROUND_UP(h.table_size, h.chunk_size);
+    h.table_offset = metadata_size + ROUND_UP(h.virtual_disk_size,h.chunk_size);
+    uint32_t *table = my_qemu_blockalign(bs->file,  h.table_size);
+    memset(table, 0, h.table_size);
+    uint32_t metadata_chunks = metadata_size / h.chunk_size;
+
+    if (s->fresh_bitmap) {
+        /* Only dirty blocks are allocated. Note block_size==chunk_size. */
+        int blocks = DIV_ROUND_UP(h.base_img_size, h.block_size);
+        for (i = 0; i < blocks; i++) {
+            if (test_bit(i, (unsigned long*)s->fresh_bitmap)) {
+                WRITE_TABLE(table[i], metadata_chunks + i);
+            }
+        }
+    }
+
+    /* i continues where the bitmap scan stopped (0 if there is no bitmap).
+     * Assume all chunks not covered by bitmap are allocated; a slower but more
+     * accurate alternative is to treat only non-zero chunks as allocated. */
+    int chunks = DIV_ROUND_UP(h.virtual_disk_size, h.chunk_size);
+    for (; i < chunks; i++) {
+        WRITE_TABLE(table[i], metadata_chunks + i);
+    }
+
+    /* Write the table. */
+    int growable = bs->file->growable;
+    bs->file->growable = 1;
+    ret = bdrv_write(bs->file, h.table_offset / 512, (uint8_t*)table,
+                     h.table_size / 512);
+    bs->file->growable = growable;
+    my_qemu_vfree(table);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return update_header_and_reopen_img(bs, &h);
+}
+
+/* Shrink the chunk size of a compact image: split every old table entry into
+ * chunk_size/new_chunk_size entries, write the new table to newly allocated
+ * chunks and reopen the image. Returns 0 or a negative error code. */
+static int reduce_chunk_size(BlockDriverState *bs, uint64_t new_chunk_size)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdHeader h;
+    int ret;
+    uint64_t new_table_size, new_table_offset;
+    uint32_t chunk, i, j;
+
+    if ((ret = read_fvd_header(bs, &h)) < 0) {
+        return ret;
+    }
+
+    if (h.num_snapshots > 0) {
+        fprintf(stderr, "Cannot change chunk size of image with snapshot.\n");
+        return -EINVAL;
+    }
+
+    /* Is new_chunk_size valid w.r.t the old chunk_size? */
+    if (h.table_offset <= 0) {
+        fprintf(stderr, "Cannot change chunk size of a non-compact image.\n");
+        return -EINVAL;
+    }
+    if (new_chunk_size == 0 || new_chunk_size >= h.chunk_size) {
+        fprintf(stderr, "New chunk size must be non-zero and smaller than "
+                "the old chunk size.\n");
+        return -EINVAL;
+    }
+    if (h.chunk_size % new_chunk_size != 0) {
+        fprintf(stderr, "Old chunk size is not a multiple of new chunk size\n");
+        return -EINVAL;
+    }
+
+    /* Is new_chunk_size valid w.r.t the block_size? */
+    if (h.bitmap_offset > 0 && (new_chunk_size < h.block_size ||
+                                new_chunk_size % h.block_size != 0)) {
+        fprintf(stderr, "New chunk size is not a multiple of block size\n");
+        return -EINVAL;
+    }
+
+    /* Allocate space for the new table. */
+    new_table_size = calc_table_size(h.virtual_disk_size, new_chunk_size);
+    new_table_size = ROUND_UP(new_table_size, new_chunk_size);
+    chunk = alloc_chunks(bs, DIV_ROUND_UP(new_table_size, h.chunk_size));
+    if (IS_EMPTY(chunk)) {
+        return -EIO;
+    }
+    new_table_offset = (uint64_t)chunk * h.chunk_size;
+
+    /* Convert old table to new table. */
+    uint32_t *new_table = my_qemu_blockalign(bs->file,  new_table_size);
+    memset(new_table, 0, new_table_size);
+    const uint32_t m = h.chunk_size / new_chunk_size;
+    uint32_t chunks = DIV_ROUND_UP(h.virtual_disk_size, h.chunk_size);
+    uint32_t n_entries = DIV_ROUND_UP(h.virtual_disk_size, new_chunk_size);
+    for (i = 0; i < chunks; i++) {
+        if (IS_EMPTY(s->table[i])) {
+            continue;
+        }
+        const uint32_t first = READ_TABLE(s->table[i]) * m;
+        /* Bounded by n_entries: the last old chunk may be partial. */
+        for (j = 0; j < m && m * i + j < n_entries; j++) {
+            WRITE_TABLE(new_table[m * i + j], first + j);
+        }
+    }
+
+    /* Write the new table. */
+    ret = bdrv_write(bs->file, new_table_offset / 512, (uint8_t*)new_table,
+                     new_table_size / 512);
+    my_qemu_vfree(new_table);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* Update header. */
+    h.table_offset = new_table_offset;
+    h.table_size = new_table_size;
+    h.chunk_size = new_chunk_size;
+    return update_header_and_reopen_img(bs, &h);
+}
+
 static QEMUOptionParameter fvd_update_options[] = {
     {
      .name = BLOCK_OPT_SIZE,
@@ -205,19 +415,11 @@  static QEMUOptionParameter fvd_update_options[] = {
     {
      .name = BLOCK_OPT_BACKING_FILE,
      .type = OPT_STRING,
-     .help = "File name of a backing image"},
+     .help = "File name of the backing image"},
     {
      .name = BLOCK_OPT_BACKING_FMT,
      .type = OPT_STRING,
-     .help = "Image format of the backing image"},
+     .help = "Format of the backing image"},
-    {
-     .name = "data_file",
-     .type = OPT_STRING,
-     .help = "File name of a data file"},
-    {
-     .name = "data_file_fmt",
-     .type = OPT_STRING,
-     .help = "Image format of the data file"},
     {
      .name = "copy_on_read",
      .type = OPT_FLAG,
@@ -231,9 +433,17 @@  static QEMUOptionParameter fvd_update_options[] = {
      .type = OPT_SIZE,
      .help = "Journal size"},
     {
+     .name = "raw_layout",
+     .type = OPT_FLAG,
+     .help = "raw_layout=on|off"},
+    {
+     .name = "chunk_size",
+     .type = OPT_SIZE,
+     .help = "New chunk size"},
+    {
      .name = "need_zero_init",
      .type = OPT_FLAG,
-     .help = "compact_image=on|off"},
+     .help = "need_zero_init=on|off"},
     {
      .name = "max_outstanding_copy_on_read_data",
      .type = OPT_SIZE,
diff --git a/block/fvd-write.c b/block/fvd-write.c
index a74dc5d..f36bd61 100644
--- a/block/fvd-write.c
+++ b/block/fvd-write.c
@@ -20,10 +20,15 @@  static inline BlockDriverAIOCB *store_data(int soft_write,
                 FvdAIOCB * parent_acb, BlockDriverState * bs,
                 int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
                 BlockDriverCompletionFunc * cb, void *opaque);
+static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
+                    BlockDriverState * bs, int64_t sector_num,
+                    QEMUIOVector * orig_qiov, int nb_sectors,
+                    BlockDriverCompletionFunc * cb, void *opaque);
 
-static inline void init_data_region(BDRVFvdState * s)
+static inline void init_data_region(BlockDriverState * bs)
 {
-    bdrv_truncate(s->fvd_data, s->data_offset * 512 + s->virtual_disk_size);
+    BDRVFvdState *s = bs->opaque;
+    bdrv_truncate(bs->file, s->data_offset * 512 + s->virtual_disk_size);
     s->data_region_prepared = true;
 }
 
@@ -43,33 +48,40 @@  static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
     }
 
     if (!s->data_region_prepared) {
-        init_data_region(s);
+        init_data_region(bs);
     }
 
-    if (s->prefetch_state == PREFETCH_STATE_FINISHED
-        || sector_num >= s->base_img_sectors) {
-        /* This is an  efficient case. See Section 3.3.5 of the FVD-cow paper.
-         * This also covers the case of no base image. */
+    /* If the image's current state depends on a snapshot, it must take the
+     * slow path to check if it needs to do copy-on-write for chunks shared
+     * with the snapshot. */
+    if (!s->share_chunk_with_snapshot) {
+        if (s->prefetch_state == PREFETCH_STATE_FINISHED ||
+            sector_num >= s->base_img_sectors) {
+            /* This is an  efficient case, no need to perform copy-on-write.
+             * This also covers the case of no base image.
+             * See Section 3.3.5 of the FVD-cow paper. */
+            return store_data(false, NULL, bs, sector_num, qiov,
+                              nb_sectors, cb, opaque);
+        }
+
+        /* Check if all requested sectors are in the FVD data file. */
+        int64_t sec = ROUND_DOWN(sector_num, s->block_size);
+        int64_t sec_in_last_block = ROUND_DOWN(sector_num + nb_sectors - 1,
+                                               s->block_size);
+        do {
+            if (stale_bitmap_show_sector_in_base_img(sec, s)) {
+                goto slow_path;
+            }
+            sec += s->block_size;
+        } while (sec <= sec_in_last_block);
+
+        /* This is the fast path, since 1) all requested data are in the FVD
+         * data file, 2) no need to update the bitmap, and 3) no chunk sharing
+         * with snapshot and hence no need for copy-on-write. */
         return store_data(false, NULL, bs, sector_num, qiov,
                           nb_sectors, cb, opaque);
     }
 
-    /* Check if all requested sectors are in the FVD data file. */
-    int64_t sec = ROUND_DOWN(sector_num, s->block_size);
-    int64_t sec_in_last_block = ROUND_DOWN(sector_num + nb_sectors - 1,
-                                           s->block_size);
-    do {
-        if (stale_bitmap_show_sector_in_base_img(sec, s)) {
-            goto slow_path;
-        }
-        sec += s->block_size;
-    } while (sec <= sec_in_last_block);
-
-    /* This is the fast path, as all requested data are in the FVD data file
-     * and no need to update the bitmap. */
-    return store_data(false, NULL, bs, sector_num, qiov,
-                      nb_sectors, cb, opaque);
-
 slow_path:
     acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
     if (!acb) {
@@ -84,6 +96,7 @@  slow_path:
     acb->write.update_table = false;
     acb->write.qiov = qiov;
     acb->write.hd_acb = NULL;
+    acb->write.hd_acb2 = NULL;
     acb->write.cow_buf = NULL;
     acb->copy_lock.next.le_prev = NULL;
     acb->write.next_write_lock.le_prev = NULL;
@@ -111,6 +124,9 @@  static void fvd_aio_cancel_write(FvdAIOCB * acb)
     if (acb->write.hd_acb) {
         bdrv_aio_cancel(acb->write.hd_acb);
     }
+    if (acb->write.hd_acb2) {
+        bdrv_aio_cancel(acb->write.hd_acb2);
+    }
     if (acb->jcb.hd_acb) {
         bdrv_aio_cancel(acb->jcb.hd_acb);
         BDRVFvdState *s = acb->common.bs->opaque;
@@ -180,6 +196,16 @@  static void write_data_cb(void *opaque, int ret)
 
     QDEBUG("WRITE: acb%llu-%p  write_data_cb\n", acb->uuid, acb);
 
+    /* The case of no base image or prefetching finished. */
+    if (!s->fresh_bitmap) {
+        if (acb->write.update_table) {
+            write_metadata_to_journal(acb, false);
+        } else {
+            finish_write(acb, ret);
+        }
+        return;
+    }
+
     /* Figure out whether to update metadata or not. */
     if (s->fresh_bitmap == s->stale_bitmap) {
         /* Neither copy_on_read nor prefetching is enabled. Cannot update
@@ -191,7 +217,6 @@  static void write_data_cb(void *opaque, int ret)
         } else {
             finish_write(acb, ret);     /* No need to update metadata. */
         }
-
         return;
     }
 
@@ -216,7 +241,7 @@  static void write_data_cb(void *opaque, int ret)
     }
 }
 
-static void read_backing_for_copy_on_write_cb(void *opaque, int ret)
+static void cow_read_cb(void *opaque, int ret, int id)
 {
     FvdAIOCB *acb = (FvdAIOCB *) opaque;
     BlockDriverState *bs = acb->common.bs;
@@ -226,12 +251,28 @@  static void read_backing_for_copy_on_write_cb(void *opaque, int ret)
     }
 
     if (ret != 0) {
-        QDEBUG("WRITE: acb%llu-%p  read_backing with error "
-               "ret=%d\n", acb->uuid, acb, ret);
-        finish_write(acb, ret);
+        QDEBUG("WRITE: acb%llu-%p  cow_read with error ret=%d\n",
+               acb->uuid, acb, ret);
+        acb->write.ret = ret;
+    }
+
+    if (id == 1) {
+        acb->write.hd_acb = NULL;
+        if (acb->write.hd_acb2) {
+            return;     /* The other read has not finished yet. */
+        }
     } else {
-        QDEBUG("WRITE: acb%llu-%p  "
-               "finish_read_from_backing_and_start_write_data\n",
+        acb->write.hd_acb2 = NULL;
+        if (acb->write.hd_acb) {
+            return;     /* The other read has not finished yet. */
+        }
+    }
+
+    /* All reads finished. Do write now. */
+    if (acb->write.ret != 0) {
+        finish_write(acb, acb->write.ret); /* Error. */
+    } else {
+        QDEBUG("WRITE: acb%llu-%p  cow_finish_read_and_start_write_data\n",
                acb->uuid, acb);
         acb->write.hd_acb = store_data(false, acb, bs,
                                        acb->write.cow_start_sector,
@@ -244,6 +285,29 @@  static void read_backing_for_copy_on_write_cb(void *opaque, int ret)
     }
 }
 
+static void cow_read_cb1(void *opaque, int ret)
+{
+    cow_read_cb(opaque, ret, 1);    /* 1: the read held in write.hd_acb. */
+}
+
+static void cow_read_cb2(void *opaque, int ret)
+{
+    cow_read_cb(opaque, ret, 2);    /* 2: the read held in write.hd_acb2. */
+}
+
+/* Issue a copy-on-write read via load_data() or from the base image. */
+static inline BlockDriverAIOCB * read_cow_data(bool read_from_snapshot,
+                    FvdAIOCB * acb, BlockDriverState * bs, int64_t sector_num,
+                    QEMUIOVector * qiov, int nb_sectors,
+                    BlockDriverCompletionFunc * cb, void *opaque)
+{
+    if (!read_from_snapshot) {
+        return bdrv_aio_readv(bs->backing_hd, sector_num, qiov, nb_sectors,
+                              cb, opaque);      /* Read from base image. */
+    }
+    return load_data(acb, bs, sector_num, qiov, nb_sectors, cb, opaque);
+}
+
 static int do_aio_write(FvdAIOCB * acb)
 {
     BlockDriverState *bs = acb->common.bs;
@@ -253,8 +317,11 @@  static int do_aio_write(FvdAIOCB * acb)
     const int64_t sector_end = acb->sector_num + acb->nb_sectors;
     const int64_t block_begin = ROUND_DOWN(acb->sector_num, s->block_size);
     int64_t block_end = ROUND_UP(sector_end, s->block_size);
+    bool cow_first_block = false, cow_last_block = false;
+    bool cow_first_block_from_snapshot = false;
+    bool cow_last_block_from_snapshot = false;
 
-    /* Check for conflicting copy-on-reads. */
+    /* Check for conflicting copy-on-read or copy-on-write. */
     FvdAIOCB *old;
     QLIST_FOREACH(old, &s->copy_locks, copy_lock.next) {
         if (old->copy_lock.end > acb->sector_num &&
@@ -269,122 +336,199 @@  static int do_aio_write(FvdAIOCB * acb)
         }
     }
 
-    /* No conflict. check if this write updates partial blocks and need to
-     * read those blocks from the base image and merge with this write. */
-    int read_first_block, read_last_block;
+    /* No conflict. Check if this write updates partial blocks and needs to
+     * perform copy-on-write.  Note if (share_chunk_with_snapshot==true), it
+     * must hold that block_size=chunk_size, and hence block_size also means
+     * chunk_size, block_begin also means chunk_begin and block_end also means
+     * chunk_end. */
+    ASSERT (!s->share_chunk_with_snapshot || s->block_size == s->chunk_size);
+
+    /* No need to do copy-on-write for first block if the request is aligned. */
     if (acb->sector_num % s->block_size == 0) {
-        read_first_block = false;
-    } else if (fresh_bitmap_show_sector_in_base_img(acb->sector_num, s)) {
-        read_first_block = true;
-    } else {
-        read_first_block = false;
+        goto check_last_block;
+    }
+
+    /* Perform copy-on-write if the first block is in base image. */
+    if (s->fresh_bitmap &&
+        fresh_bitmap_show_sector_in_base_img(acb->sector_num, s)) {
+        cow_first_block = true;
+        goto check_last_block;
     }
 
+    /* Perform copy-on-write if the first block is shared with snapshot. */
+    if (s->share_chunk_with_snapshot) {
+        uint32_t first_chunk  = acb->sector_num / s->chunk_size;
+        if (IS_SHARED(s->table[first_chunk])) {
+            cow_first_block = true;
+            cow_first_block_from_snapshot = true;
+        }
+    }
+
+check_last_block:
+    /* No need to do copy-on-write for last block if the request is aligned. */
     if (sector_end % s->block_size == 0) {
-        read_last_block = false;
-    } else if (fresh_bitmap_show_sector_in_base_img(sector_end, s)) {
-        read_last_block = true;
-    } else {
-        read_last_block = false;
+        goto do_cow;
     }
 
-    if (read_first_block) {
-        if (read_last_block) {
-            /* Case 1: Read all the blocks involved from the base image. */
+    /* Perform copy-on-write if last block is in base image. */
+    if (s->fresh_bitmap &&
+        fresh_bitmap_show_sector_in_base_img(sector_end, s)) {
+        cow_last_block = true;
+        goto do_cow;
+    }
+
+    /* Perform copy-on-write if last block is shared with snapshot. */
+    if (s->share_chunk_with_snapshot) {
+        uint32_t last_chunk  = sector_end / s->chunk_size;
+        if (IS_SHARED(s->table[last_chunk])) {
+            cow_last_block = true;
+            cow_last_block_from_snapshot = true;
+        }
+    }
+
+do_cow:
+    if (cow_first_block) {
+        if (cow_last_block) {
+            /* Case 1: Read both the first partial block and the last partial
+             * block for copy-on-write. */
             const QEMUIOVector *old_qiov = acb->write.qiov;
-            if (block_end > s->base_img_sectors) {
+            if (!cow_last_block_from_snapshot &&
+                block_end > s->base_img_sectors) {
                 block_end = s->base_img_sectors;
             }
-
-            int buf_size = (block_end - block_begin) * 512
-                + 2 * sizeof(QEMUIOVector)
-                + sizeof(struct iovec) * (old_qiov->niov + 3);
+            ASSERT(sector_end < block_end);
+
+            /* Allocate memory with the following layout:
+             * - data buffer for the first partial block.
+             * - data buffer for the last partial block.
+             * - qiov for writing the combined data.
+             * - iov for writing data read from first partial block.
+             * - iov for old_qiov.
+             * - iov for writing data read from last partial block.
+             * - qiov for reading first partial block.
+             * - qiov for reading last partial block. */
+            int data_size = 512 * ((acb->sector_num - block_begin) +
+                                   (block_end - sector_end));
+            int buf_size = data_size + 3 * sizeof(QEMUIOVector) +
+                sizeof(struct iovec) * (old_qiov->niov + 2);
             buf_size = ROUND_UP(buf_size, 512);
             acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size);
 
-            /* For reading from the base image. */
-            QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf +
-                                            (block_end - block_begin) * 512);
-            read_qiov->iov = (struct iovec *)(read_qiov + 1);
-            read_qiov->nalloc = -1;
-            read_qiov->niov = 1;
-            read_qiov->iov[0].iov_base = acb->write.cow_buf;
-            read_qiov->iov[0].iov_len = read_qiov->size =
-                (block_end - block_begin) * 512;
-
             /* For writing to the FVD data file. */
-            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+            QEMUIOVector *write_qiov =
+                (QEMUIOVector *) (acb->write.cow_buf + data_size);
             write_qiov->iov = (struct iovec *)(write_qiov + 1);
             write_qiov->nalloc = -1;
             write_qiov->niov = old_qiov->niov + 2;
-            write_qiov->size = read_qiov->size;
+            write_qiov->size = 512 * (block_end - block_begin);
 
-            /* The first entry is for data read from the base image. */
+            /* First iov entry is for data read from the first partial block. */
             write_qiov->iov[0].iov_base = acb->write.cow_buf;
-            write_qiov->iov[0].iov_len = (acb->sector_num - block_begin) * 512;
+            write_qiov->iov[0].iov_len = 512 * (acb->sector_num - block_begin);
+
+            /* iov entries in the middle are from the original old_qiov. */
             memcpy(&write_qiov->iov[1], old_qiov->iov,
                    sizeof(struct iovec) * old_qiov->niov);
 
-            /* The last entry is for data read from the base image. */
+            /* Last iov entry is for data read from the last partial block. */
             const int last = old_qiov->niov + 1;
-            write_qiov->iov[last].iov_base = acb->write.cow_buf
-                                        + (sector_end - block_begin) * 512;
-            write_qiov->iov[last].iov_len = (block_end - sector_end) * 512;
+            write_qiov->iov[last].iov_base = acb->write.cow_buf +
+                write_qiov->iov[0].iov_len;
+            write_qiov->iov[last].iov_len = 512 * (block_end - sector_end);
             acb->write.cow_qiov = write_qiov;
             acb->write.cow_start_sector = block_begin;
 
-            acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd, block_begin,
-                                    read_qiov, block_end - block_begin,
-                                    read_backing_for_copy_on_write_cb, acb);
+            /* qiov for reading first partial block. */
+            QEMUIOVector *read_qiov =
+                (QEMUIOVector *) (write_qiov->iov + write_qiov->niov);
+            read_qiov->iov = &write_qiov->iov[0];
+            read_qiov->nalloc = -1;
+            read_qiov->niov = 1;
+            read_qiov->size = read_qiov->iov[0].iov_len;
+
+            /* qiov for reading last partial block. */
+            QEMUIOVector *read_qiov2 = read_qiov + 1;
+            read_qiov2->iov = &write_qiov->iov[write_qiov->niov - 1];
+            read_qiov2->nalloc = -1;
+            read_qiov2->niov = 1;
+            read_qiov2->size = read_qiov2->iov[0].iov_len;
+
+            /* Initialize acb->write.hd_acb2 to an invalid pointer in case
+             * that in qemu tool the callback below is invoked before the
+             * following read returns. This happens in qemu tool if the read
+             * operation need not read from disk and immediately uses a QEMUBH
+             * to complete the read. */
+            acb->write.hd_acb2 = INVALID_POINTER;
+
+            acb->write.hd_acb = read_cow_data(cow_first_block_from_snapshot,
+                                              acb, bs, block_begin, read_qiov,
+                                              acb->sector_num - block_begin,
+                                              cow_read_cb1, acb);
+
             if (!acb->write.hd_acb) {
                 goto fail;
             }
 
+            acb->write.hd_acb2 = read_cow_data(cow_last_block_from_snapshot,
+                                               acb, bs, sector_end, read_qiov2,
+                                               block_end - sector_end,
+                                               cow_read_cb2, acb);
+            if (!acb->write.hd_acb2) {
+                bdrv_aio_cancel(acb->write.hd_acb);
+                goto fail;
+            }
+
             acb->copy_lock.begin = block_begin;
             acb->copy_lock.end = block_end;
             QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
             QDEBUG("WRITE: acb%llu-%p  "
-                   "read_first_last_partial_blocks_from_backing  sector_num=%"
+                   "cow_read_first_last_partial_blocks  sector_num=%"
                    PRId64 " nb_sectors=%d\n", acb->uuid, acb, block_begin,
                    (int)(block_end - block_begin));
         } else {
-            /* Case 2: Read the first block from the base image. */
+            /* Case 2: Read the first partial block for copy-on-write. */
             int nb = acb->sector_num - block_begin;
             const QEMUIOVector *old_qiov = acb->write.qiov;
 
-            /* Space for data and metadata. */
+            /* Allocate memory with the following layout:
+             * - data buffer for the first partial block.
+             * - qiov for writing the combined data.
+             * - iov for data read from first partial block.
+             * - iov for old_qiov.
+             * - qiov for reading first partial block. */
+
             int buf_size = nb * 512 + 2 * sizeof(QEMUIOVector)
-                                + sizeof(struct iovec) * (old_qiov->niov + 2);
+                                + sizeof(struct iovec) * (old_qiov->niov + 1);
             buf_size = ROUND_UP(buf_size, 512);
             acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size);
 
-            /* For reading from the base image. */
-            QEMUIOVector *read_qiov =
-                (QEMUIOVector *) (acb->write.cow_buf + nb * 512);
-            read_qiov->iov = (struct iovec *)(read_qiov + 1);
-            read_qiov->nalloc = -1;
-            read_qiov->niov = 1;
-            read_qiov->iov[0].iov_base = acb->write.cow_buf;
-            read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;
-
             /* For writing to the FVD data file. */
-            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+            QEMUIOVector *write_qiov =
+                (QEMUIOVector *) (acb->write.cow_buf + nb * 512);
             write_qiov->iov = (struct iovec *)(write_qiov + 1);
             write_qiov->nalloc = -1;
             write_qiov->niov = old_qiov->niov + 1;
-            write_qiov->size = old_qiov->size + read_qiov->size;
+            write_qiov->size = old_qiov->size + nb * 512;
 
-            /* The first entry is added for data read from the base image. */
+            /* First entry is added for data read from first partial block. */
             write_qiov->iov[0].iov_base = acb->write.cow_buf;
-            write_qiov->iov[0].iov_len = read_qiov->size;
+            write_qiov->iov[0].iov_len = nb * 512;
             memcpy(&write_qiov->iov[1], old_qiov->iov,
                    sizeof(struct iovec) * old_qiov->niov);
             acb->write.cow_qiov = write_qiov;
             acb->write.cow_start_sector = block_begin;
 
-            acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd,
-                                    block_begin, read_qiov, nb,
-                                    read_backing_for_copy_on_write_cb, acb);
+            /* qiov for reading the first partial block. */
+            QEMUIOVector *read_qiov =
+                (QEMUIOVector *) (write_qiov->iov + write_qiov->niov);
+            read_qiov->iov = &write_qiov->iov[0];
+            read_qiov->size = read_qiov->iov[0].iov_len;
+            read_qiov->niov = 1;
+            read_qiov->nalloc = -1;
+
+            acb->write.hd_acb = read_cow_data(cow_first_block_from_snapshot,
+                                              acb, bs, block_begin, read_qiov,
+                                              nb, cow_read_cb1, acb);
             if (!acb->write.hd_acb) {
                 goto fail;
             }
@@ -392,54 +536,62 @@  static int do_aio_write(FvdAIOCB * acb)
             acb->copy_lock.begin = block_begin;
             acb->copy_lock.end = block_begin + s->block_size;
             QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
-            QDEBUG("WRITE: acb%llu-%p  read_first_partial_block_from_backing  "
+            QDEBUG("WRITE: acb%llu-%p  cow_read_first_partial_block  "
                    "sector_num=%" PRId64 " nb_sectors=%d\n",
                    acb->uuid, acb, block_begin, nb);
         }
     } else {
-        if (read_last_block) {
-            /* Case 3: Read the last block from the base image. */
+        if (cow_last_block) {
+            /* Case 3: Read the last partial block for copy-on-write. */
             int nb;
-            if (block_end < s->base_img_sectors) {
-                nb = block_end - sector_end;
-            } else {
+            if (!cow_last_block_from_snapshot &&
+                block_end > s->base_img_sectors) {
                 nb = s->base_img_sectors - sector_end;
+            } else {
+                nb = block_end - sector_end;
             }
+            ASSERT(nb > 0);
             const QEMUIOVector *old_qiov = acb->write.qiov;
 
-            /* Space for data and metadata. */
+            /* Allocate memory with the following layout:
+             * - data buffer for the last partial block.
+             * - qiov for writing the combined data.
+             * - iov for old_qiov.
+             * - iov for data read from the last partial block.
+             * - qiov for reading last partial block. */
+
             int buf_size = nb * 512 + 2 * sizeof(QEMUIOVector)
-                                + sizeof(struct iovec) * (old_qiov->niov + 2);
+                                + sizeof(struct iovec) * (old_qiov->niov + 1);
             buf_size = ROUND_UP(buf_size, 512);
             acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size);
 
-            /* For reading from the base image. */
-            QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf
-                                                        + nb * 512);
-            read_qiov->iov = (struct iovec *)(read_qiov + 1);
-            read_qiov->nalloc = -1;
-            read_qiov->niov = 1;
-            read_qiov->iov[0].iov_base = acb->write.cow_buf;
-            read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;
-
             /* For writing to the FVD data file. */
-            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+            QEMUIOVector *write_qiov =
+                (QEMUIOVector *) (acb->write.cow_buf + nb * 512);
             write_qiov->iov = (struct iovec *)(write_qiov + 1);
-            write_qiov->nalloc = -1;
             write_qiov->niov = old_qiov->niov + 1;
-            write_qiov->size = old_qiov->size + read_qiov->size;
+            write_qiov->size = old_qiov->size + nb * 512;
+            write_qiov->nalloc = -1;
             memcpy(write_qiov->iov, old_qiov->iov,
                    sizeof(struct iovec) * old_qiov->niov);
 
-            /* The last appended entry is for data read from the base image. */
+            /* Last added entry is for data read from last partial block. */
             write_qiov->iov[old_qiov->niov].iov_base = acb->write.cow_buf;
-            write_qiov->iov[old_qiov->niov].iov_len = read_qiov->size;
+            write_qiov->iov[old_qiov->niov].iov_len = nb * 512;
             acb->write.cow_qiov = write_qiov;
             acb->write.cow_start_sector = acb->sector_num;
 
-            acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd,
-                                    sector_end, read_qiov, nb,
-                                    read_backing_for_copy_on_write_cb, acb);
+            /* qiov for reading from the last partial block. */
+            QEMUIOVector *read_qiov =
+                (QEMUIOVector *) (write_qiov->iov + write_qiov->niov);
+            read_qiov->iov = &write_qiov->iov[write_qiov->niov - 1];
+            read_qiov->size = read_qiov->iov[0].iov_len;
+            read_qiov->niov = 1;
+            read_qiov->nalloc = -1;
+
+            acb->write.hd_acb = read_cow_data(cow_last_block_from_snapshot, acb,
+                                              bs, sector_end, read_qiov, nb,
+                                              cow_read_cb1, acb);
             if (!acb->write.hd_acb) {
                 goto fail;
             }
@@ -447,14 +599,13 @@  static int do_aio_write(FvdAIOCB * acb)
             acb->copy_lock.end = block_end;
             acb->copy_lock.begin = block_end - s->block_size;
             QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
-            QDEBUG("WRITE: acb%llu-%p  read_last_partial_block_from_backing  "
+            QDEBUG("WRITE: acb%llu-%p  cow_read_last_partial_block  "
                    "sector_num=%" PRId64 " nb_sectors=%d\n",
                    acb->uuid, acb, sector_end, nb);
         } else {
             /* Case 4: Can write directly and no need to merge with data from
              * the base image. */
-            QDEBUG("WRITE: acb%llu-%p  "
-                   "write_fvd_without_read_partial_block_from_backing\n",
+            QDEBUG("WRITE: acb%llu-%p  write_without_read_partial_block\n",
                    acb->uuid, acb);
             acb->write.hd_acb = store_data(false, acb, bs, acb->sector_num,
                                            acb->write.qiov, acb->nb_sectors,
@@ -465,7 +616,10 @@  static int do_aio_write(FvdAIOCB * acb)
         }
     }
 
-    QLIST_INSERT_HEAD(&s->write_locks, acb, write.next_write_lock);
+    /* Need to lock out copy_on_read only if prefetching does not finish. */
+    if (s->prefetch_state != PREFETCH_STATE_FINISHED) {
+        QLIST_INSERT_HEAD(&s->write_locks, acb, write.next_write_lock);
+    }
     return 0;
 
 fail:
diff --git a/block/fvd.c b/block/fvd.c
index c779d65..2ee15d4 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -22,6 +22,7 @@ 
  *============================================================================*/
 
 #include "block/fvd.h"
+#include "bswap.h"
 
 #define ENABLE_TRACE_IO
 //#define DEBUG_MEMORY_LEAK
@@ -36,7 +37,6 @@ 
 /* Use include to avoid exposing too many FVD symbols, and to allow inline
  * function optimization. */
 #include "block/fvd-debug.c"
-#include "block/fvd-flush.c"
 #include "block/fvd-utils.c"
 #include "block/fvd-bitmap.c"
 #include "block/fvd-misc.c"
@@ -50,6 +50,7 @@ 
 #include "block/fvd-journal-buf.c"
 #include "block/fvd-prefetch.c"
 #include "block/fvd-update.c"
+#include "block/fvd-snapshot.c"
 
 static AIOPool fvd_aio_pool = {
     .aiocb_size = sizeof(FvdAIOCB),
@@ -72,7 +73,13 @@  static BlockDriver bdrv_fvd = {
     .update_options = fvd_update_options,
     .bdrv_get_info = fvd_get_info,
     .bdrv_update = fvd_update,
-    .bdrv_has_zero_init = fvd_has_zero_init
+    .bdrv_has_zero_init = fvd_has_zero_init,
+    .bdrv_check = fvd_check,
+    .bdrv_snapshot_create = fvd_snapshot_create,
+    .bdrv_snapshot_goto = fvd_snapshot_goto,
+    .bdrv_snapshot_delete = fvd_snapshot_delete,
+    .bdrv_snapshot_list = fvd_snapshot_list,
+    .bdrv_snapshot_load_tmp = fvd_snapshot_load_tmp
 };
 
 static void bdrv_fvd_init(void)
@@ -91,8 +98,8 @@  block_init(bdrv_fvd_init);
 extern QTAILQ_HEAD(, BlockDriverState) bdrv_states;
   static void __attribute__ ((destructor)) flush_fvd_bitmap_to_disk(void)
 {
-    BlockDriverState *bs;
-    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+    BlockDriverState *bs = NULL;
+    while ((bs = bdrv_next(bs))) {
         if (bs->drv == &bdrv_fvd) {
             flush_metadata_to_disk_on_exit(bs);
 #ifdef FVD_DEBUG
diff --git a/block/fvd.h b/block/fvd.h
index 34ea2b4..b5cbb59 100644
--- a/block/fvd.h
+++ b/block/fvd.h
@@ -18,6 +18,7 @@ 
 #include "block.h"
 #include "qemu-queue.h"
 #include "qemu-common.h"
+#include "bitops.h"
 #include "block/fvd-ext.h"
 
 enum {
@@ -28,31 +29,27 @@  enum {
 };
 
 /*
- * The FVD format consists of the following fields in little endian:
+ * The FVD format consists of the following fields in little endian, with all
+ * uint64_t data fields properly aligned on sizeof(uint64_t) boundary.
  *   + Header fields of FvdHeader.
- *   + Bitmap, starting on a 4KB page boundary at a location specified by
- *     FvdHeader.bitmap_offset.
- *   + Journal, starting on a 4KB page boundary at a location specified by
- *     FvdHeader.journal_offset.
- *   + Table, starting on a 4KB page boundary at a location specified by
- *     FvdHeader.table_offset. When expanding the size of an existing FVD
- *     image, the table can be expanded to borrow space from the next,
- *     "virtual disk data" section, by relocating some data chunks.
- *   + Virtual disk data,  starting on a 4KB page boundary. Optionally, disk
- *     data can be stored in a separate data file specified by
- *     FvdHeader.data_file.
+ *   + Bitmap for implementing copy-on-write.
+ *   + Table for implementing compact image.
+ *   + fct (free_chunk_tracer) for tracking available physical chunks.
+ *   + Journal.
+ *   + Virtual disk data.
  */
 typedef struct __attribute__ ((__packed__)) FvdHeader {
     uint32_t magic;             /* FVD_MAGIC */
 
-    /* Size of FvdHeader in bytes, rounded up to DEF_PAGE_SIZE. A new FVD
-     * version may add fields to FvdHeader and hence need to increase
-     * header_size. When an old FVD version reads an image created by a new
-     * FVD version, the old version only reads the beginning part of FvdHeader
-     * that it can understand and ignroes the new fields at the end of
-     * FvdHeader. */
+    /* Size of FvdHeader in bytes. A new FVD version may add fields to
+     * FvdHeader and hence increases header_size. When an old FVD version
+     * reads an image created by a new FVD version, the old version only reads
+     * the beginning part of FvdHeader that it can understand and ignores the
+     * new fields at the end of FvdHeader. */
     uint32_t header_size;
 
+    uint32_t header_padding_size;  /* In bytes. Zeros after FvdHeader. */
+
     /* Version of the FVD software that created the image. */
     uint32_t create_version;
 
@@ -70,38 +67,37 @@  typedef struct __attribute__ ((__packed__)) FvdHeader {
      * 'last_modified' to avoid making the optimization counter effective. */
     uint32_t last_open_version;
 
-    uint64_t virtual_disk_size;  /* in bytes. Disk size perceived by the VM. */
-    uint64_t data_offset;        /* in bytes. Aligned on DEF_PAGE_SIZE. */
-
-    /* Data can be optionally stored in a different file. */
-    char data_file[1024];
-    char data_file_fmt[16];
+    uint32_t clean_shutdown;      /* true if VM's last shutdown was graceful. */
 
-    /* Base image. */
-    char base_img[1024];
-    char base_img_fmt[16];
+    uint64_t virtual_disk_size;  /* in bytes. Disk size perceived by the VM. */
     uint64_t base_img_size;      /* in bytes. */
 
-    /* Bitmap for base image. */
-    uint64_t bitmap_offset;      /* in bytes. Aligned on DEF_PAGE_SIZE. */
-    uint64_t bitmap_size;        /* in bytes. Rounded up to DEF_PAGE_SIZE. */
+    /* Copy-on-write bitmap for base image. */
+    uint64_t bitmap_offset;      /* in bytes. */
+    uint64_t bitmap_size;        /* in bytes. */
     uint64_t block_size;         /* in bytes. */
 
+    /* Table for compact image. */
+    uint64_t table_offset;        /* in bytes. */
+    uint64_t table_size;          /* in bytes. */
+    uint64_t chunk_size;          /* in bytes. */
+    uint64_t storage_grow_unit;   /* in bytes. */
+
     /* Journal */
     uint64_t journal_offset;      /* in bytes. */
     uint64_t journal_size;        /* in bytes. On-disk journal size. */
-    uint32_t clean_shutdown;      /* true if VM's last shutdown was graceful. */
     uint64_t stable_journal_epoch; /* Needed only if a chunk can be relocated.*/
     uint64_t journal_buf_size;     /* in bytes. In-memory buffer size. */
     uint64_t journal_clean_buf_period; /* in milliseconds. */
 
-    /* Table for compact image. */
-    uint64_t table_offset;        /* in bytes. Aligned on DEF_PAGE_SIZE. */
-    uint64_t table_size;          /* in bytes. Rounded up to DEF_PAGE_SIZE. */
-    uint64_t chunk_size;          /* in bytes. */
-    uint64_t storage_grow_unit;   /* in bytes. */
-    char add_storage_cmd[1024];
-    uint32_t chunks_relocated;    /* Affect bdrv_has_zero_init(). */
+    /* Snapshots */
+    uint64_t snapshot_offset;     /* in bytes. Point to an FvdSnapshot list. */
+    uint64_t snapshot_size;       /* in bytes. */
+    uint64_t refcount_offset;     /* in bytes. */
+    uint64_t refcount_size;       /* in bytes. */
+    uint64_t vm_state_offset;     /* in bytes. */
+    uint64_t vm_state_size;       /* in bytes. */
+    uint32_t num_snapshots;
 
     /* Copy-on-read */
     uint32_t copy_on_read;       /* true or false */
@@ -121,7 +117,7 @@  typedef struct __attribute__ ((__packed__)) FvdHeader {
     uint64_t prefetch_throttle_time;  /* in milliseconds. */
 
     /* need_zero_init is true if the image mandates that the storage layer
-     * (BDRVFvdState.fvd_data) must return true for bdrv_has_zero_init().
+     * must return true for bdrv_has_zero_init().
      * This is the case if the optimization described in Section 3.3.3 of the
      * FVD-cow paper is enabled (see function search_holes()). If 'qemu-img
      * create' sets need_zero_init to true, 'qemu-img update' can be used to
@@ -130,11 +126,23 @@  typedef struct __attribute__ ((__packed__)) FvdHeader {
      * file system, it already supports zero_init, and hence there is no need
      * to manually manipulate this field. */
     uint32_t need_zero_init;
+    uint32_t ignore_me_padding_1;
+
+    /* Put all char arrays at the end of FvdHeader so that all important
+     * fields of FvdHeader fit the first 512 bytes (i.e., in one sector).
+     * When multiple fields of FvdHeader need be updated (e.g., when creating
+     * a snapshot), the disk update operation is atomic. Regardless of when
+     * the host crashes, it either gets the old image before the update or
+     * gets a consistent, new image with all fields properly updated. There is
+     * no chance of getting a partially updated, corrupted image. */
+    char base_img[1024];
+    char base_img_fmt[16];
+    char add_storage_cmd[1024];  /* For compact image. */
 
-    /* This field enables adding incompatible features. For example, Suppose
-     * FVD version N+1 adds image compression. A compressed image cannot be
-     * openned by FVD version N. Suppose in FVD version N, the value of
-     * INCOMPATIBLE_FEATURES_SPACE is 4096. Introducing image compression
+    /* The field below enables adding incompatible features. For example,
+     * suppose FVD version N+1 adds image compression. A compressed image
+     * cannot be opened by FVD version N. Suppose in FVD version N, the value
+     * of INCOMPATIBLE_FEATURES_SPACE is 4096. Introducing image compression
      * in FVD version N+1 causes the following changes to the header.
      *   In FVD version N:
      *          uint8_t incompatible_features[4096];
@@ -169,12 +177,10 @@  typedef struct __attribute__ ((__packed__)) FvdHeader {
 } FvdHeader;
 
 typedef struct BDRVFvdState {
-    BlockDriverState *fvd_metadata;
-    BlockDriverState *fvd_data;
     uint64_t virtual_disk_size;  /*in bytes. */
-    uint64_t bitmap_offset;      /* in sectors */
-    uint64_t bitmap_size;        /* in bytes. */
-    uint64_t data_offset;        /* in sectors. Begin of real data. */
+    uint64_t bitmap_offset;      /* in sectors. copy-on-write bitmap. */
+    uint64_t bitmap_size;        /* in bytes. copy-on-write bitmap. */
+    uint64_t data_offset;        /* in sectors. Real data in non-compact img. */
     uint64_t base_img_sectors;
     uint64_t block_size;         /* in sectors. */
     bool copy_on_read;
@@ -192,19 +198,23 @@  typedef struct BDRVFvdState {
 
     /******** Begin: for compact image. *************************************/
     uint32_t *table;    /* Mapping table stored in memory in little endian. */
-    uint64_t table_size;        /* in bytes. */
-    uint64_t used_storage;        /* in sectors. */
-    uint64_t avail_storage;        /* in sectors. */
+    uint64_t table_size;          /* in bytes. */
+    uint64_t table_offset;        /* in sectors. On-disk offset. */
     uint64_t chunk_size;          /* in sectors. */
+    uint64_t avail_storage;       /* in sectors. */
+    uint64_t used_storage;        /* in sectors. */
+    unsigned long *free_chunks;  /* A bitmap with bit 0 for a free chunk. */
+    unsigned long free_chunks_size;
+    unsigned long next_free_chunk;
     uint64_t storage_grow_unit;   /* in sectors. */
-    uint64_t table_offset;        /* in sectors. */
+    bool compact_image_guarantee_zero_init;  /* Affect bdrv_has_zero_init(). */
+    bool share_chunk_with_snapshot;
     char *add_storage_cmd;
-    uint32_t *leaked_chunks;
-    uint32_t num_leaked_chunks;
-    uint32_t next_avail_leaked_chunk;
-    uint32_t chunks_relocated;    /* Affect bdrv_has_zero_init(). */
     /******** Begin: for compact image. *************************************/
 
+    /******** Begin: for snapshots. *****************************************/
+    /******** End: for snapshots. *******************************************/
+
     /******** Begin: for journal. *******************************************/
     uint64_t journal_offset;       /* in sectors. */
     uint64_t journal_size;         /* in sectors. */
@@ -327,6 +337,7 @@  typedef struct AIOCopyCB {
 
 typedef struct AIOWriteCB {
     BlockDriverAIOCB *hd_acb;
+    BlockDriverAIOCB *hd_acb2;
     QEMUIOVector *qiov;
     uint8_t *cow_buf;
     QEMUIOVector *cow_qiov;
@@ -354,13 +365,16 @@  typedef struct CompactChildCB {
 typedef struct AIOStoreCompactCB {
     CompactChildCB one_child;
     CompactChildCB *children;
-    int update_table;
     int num_children;
     int finished_children;
     struct FvdAIOCB *parent_acb;
     int ret;
-    int soft_write; /*true if the store is caused by copy-on-read or prefetch.*/
+    bool copy_on_read; /* true if store is caused by copy-on-read or prefetch.*/
     QEMUIOVector *orig_qiov;
+
+    /* A copy of the related fragment of s->table. Used for copy-on-write on
+     * chunks shared with snapshot. */
+    uint32_t *shared_table_copy;
 } AIOStoreCompactCB;
 
 /* For loading data from a compact image. */
@@ -374,13 +388,6 @@  typedef struct AIOLoadCompactCB {
     QEMUIOVector *orig_qiov;
 } AIOLoadCompactCB;
 
-typedef struct AIOFlushCB {
-    BlockDriverAIOCB *data_acb;
-    BlockDriverAIOCB *metadata_acb;
-    int num_finished;
-    int ret;
-} AIOFlushCB;
-
 typedef struct AIOCleanJournalBufCB {
     uint8_t *buf;
 } AIOCleanJournalBufCB;
@@ -390,7 +397,7 @@  typedef struct AIOWrapperCB {
 } AIOWrapperCB;
 
 typedef enum { OP_READ = 1, OP_WRITE, OP_COPY, OP_STORE_COMPACT,
-    OP_LOAD_COMPACT, OP_WRAPPER, OP_FLUSH, OP_BJNL_BUF_WRITE, OP_BJNL_FLUSH
+    OP_LOAD_COMPACT, OP_WRAPPER, OP_BJNL_BUF_WRITE, OP_BJNL_FLUSH
 } op_type;
 
 #ifdef FVD_DEBUG
@@ -421,7 +428,6 @@  typedef struct FvdAIOCB {
         AIOCopyCB copy;
         AIOLoadCompactCB load;
         AIOStoreCompactCB store;
-        AIOFlushCB flush;
     };
 
 #ifdef FVD_DEBUG
@@ -437,6 +443,24 @@  static BlockDriver bdrv_fvd;
 static QEMUOptionParameter fvd_create_options[];
 static QEMUOptionParameter fvd_update_options[];
 
+/* See QEMUSnapshotInfo. */
+typedef struct __attribute__ ((__packed__)) FvdSnapshot {
+    char id_str[128];
+    char name[256];
+    uint32_t date_sec;             /* UTC date of the snapshot */
+    uint32_t date_nsec;
+
+    /* The following fields are aligned on uint64_t boundary. */
+    uint64_t vm_state_actual_size; /* in bytes. Actual size of VM state. */
+    uint64_t vm_clock_nsec;        /* VM clock relative to boot */
+    uint64_t bitmap_offset;
+    uint64_t bitmap_size;
+    uint64_t table_offset;
+    uint64_t table_size;
+    uint64_t vm_state_offset;
+    uint64_t vm_state_space_size;  /* in bytes. Storage space for VM state. */
+} FvdSnapshot;
+
 /* Function prototypes. */
 static int fvd_create(const char *filename, QEMUOptionParameter * options);
 static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename);
@@ -469,8 +493,8 @@  static int fvd_has_zero_init(BlockDriverState * bs);
 #define PREFETCH_MAX_WRITE_THROUGHPUT           1000000000L /* KB/s */
 #define PREFETCH_PERF_CALC_ALPHA                0.8
 #define MAX_OUTSTANDING_COPY_ON_READ_DATA       2000000     /* bytes */
-#define MODERATE_BITMAP_SIZE                    4194304L    /* bytes */
 #define CHUNK_SIZE                              1048576LL   /* bytes */
+#define FCB_SIZE                                1048576LL   /* bytes */
 #define JOURNAL_SIZE                            16777216LL  /* bytes */
 #define STORAGE_GROW_UNIT                       104857600LL /* bytes */
 #define JOURNAL_BUF_SIZE                        (64*1024)  /* bytes */
@@ -486,25 +510,29 @@  static int fvd_has_zero_init(BlockDriverState * bs);
 #define ROUND_UP(x, base)       ((((x)+(base)-1) / (base)) * (base))
 #define ROUND_DOWN(x, base)     ((((x) / (base)) * (base)))
 #define BOOL(x)                 ((x) ? "true" : "false")
-#define EMPTY_TABLE             ((uint32_t)0xFFFFFFFF)
 #define DIRTY_TABLE             ((uint32_t)0x80000000)
-#define READ_TABLE(entry)       (le32_to_cpu(entry) & ~DIRTY_TABLE)
-# define FVDAIOCB_MAGIC         ((uint64_t)0x3A8FCE89325B976DULL)
-# define FVD_ALLOC_MAGIC        ((uint64_t)0x4A7dCEF9925B976DULL)
+#define SHARED_TABLE            ((uint32_t)0x40000000)
+#define READ_TABLE(entry)       (le32_to_cpu(entry)&~DIRTY_TABLE&~SHARED_TABLE)
+#define EMPTY_TABLE             0
 #define IS_EMPTY(entry)         ((entry) == EMPTY_TABLE)
 #define IS_DIRTY(entry)         (le32_to_cpu(entry) & DIRTY_TABLE)
+#define IS_SHARED(entry)        (le32_to_cpu(entry) & SHARED_TABLE)
 #define WRITE_TABLE(entry,id)   ((entry) = cpu_to_le32(id))
-#define READ_TABLE2(entry) \
-    ((entry)==EMPTY_TABLE ? EMPTY_TABLE : (le32_to_cpu(entry) & ~DIRTY_TABLE))
+#define FVDAIOCB_MAGIC          ((uint64_t)0x3A8FCE89325B976DULL)
+#define FVD_ALLOC_MAGIC         ((uint64_t)0x4A7dCEF9925B976DULL)
+#define INVALID_POINTER         ((void*)(-1L))
 
 #define CLEAN_DIRTY(entry) \
-    do {  \
-        if (!IS_EMPTY(entry))  \
-            entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);  \
+    do { \
+        entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);\
+    } while (0)
+
+#define CLEAN_SHARED(entry) \
+    do { \
+        entry = cpu_to_le32(le32_to_cpu(entry) & ~SHARED_TABLE);\
     } while (0)
 
-#define CLEAN_DIRTY2(entry) \
+#define CLEAN_DIRTY_AND_SHARED(entry) \
     do { \
-        ASSERT(!IS_EMPTY(entry)); \
-        entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);  \
+        entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE & ~SHARED_TABLE);\
     } while (0)
diff --git a/bswap.h b/bswap.h
index 82a7951..c896b13 100644
--- a/bswap.h
+++ b/bswap.h
@@ -138,8 +138,10 @@  CPU_CONVERT(le, 64, uint64_t)
 
 #define cpu_to_le16wu(p, v) cpu_to_le16w(p, v)
 #define cpu_to_le32wu(p, v) cpu_to_le32w(p, v)
+#define cpu_to_le64wu(p, v) cpu_to_le64w(p, v)
 #define le16_to_cpupu(p) le16_to_cpup(p)
 #define le32_to_cpupu(p) le32_to_cpup(p)
+#define le64_to_cpupu(p) le64_to_cpup(p)
 #define be32_to_cpupu(p) be32_to_cpup(p)
 
 #define cpu_to_be16wu(p, v) cpu_to_be16w(p, v)
@@ -166,6 +168,20 @@  static inline void cpu_to_le32wu(uint32_t *p, uint32_t v)
     p1[3] = v >> 24;
 }
 
+static inline void cpu_to_le64wu(uint64_t *p, uint64_t v)
+{
+    uint8_t *p1 = (uint8_t *)p;
+
+    p1[0] = v & 0xff;
+    p1[1] = v >> 8;
+    p1[2] = v >> 16;
+    p1[3] = v >> 24;
+    p1[4] = v >> 32;
+    p1[5] = v >> 40;
+    p1[6] = v >> 48;
+    p1[7] = v >> 56;
+}
+
 static inline uint16_t le16_to_cpupu(const uint16_t *p)
 {
     const uint8_t *p1 = (const uint8_t *)p;
@@ -178,6 +194,17 @@  static inline uint32_t le32_to_cpupu(const uint32_t *p)
     return p1[0] | (p1[1] << 8) | (p1[2] << 16) | (p1[3] << 24);
 }
 
+/* Read a little-endian 64-bit value one byte at a time. Every byte is widened
+ * to uint64_t before shifting; an int-typed (p1[3] << 24) would sign-extend. */
+static inline uint64_t le64_to_cpupu(const uint64_t *p)
+{
+    const uint8_t *p1 = (const uint8_t *)p;
+    return (uint64_t)p1[0] | ((uint64_t)p1[1] << 8) |
+           ((uint64_t)p1[2] << 16) | ((uint64_t)p1[3] << 24) |
+           ((uint64_t)p1[4] << 32) | ((uint64_t)p1[5] << 40) |
+           ((uint64_t)p1[6] << 48) | ((uint64_t)p1[7] << 56);
+}
+
 static inline uint32_t be32_to_cpupu(const uint32_t *p)
 {
     const uint8_t *p1 = (const uint8_t *)p;
diff --git a/qemu-img.c b/qemu-img.c
index 215e7b9..f381013 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1052,11 +1052,11 @@  static int img_info(int argc, char **argv)
                backing_filename,
                backing_filename2);
     }
+    dump_snapshots(bs);
     if (bdrv_get_info(bs, &bdi) >= 0) {
         if (bdi.cluster_size != 0)
             printf("cluster_size: %d\n", bdi.cluster_size);
     }
-    dump_snapshots(bs);
     bdrv_delete(bs);
     return 0;
 }
diff --git a/qemu-io-auto.c b/qemu-io-auto.c
index 67c84f8..25b74c3 100644
--- a/qemu-io-auto.c
+++ b/qemu-io-auto.c
@@ -122,10 +122,10 @@  static void auto_test_usage(void)
 
 static int truth_io(void *buf, int64_t sector_num, int nb_sectors, int do_read)
 {
-    off_t offset = sector_num * 512;
+    off64_t offset = sector_num * 512;
     size_t size = nb_sectors * 512;
 
-    if (lseek(fd, offset, SEEK_SET) < 0) {
+    if (lseek64(fd, offset, SEEK_SET) < 0) {
         die("lseek\n");
     }
 
@@ -597,12 +597,13 @@  static void perform_test(const char *truth_file, const char *test_file,
         open_test_file(format, test_file, BDRV_O_RDWR | cache_flag);
     }
 
-    fd = open(truth_file, O_RDWR | O_LARGEFILE, 0);
+    int flag = round > 0 ? O_RDWR : O_RDONLY;
+    fd = open(truth_file, flag | O_LARGEFILE, 0);
     if (fd < 0) {
         die("Failed to open '%s'\n", truth_file);
     }
 
-    int64_t l0 = lseek(fd, 0, SEEK_END);
+    int64_t l0 = lseek64(fd, (off64_t)0, SEEK_END);
     int64_t l1 = bdrv_getlength(bs);
     if (l0 < 0 || l1 < 0 || l0 < l1) {
         die("Mismatch: truth image %s length %"PRId64", test image %s "
diff --git a/qemu-option.c b/qemu-option.c
index 28b19b5..c5275c5 100644
--- a/qemu-option.c
+++ b/qemu-option.c
@@ -399,15 +399,19 @@  QEMUOptionParameter *append_one_option_parameter(QEMUOptionParameter *dest,
     QEMUOptionParameter *param)
 {
     QEMUOptionParameter *target;
-    if ((target = get_option_parameter(dest, param->name))) {
-        *target = *param;
-    } else {
+
+    if ((target = get_option_parameter(dest, param->name)) == NULL) {
         size_t n = count_option_parameters(dest);
         dest = qemu_realloc(dest, (n + 2) * sizeof(QEMUOptionParameter));
-        dest[n] = *param;
+        target = dest + n;
         dest[n + 1].name = NULL;
     }
 
+    *target = *param;
+    if (param->type == OPT_STRING) {
+        target->value.s = qemu_strdup(param->value.s);
+    }
+
     return dest;
 }
 
diff --git a/test-fvd.sh b/test-fvd.sh
index 3d67c3f..4fa1e97 100755
--- a/test-fvd.sh
+++ b/test-fvd.sh
@@ -31,25 +31,33 @@  fi
 
 DATA_DIR=/var/ramdisk
 TRUTH_IMG=$DATA_DIR/truth.raw
+TRUTH_IMG_SNAPSHOT=$DATA_DIR/truth.raw-snapshot
 TEST_IMG=$DATA_DIR/test.fvd
 TEST_BASE=$DATA_DIR/zero-500M.raw
-TEST_IMG_DATA=$DATA_DIR/test.dat
 CMD_LOG=./test-fvd.log
 
-G1=1073741824
+DEF_IMG_SIZE=104857600
 MAX_MEM=536870912
-MAX_ROUND=1000000
+MAX_ROUND=100000
 MAX_IO_SIZE=100000000
 fail_prob=0.1
 cancel_prob=0.1
 flush_prob_base=0.05
 aio_flush_prob_base=0.1
+snapshot_create_prob=0.5
+snapshot_goto_prob=0.5
+snapshot_record_prob=0.5
+snapshot_del_prob=0.1
 seed=$RANDOM$RANDOM
 count=0
 
+DEF_IMG_SIZE_HALF=$[$DEF_IMG_SIZE / 2]
+DEF_IMG_SIZE_TENTH=$[$DEF_IMG_SIZE / 10]
+
 function invoke() {
     echo "$*" >> $CMD_LOG
     sync
+    echo $*
     $*
     ret=$?
     if [ $ret -ne 0 ]; then
@@ -58,31 +66,41 @@  function invoke() {
     fi
 }
 
+# Make a random decision
+function rand_toss_coin()
+{
+    prob=$1
+    if [ $prob == "0" ]; then return 0 ; fi
+    N=10000
+    r=$[1$RANDOM$RANDOM % $N]
+    c=`echo "scale=10; ($r / $N) < $prob" | bc`
+    return $c
+}
+
 mount | grep $DATA_DIR > /dev/null
 if [ $? -ne 0 ]; then
     echo "Create tmpfs at $DATA_DIR to store testing images."
     if [ ! -e $DATA_DIR ]; then mkdir -p $DATA_DIR ; fi
-    invoke "mount -t tmpfs none $DATA_DIR -o size=4G"
+    invoke "mount -t tmpfs none $DATA_DIR -o size=16G"
     if [ $? -ne 0 ]; then exit 1; fi
 fi
 
-/bin/rm -f $CMD_LOG $DATA_DIR/*
+/bin/mv -f $CMD_LOG $CMD_LOG.old
+/bin/rm -f $DATA_DIR/*
 touch $CMD_LOG
 
 while [ -t ]; do
     for block_size in 7680 512 1024 15872 65536 65024 1048576 1048064; do
     for chunk_mult in 5 1 2 3 7 9 12 16 33 99 ; do
     for cache in writeback writethrough ; do
-    #for compact_image in on off ; do
-    for compact_image in on ; do
-    for prefetch_delay in 1 0; do
-    for copy_on_read in on off; do
+    for raw_layout in off on; do
+    for prefetch_delay in 0 1 ; do
+    for copy_on_read in off on ; do
     for base_img in "-b $TEST_BASE" "" ; do
         chunk_size=$[$block_size * $chunk_mult]
         large_io_size=$[$chunk_size * 5]
         if [ $large_io_size -gt $MAX_IO_SIZE ]; then large_io_size=$MAX_IO_SIZE; fi
     for io_size in $large_io_size 1048576 ; do
-    for use_data_file in "" "data_file=$TEST_IMG_DATA," ; do
 
     if [ cache == "writethrough" ]; then
         JOURNAL_BUF_SIZE=0
@@ -96,17 +114,15 @@  while [ -t ]; do
     for journal_clean_buf_period in $JOURNAL_CLEAN_BUF_PERIOD ; do
         /bin/rm -rf /tmp/fvd.log*
 
-        # FVD image is about 1G
-        img_size=$[(1073741824 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+        # FVD image size
+        img_size=$[($DEF_IMG_SIZE + (1$RANDOM$RANDOM$RANDOM % $DEF_IMG_SIZE_TENTH)) / 512 * 512]
 
-        # base image is about 500MB
-        base_size=$[(536870912 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+        # base image size
+        base_size=$[($DEF_IMG_SIZE_HALF + (1$RANDOM$RANDOM$RANDOM % $DEF_IMG_SIZE_TENTH)) / 512 * 512]
 
         count=$[$count + 1]
         echo "Round $count" >> $CMD_LOG
-
-        invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG $TEST_BASE $TEST_IMG_DATA"
-
+        invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG $TEST_BASE $TRUTH_IMG_SNAPSHOT"
         if [ -z "$base_img" ]; then
             # Use zero-filled empty images.
             invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"
@@ -117,8 +133,6 @@  while [ -t ]; do
             invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"
         fi
 
-        if [ ! -z $use_data_file ]; then invoke "touch $TEST_IMG_DATA"; fi
-
         # Ensure the journal is large enough to hold at least one write.
         mixed_records_per_journal_sector=119
         if [ cache == "writethrough" ]; then
@@ -126,36 +140,77 @@  while [ -t ]; do
         else
             journal_size_factor=100
         fi
-        journal_size=$[(((($io_size / $chunk_size ) + 1 ) / $mixed_records_per_journal_sector ) + 1) * 512 * (1 + $RANDOM$RANDOM % $journal_size_factor) ]
+        journal_size=$[(((($io_size / $chunk_size ) + 1 ) / $mixed_records_per_journal_sector ) + 1) * 512 * (1 + 1$RANDOM$RANDOM % $journal_size_factor) ]
 
-        invoke "$QEMU_IMG create -f fvd $base_img -ojournal_buf_size=$journal_buf_size,journal_clean_buf_period=$journal_clean_buf_period,${use_data_file}data_file_fmt=blksim,backing_fmt=blksim,compact_image=$compact_image,copy_on_read=$copy_on_read,block_size=$block_size,chunk_size=$chunk_size,journal_size=$journal_size,prefetch_start_delay=$prefetch_delay $TEST_IMG $img_size"
+        # Create the image.
+        invoke "$QEMU_IMG create -f fvd $base_img -ojournal_buf_size=$journal_buf_size,journal_clean_buf_period=$journal_clean_buf_period,backing_fmt=blksim,raw_layout=$raw_layout,copy_on_read=$copy_on_read,block_size=$block_size,chunk_size=$chunk_size,journal_size=$journal_size,prefetch_start_delay=$prefetch_delay $TEST_IMG $img_size"
         invoke "$QEMU_IMG update -oinit_data_region=on $TEST_IMG"
         if [ $prefetch_delay -eq 1 ]; then invoke "$QEMU_IMG update -f fvd -oprefetch_over_threshold_throttle_time=0 $TEST_IMG" ; fi
 
-        # Use no more 1GB memory.
-        mem=$[$io_size * 1000]
+        parallel=100
+        mem=$[$io_size * $parallel]
         if [ $mem -gt $MAX_MEM ]; then
             parallel=$[$MAX_MEM / $io_size]
-        else
-            parallel=200
         fi
-        parallel=$[${RANDOM}${RANDOM} % $parallel]
+        parallel=$[1${RANDOM}${RANDOM} % $parallel + 1]
 
         flush_prob=`echo $flush_prob_base / $parallel | bc -l`
         aio_flush_prob=`echo $aio_flush_prob_base / $parallel | bc -l`
 
-        round=$[$G1 * 10 / $io_size]
+        round=$[$DEF_IMG_SIZE / $io_size]
         if [ $round -gt $MAX_ROUND ]; then round=$MAX_ROUND; fi
 
         b3=$[$round * 2 / 3]
         [ $b3 -eq 0 ] && b3=1
-        for rep in 0 1 2 3 4 5 6 7 8 ; do
+        snapshot=""
+
+        for rep in 0 1 2 3 4 5 6 7 8 9 10; do
             if [ $rep -eq 0 ]; then
                 compare_before=false
             else
                 compare_before=true
             fi
-            r=$[${RANDOM}${RANDOM} % $b3]
+
+            # Delete a random snapshot? (Taken when rand_toss_coin exits with 1.)
+            rand_toss_coin $snapshot_del_prob
+            if [ $? -eq 1 ]; then
+                $QEMU_IMG info $TEST_IMG | grep 00:00:00.000 > snapshots.txt
+                num_snapshots=`wc -l snapshots.txt | awk '{print $1}'`
+                # Pick a line 1..num_snapshots from the end; field 2 is used as the id.
+                if [ $num_snapshots -gt 0 ]; then
+                    sel=$[$num_snapshots - 1$RANDOM$RANDOM % $num_snapshots]
+                    id=`tail -n $sel snapshots.txt | head -n 1 | awk '{print $2}'`
+                    invoke "$QEMU_IMG snapshot -d $id $TEST_IMG"
+                    invoke "$QEMU_IMG check $TEST_IMG"
+                    if [ "$id" == "$snapshot" ]; then
+                        snapshot=""    # The recorded snapshot no longer exists.
+            fi ; fi ; fi
+
+            # Take a snapshot? (Taken when rand_toss_coin exits with 1.)
+            rand_toss_coin $snapshot_create_prob
+            if [ $? -eq 1 ]; then
+                name="test-$count-$seed"
+                invoke "$QEMU_IMG snapshot -c $name $TEST_IMG"
+                invoke "$QEMU_IMG check $TEST_IMG"
+
+                # Will go back to this snapshot later on? If so, remember its
+                # name and keep a copy of the truth image as it is right now.
+                rand_toss_coin $snapshot_record_prob
+                if [ $? -eq 1 ]; then
+                    snapshot=$name
+                    invoke "cp $TRUTH_IMG $TRUTH_IMG_SNAPSHOT"
+            fi ; fi
+
+            # Randomize the number of rounds to run.
+            r=$[1${RANDOM}${RANDOM} % $b3]
             seed=$[$seed + 1]
+
             invoke "$QEMU_IO --auto --cache=$cache --truth=$TRUTH_IMG --format=fvd --test="blksim:$TEST_IMG" --verify_write=true --parallel=$parallel --io_size=$io_size --fail_prob=$fail_prob --cancel_prob=$cancel_prob --aio_flush_prob=$aio_flush_prob --flush_prob=$flush_prob --compare_after=true --round=$r --compare_before=$compare_before --instant_qemubh=false --seed=$seed"
-done; done; done; done; done; done; done; done; done; done; done; done; done
+
+            # Go back to a previous snapshot? Only considered if one was recorded.
+            if [ -n "$snapshot" ]; then
+                rand_toss_coin $snapshot_goto_prob
+                if [ $? -eq 1 ]; then
+                    invoke "cp $TRUTH_IMG_SNAPSHOT $TRUTH_IMG"
+                    invoke "$QEMU_IMG snapshot -a $snapshot $TEST_IMG"
+            fi ; fi
+done; done; done; done; done; done; done; done; done; done; done; done
diff --git a/test-qcow2.sh b/test-qcow2.sh
index d1e4dc0..6d1df3b 100755
--- a/test-qcow2.sh
+++ b/test-qcow2.sh
@@ -72,10 +72,10 @@  for io_size in 1048576 ; do
     echo "Round $count" >> $CMD_LOG
 
     # QCOW2 image is about 1G
-    img_size=$[(1073741824 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+    img_size=$[(1073741824 + (1$RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
 
     # base image is about 500MB
-    base_size=$[(536870912 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+    base_size=$[(536870912 + (1$RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
 
     invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG $TEST_BASE"
     invoke "$QEMU_IO --auto --create=$TEST_BASE --seed=$seed --block_size=1048576 --empty_block_prob=0 --empty_block_chain=1 --file_size=$base_size"
diff --git a/test-vdi.sh b/test-vdi.sh
index b0bfe65..116b72a 100755
--- a/test-vdi.sh
+++ b/test-vdi.sh
@@ -71,7 +71,7 @@  for io_size in 3145728; do
     echo "Round $count" >> $CMD_LOG
 
     # VDI image is about 100M
-    img_size=$[(104857600 + ($RANDOM$RANDOM$RANDOM % 10485760)) / 512 * 512]
+    img_size=$[(104857600 + (1$RANDOM$RANDOM$RANDOM % 10485760)) / 512 * 512]
 
     invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG"
     invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"