
[15/26] FVD: add basic journal functionality

Message ID 1298673486-3573-15-git-send-email-ctang@us.ibm.com
State New

Commit Message

Chunqiang Tang Feb. 25, 2011, 10:37 p.m. UTC
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.

This patch adds the basic journal functionality to FVD. The journal provides
several benefits. First, updating both the bitmap and the lookup table
requires only a single write to the journal. Second, K concurrent updates to any
portions of the bitmap or the lookup table are converted to K sequential writes
in the journal, which can be merged into a single write by the host Linux
kernel. Third, it increases concurrency by avoiding locking the bitmap or the
lookup table. For example, updating one bit in the bitmap requires writing a
512-byte sector to the on-disk bitmap. This bitmap sector covers a total of
512*8*64KB = 256MB of data, and any two writes to that same bitmap sector cannot
proceed concurrently. The journal solves this problem and eliminates
unnecessary locking.
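
To make the locking argument concrete, below is a small standalone sketch
(illustration only, not part of the patch) that works through the coverage
arithmetic and packs a single bitmap_update record into one 512-byte journal
sector. The struct and the example dirty range are hypothetical; the real
on-disk record layout is built with raw pointer arithmetic in
block/fvd-journal.c, and FVD's default 64KB block size is assumed here.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define SECTOR_SIZE  512          /* bytes per disk sector */
    #define BLOCK_SIZE   (64 * 1024)  /* data covered by one bitmap bit */

    /* Illustrative bitmap_update record (fields as described above). */
    struct bitmap_jrecord {
        uint32_t type;               /* BITMAP_JRECORD magic */
        uint32_t num_dirty_sectors;
        int64_t  dirty_sector_begin;
    };

    int main(void)
    {
        /* One on-disk bitmap sector holds 512*8 bits; each bit covers one
         * 64KB block, so the sector describes 512*8*64KB = 256MB of data.
         * Without a journal, any two writers touching that 256MB range
         * contend on the same 512-byte bitmap write. */
        long long coverage = (long long)SECTOR_SIZE * 8 * BLOCK_SIZE;
        printf("one bitmap sector covers %lld MB of data\n",
               coverage / (1024 * 1024));

        /* With the journal, each writer appends one small record instead;
         * the whole metadata change fits in a single 512-byte sector. */
        uint8_t journal_sector[SECTOR_SIZE] = { 0 };
        struct bitmap_jrecord rec = {
            .type = 0xEF2AB8ED,       /* BITMAP_JRECORD */
            .num_dirty_sectors = 128, /* e.g. a 64KB write = 128 sectors */
            .dirty_sector_begin = 2048,
        };
        memcpy(journal_sector, &rec, sizeof(rec));
        printf("record uses %zu of %d bytes in its journal sector\n",
               sizeof(rec), SECTOR_SIZE);
        return 0;
    }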

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block.c                 |    2 +-
 block/fvd-bitmap.c      |   57 ++++
 block/fvd-journal-buf.c |   34 ++
 block/fvd-journal.c     |  814 ++++++++++++++++++++++++++++++++++++++++++++++-
 block/fvd-write.c       |    1 +
 block/fvd.c             |   19 ++
 6 files changed, 920 insertions(+), 7 deletions(-)
 create mode 100644 block/fvd-journal-buf.c

Patch

diff --git a/block.c b/block.c
index f7d91a2..8b3083d 100644
--- a/block.c
+++ b/block.c
@@ -58,7 +58,7 @@  static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors);
 
-static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
+QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
diff --git a/block/fvd-bitmap.c b/block/fvd-bitmap.c
index 30e4a4b..06d7912 100644
--- a/block/fvd-bitmap.c
+++ b/block/fvd-bitmap.c
@@ -66,6 +66,63 @@  static inline void update_fresh_bitmap(int64_t sector_num, int nb_sectors,
     }
 }
 
+static void update_stale_bitmap(BDRVFvdState * s, int64_t sector_num,
+                                int nb_sectors)
+{
+    if (sector_num >= s->base_img_sectors) {
+        return;
+    }
+
+    int64_t end = sector_num + nb_sectors;
+    if (end > s->base_img_sectors) {
+        end = s->base_img_sectors;
+    }
+
+    int64_t block_num = sector_num / s->block_size;
+    const int64_t block_end = (end - 1) / s->block_size;
+
+    for (; block_num <= block_end; block_num++) {
+        int64_t bitmap_byte_offset = block_num / 8;
+        uint8_t bitmap_bit_offset = block_num % 8;
+        uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+        uint8_t b = s->stale_bitmap[bitmap_byte_offset];
+        if (!(b & mask)) {
+            ASSERT(s->stale_bitmap == s->fresh_bitmap ||
+                   (s->fresh_bitmap[bitmap_byte_offset] & mask));
+            b |= mask;
+            s->stale_bitmap[bitmap_byte_offset] = b;
+        }
+    }
+}
+
+static void update_both_bitmaps(BDRVFvdState * s, int64_t sector_num,
+                                int nb_sectors)
+{
+    if (sector_num >= s->base_img_sectors) {
+        return;
+    }
+
+    int64_t end = sector_num + nb_sectors;
+    if (end > s->base_img_sectors) {
+        end = s->base_img_sectors;
+    }
+
+    int64_t block_num = sector_num / s->block_size;
+    const int64_t block_end = (end - 1) / s->block_size;
+
+    for (; block_num <= block_end; block_num++) {
+        int64_t bitmap_byte_offset = block_num / 8;
+        uint8_t bitmap_bit_offset = block_num % 8;
+        uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+        uint8_t b = s->fresh_bitmap[bitmap_byte_offset];
+        if (!(b & mask)) {
+            b |= mask;
+            s->fresh_bitmap[bitmap_byte_offset] =
+                s->stale_bitmap[bitmap_byte_offset] = b;
+        }
+    }
+}
+
 static inline bool bitmap_show_sector_in_base_img(int64_t sector_num,
                                                   const BDRVFvdState * s,
                                                   int bitmap_offset,
diff --git a/block/fvd-journal-buf.c b/block/fvd-journal-buf.c
new file mode 100644
index 0000000..3efdd47
--- /dev/null
+++ b/block/fvd-journal-buf.c
@@ -0,0 +1,34 @@ 
+/*
+ * QEMU Fast Virtual Disk Format Metadata Journal
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*=============================================================================
+ * There are two different ways of writing metadata changes to the journal:
+ * immediate write or buffered write. If cache=writethrough, metadata changes
+ * are written to the journal immediately. If cache!=writethrough, metadata
+ * changes are buffered in memory and later written to the journal either
+ * triggered by bdrv_aio_flush() or by a timeout. This module implements the
+ * case for cache!=writethrough.
+ *============================================================================*/
+
+static uint8_t * bjnl_alloc_journal_records_from_buf(BlockDriverState *bs,
+                                                bool update_bitmap,
+                                                size_t record_size)
+{
+    return NULL;
+}
+
+
+static void bjnl_clean_buf_timer_cb(BlockDriverState * bs)
+{
+    /* To be implemented. */
+}
diff --git a/block/fvd-journal.c b/block/fvd-journal.c
index 2edfc70..11796b0 100644
--- a/block/fvd-journal.c
+++ b/block/fvd-journal.c
@@ -11,28 +11,830 @@ 
  *
  */
 
-#ifdef FVD_DEBUG
-static bool emulate_host_crash = true;
+/*=============================================================================
+ *  A short description: this FVD module implements a journal for committing
+ *  metadata changes. Each sector in the journal is self-contained so that
+ *  updates are atomic. A sector may contain one or multiple journal records.
+ *  There are two types of journal records: bitmap_update and table_update.
+ *   Format of a bitmap_update record:
+ *         BITMAP_JRECORD (uint32_t)
+ *         num_dirty_sectors (uint32_t)
+ *         dirty_sector_begin (int64_t)
+ *   Format of a table_update record:
+ *         TABLE_JRECORD (uint32_t)
+ *         journal_epoch (uint64_t)
+ *         num_dirty_table_entries (uint32_t)
+ *         dirty_table_begin (uint32_t)
+ *         table_entry_1 (uint32_t)
+ *         table_entry_2 (uint32_t)
+ *         ...
+ * If both the bitmap and the table need updating, one sector contains a
+ * TABLE_JRECORD and a BITMAP_JRECORD, and these two records cover the same
+ * range of virtual disk data so that the corresponding parts of the bitmap
+ * and the table are always updated in one atomic operation.
+ *
+ * There are two different ways of writing metadata changes to the journal:
+ * immediate write or buffered write. If cache=writethrough, metadata changes
+ * are written to the journal immediately. If cache!=writethrough, metadata
+ * changes are buffered in memory and later written to the journal either
+ * triggered by bdrv_aio_flush() or by a timeout.
+ *
+ * Immediate journal write is implemented in this file, see
+ * ujnl_write_metadata_to_journal_now(). Buffered journal write is implemented
+ * in fvd-journal-buf.c, see bjnl_store_metadata_change_in_buffer().
+ *============================================================================*/
+
+#define BITMAP_JRECORD               ((uint32_t)0xEF2AB8ED)
+#define TABLE_JRECORD                ((uint32_t)0xB4E6F7AC)
+#define EMPTY_JRECORD                ((uint32_t)0xA5A5A5A5)
+#define BITMAP_JRECORD_SIZE          (2 * sizeof(uint32_t) + sizeof(int64_t))
+#define TABLE_JRECORD_HDR_SIZE       (3 * sizeof(uint32_t) + sizeof(uint64_t))
+#define TABLE_JRECORDS_PER_SECTOR \
+                ((512 - TABLE_JRECORD_HDR_SIZE)/sizeof(uint32_t))
+
+/* One BITMAP_JRECORD plus this number of TABLE_JRECORD table entries can
+ * fit in one journal sector. */
+#define MIXED_JRECORDS_PER_SECTOR ((512 - TABLE_JRECORD_HDR_SIZE - \
+                                BITMAP_JRECORD_SIZE) / sizeof(uint32_t))
+
+#ifndef ENABLE_QDEBUG
+#  define PRINT_TABLE_JRECORD(type) do{}while(0)
 #else
-static bool emulate_host_crash = false;
+static void print_table_jrecord(uint32_t * type);
+#  define PRINT_TABLE_JRECORD print_table_jrecord
 #endif
 
+static int flush_metadata_to_disk (BlockDriverState * bs,
+                                   bool update_journal_epoch,
+                                   bool update_base_img_prefetched);
+static int64_t ujnl_allocate_journal_sectors(BlockDriverState * bs,
+                                             FvdAIOCB * acb, int nb_sectors);
+static void ujnl_write_metadata_to_journal_now(FvdAIOCB * acb,
+                                bool update_bitmap, uint8_t *buf,
+                                int64_t journal_sec, int nb_journal_sectors);
+static void bjnl_clean_buf_timer_cb(BlockDriverState * bs);
+static uint8_t * bjnl_alloc_journal_records_from_buf(BlockDriverState *bs,
+                                    bool update_bitmap, size_t record_size);
+
 static inline int64_t calc_min_journal_size(int64_t table_entries)
 {
-    return 512;
+    return (table_entries + MIXED_JRECORDS_PER_SECTOR - 1)
+        / MIXED_JRECORDS_PER_SECTOR * 512;
 }
 
 static int init_journal(int read_only, BlockDriverState * bs,
                         FvdHeader * header)
 {
-    return -ENOTSUP;
+    BDRVFvdState *s = bs->opaque;
+    s->journal_size = header->journal_size / 512;
+    s->journal_offset = header->journal_offset / 512;
+    s->journal_epoch = header->stable_journal_epoch + 1;
+    s->next_journal_sector = 0;
+    s->use_bjnl = false;
+    QLIST_INIT(&s->ujnl.wait4_recycle);
+    s->ujnl.active_writes = 0;
+
+    if (s->journal_size <= 0) {
+        if (!s->table && !s->fresh_bitmap) {
+            return 0;   /* No need to use the journal. */
+        }
+
+        if (!header->clean_shutdown) {
+            fprintf(stderr, "ERROR: the image may be corrupted because it was "
+                    "not shut down gracefully last\ntime and it does not use "
+                    "a journal. You may continue to use the image at your\n"
+                    "own risk by manually resetting the clean_shutdown flag "
+                    "in the image.\n\n");
+            s->dirty_image = true;
+            if (IN_QEMU_TOOL) {
+                return 0;       /* Allow qemu tools to use the image. */
+            } else {
+                /* Do not allow booting the VM until the clean_shutdown flag
+                 * is manually cleared. */
+                return -EINVAL;
+            }
+        }
+
+        QDEBUG("Journal is disabled\n");
+        return 0;
+    }
+
+    if (!read_only && !IN_QEMU_TOOL && s->fvd_metadata->enable_write_cache
+        && header->journal_buf_size > 0) {
+        s->use_bjnl = true;
+        QTAILQ_INIT(&s->bjnl.queued_bufs);
+        s->bjnl.buf = NULL;
+        s->bjnl.def_buf_size = header->journal_buf_size;
+        s->bjnl.clean_buf_period = header->journal_clean_buf_period;
+        s->bjnl.buf_contains_bitmap_update = false;
+        s->bjnl.clean_buf_timer = qemu_new_timer(rt_clock,
+                                    (QEMUTimerCB *)bjnl_clean_buf_timer_cb, bs);
+        s->bjnl.timer_scheduled = false;
+    }
+
+    if (header->clean_shutdown) {
+        QDEBUG("Journal is skipped as the VM was shut down gracefully "
+               "last time.\n");
+        return 0;
+    }
+
+    QDEBUG("Recover from the journal as the VM was not shut down gracefully "
+           "last time.\n");
+
+    uint8_t *journal = my_qemu_blockalign(s->fvd_metadata,
+                                          s->journal_size * 512);
+    int ret = bdrv_read(s->fvd_metadata, s->journal_offset,
+                        journal, s->journal_size);
+    if (ret < 0) {
+        my_qemu_vfree(journal);
+        fprintf(stderr, "Failed to read the journal (%" PRId64 " bytes)\n",
+                s->journal_size * 512);
+        return -EIO;
+    }
+
+    /* Go through every journal sector. */
+    uint64_t max_epoch = 0;
+    uint8_t *sector = journal;
+    uint8_t *journal_end = journal + s->journal_size * 512;
+    uint64_t *chunk_epoch = NULL;
+
+    if (header->table_offset > 0) {
+        int table_entries = ROUND_UP(header->virtual_disk_size,
+                                     header->chunk_size) / header->chunk_size;
+        chunk_epoch = my_qemu_mallocz(sizeof(uint64_t) * table_entries);
+    }
+
+    while (sector < journal_end) {
+        uint32_t *type = (uint32_t *) sector;   /* Journal record type. */
+        while ((uint8_t *) type < (sector + 512)) {
+            if (le32_to_cpu(*type) == BITMAP_JRECORD) {
+                uint32_t *nb_sectors = type + 1;
+                int64_t *sector_num = (int64_t *) (type + 2);
+                if (s->stale_bitmap) {
+                    update_both_bitmaps(s, le64_to_cpu(*sector_num),
+                                        le32_to_cpu(*nb_sectors));
+                    QDEBUG("JOURNAL: recover BITMAP_JRECORD sector_num=%"
+                           PRId64 " nb_sectors=%u\n",
+                           le64_to_cpu(*sector_num), le32_to_cpu(*nb_sectors));
+                }
+
+                /* First field of the next journal record. */
+                type = (uint32_t *) (sector_num + 1);
+            } else if (le32_to_cpu(*type) == TABLE_JRECORD) {
+                uint64_t *epoch = (uint64_t *) (type + 1);
+                uint32_t *count = (uint32_t *) (epoch + 1);
+                uint32_t *offset = count + 1;
+                uint32_t *content = offset + 1;
+                const uint32_t chunk = le32_to_cpu(*offset);
+                const uint64_t epo = le64_to_cpu(*epoch);
+                const uint32_t n = le32_to_cpu(*count);
+                uint32_t i;
+                QDEBUG("JOURNAL: recover TABLE_JRECORD epoch=%" PRIu64
+                       " chunk_start=%u " "nb_chunks=%u\n", epo, chunk, n);
+                for (i = 0; i < n; i++) {
+                    /* If a chunk can be mapped to different locations at
+                     * different times, e.g., due to defragmentation
+                     * activities that move chunks, the epoch number is used
+                     * to identify the last effective mapping of a chunk. */
+                    if (epo > header->stable_journal_epoch &&
+                        epo > chunk_epoch[chunk + i]) {
+                        chunk_epoch[chunk + i] = epo;
+                        s->table[chunk + i] = content[i];
+
+                        /* The dirty bit was not cleaned when the table entry
+                         * was saved in the journal. */
+                        CLEAN_DIRTY2(s->table[chunk + i]);
+                        QDEBUG("\tAccept mapping chunk %u to %u\n",
+                               chunk + i, READ_TABLE(content[i]));
+                    } else {
+                        QDEBUG("\tReject mapping chunk %u to %u\n",
+                               chunk + i, READ_TABLE(content[i]));
+                    }
+                }
+                type = content + n;     /* First field of the next record. */
+                if (epo > max_epoch) {
+                    max_epoch = epo;
+                }
+            } else {
+                /* End of valid records in this journal sector. */
+                ASSERT(le32_to_cpu(*type) == EMPTY_JRECORD);
+                break;
+            }
+        }
+
+        sector += 512;
+    }
+    my_qemu_vfree(journal);
+    if (chunk_epoch) {
+        my_qemu_free(chunk_epoch);
+    }
+
+    if (++max_epoch > s->journal_epoch) {
+        s->journal_epoch = max_epoch;
+    }
+    QDEBUG("JOURNAL: journal_epoch=%" PRIu64 "\n", s->journal_epoch);
+
+    if (!read_only) {
+        /* Write the recovered metadata. */
+        flush_metadata_to_disk(bs, true /*journal */ , false /*prefetch */ );
+    }
+
+    return 0;
 }
 
-static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap)
+/*
+ * This function first flushes in-memory metadata to disk and then recycles
+ * the used journal sectors. This operation could be made asynchronous to
+ * improve performance.  However, the overall performance
+ * improvement may be limited since recycling the journal happens very
+ * infrequently and updating on-disk metadata finishes quickly because of the
+ * small size of the metadata.
+ */
+static int recycle_journal(BlockDriverState * bs)
 {
+    BDRVFvdState *s = bs->opaque;
+    int ret;
+
+#ifdef ENABLE_QDEBUG
+    static int64_t recycle_count = 0;
+    QDEBUG("JOURNAL: start journal recycle %" PRId64 ".\n", recycle_count);
+    recycle_count++;
+    int64_t begin_time = qemu_get_clock(rt_clock);
+#endif
+
+    ret = flush_metadata_to_disk(bs, true /*journal */ , false /*prefetch */);
+    if (ret == 0) {
+        s->next_journal_sector = 0;
+    }
+
+#ifdef ENABLE_QDEBUG
+    int64_t end_time = qemu_get_clock(rt_clock);
+    QDEBUG("JOURNAL: journal recycle took %" PRId64 " ms.\n",
+           (end_time - begin_time));
+#endif
+
+    return ret;
+}
+
+static void write_metadata_to_journal_cb(void *opaque, int ret)
+{
+    FvdAIOCB *acb = (FvdAIOCB *) opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    if (acb->cancel_in_progress) {
+        return;
+    }
+
+    if (ret == 0) {
+        QDEBUG("JOURNAL: acb%llu-%p  write_metadata_to_journal_cb\n",
+               acb->uuid, acb);
+
+        if (s->table) {
+            /* Update the table. */
+            int i;
+            const uint32_t first_chunk = acb->sector_num / s->chunk_size;
+            const uint32_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+                / s->chunk_size;
+            for (i = first_chunk; i <= last_chunk; i++) {
+                CLEAN_DIRTY2(s->table[i]);
+            }
+        }
+
+        if (s->stale_bitmap) {
+            /* If fresh_bitmap differs from stale_bitmap, fresh_bitmap has
+             * already been updated in write_data_cb() when invoking
+             * update_fresh_bitmap_and_check_stale_bitmap(). */
+            update_stale_bitmap(s, acb->sector_num, acb->nb_sectors);
+        }
+    } else {
+        QDEBUG("JOURNAL: acb%llu-%p  write_metadata_to_journal_cb err ret=%d\n",
+               acb->uuid, acb, ret);
+    }
+
+    /* Clean up. */
+    if (acb->type == OP_STORE_COMPACT) {
+        acb->common.cb(acb->common.opaque, ret);
+        if (acb->jcb.iov.iov_base != NULL) {
+            my_qemu_vfree(acb->jcb.iov.iov_base);
+        }
+        my_qemu_aio_release(acb);
+    } else {
+        ASSERT(acb->type == OP_WRITE);
+        finish_write(acb, ret);
+    }
+
+    if (!s->use_bjnl) {
+        ujnl_free_journal_sectors(bs);
+    }
+}
+
+static inline uint8_t * alloc_journal_records(FvdAIOCB *acb,
+                                              bool update_bitmap,
+                                              size_t buf_size,
+                                              int64_t *journal_sec)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    if (s->use_bjnl) {
+        /* Use buffered journal update. */
+        return bjnl_alloc_journal_records_from_buf(bs, update_bitmap, buf_size);
+    }
+
+    /* Allocate journal sectors for unbuffered journal update. */
+    size_t nb_sectors = (buf_size + 511) / 512;
+    *journal_sec = ujnl_allocate_journal_sectors(bs, acb, nb_sectors);
+    if (*journal_sec < 0) {
+        /* No journal sector is available now. The request will be woken up
+         * later in ujnl_free_journal_sectors(). */
+        return NULL;
+    }
+
+    uint8_t *buf = my_qemu_blockalign(s->fvd_metadata, 512 * nb_sectors);
+    if (buf_size % 512 != 0) {
+        *((uint32_t*)(buf + buf_size)) = EMPTY_JRECORD; /* Mark buffer end. */
+    }
+    return buf;
+}
+
+static uint8_t * create_journal_records(FvdAIOCB * acb,
+                                        bool update_bitmap,
+                                        int64_t *p_journal_sec,
+                                        size_t *p_buf_size)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+    uint8_t *buf;
+    int64_t journal_sec = -1;
+    size_t buf_size;
+
+    if (update_bitmap && !s->table) {
+        /* Only update the bitmap. */
+        buf_size = BITMAP_JRECORD_SIZE;
+        buf = alloc_journal_records(acb, update_bitmap, buf_size, &journal_sec);
+        if (!buf) {
+            return NULL; /* Wake up later in ujnl_free_journal_sectors(). */
+        }
+
+        uint32_t *type = (uint32_t *)buf; /*BITMAP_JRECORD*/
+        uint32_t *nb_sectors = type + 1;
+        int64_t *sector_num = (int64_t *) (type + 2);
+        *type = cpu_to_le32(BITMAP_JRECORD);
+        *nb_sectors = cpu_to_le32((uint32_t) acb->nb_sectors);
+        *sector_num = cpu_to_le64(acb->sector_num);
+        QDEBUG("JOURNAL: record BITMAP_JRECORD sector_num=%" PRId64
+               " nb_sectors=%u\n", acb->sector_num, acb->nb_sectors);
+
+    } else if (!update_bitmap) {
+        /* Only update the table. */
+
+        const int64_t first_chunk = acb->sector_num / s->chunk_size;
+        const int64_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+            / s->chunk_size;
+        int num_chunks = last_chunk - first_chunk + 1;
+
+        /* Buf space for complete journal sectors. */
+        buf_size = (num_chunks / TABLE_JRECORDS_PER_SECTOR) * 512;
+
+        /* Buf space for the last partial journal sectors. */
+        int rc = num_chunks % TABLE_JRECORDS_PER_SECTOR;
+        if (rc > 0) {
+            buf_size += TABLE_JRECORD_HDR_SIZE + sizeof(uint32_t) * rc;
+        }
+
+        buf = alloc_journal_records(acb, update_bitmap, buf_size, &journal_sec);
+        if (!buf) {
+            return NULL; /* Wake up later in ujnl_free_journal_sectors(). */
+        }
+
+        uint32_t *type = (uint32_t *)buf; /* TABLE_JRECORD */
+        int64_t chunk = first_chunk;
+
+        while (1) {
+            /* Start a new journal sector. */
+            uint64_t *epoch = (uint64_t *) (type + 1);
+            uint32_t *count = (uint32_t *) (epoch + 1);
+            uint32_t *offset = count + 1;
+            uint32_t *content = offset + 1;
+
+            *type = cpu_to_le32(TABLE_JRECORD);
+            *offset = cpu_to_le32(chunk);
+            *epoch = cpu_to_le64(s->journal_epoch);
+            s->journal_epoch++;
+            if (num_chunks <= TABLE_JRECORDS_PER_SECTOR) {
+                /* This is the last journal sector. */
+                *count = cpu_to_le32(num_chunks);
+                memcpy(content, &s->table[chunk],
+                       sizeof(uint32_t) * num_chunks);
+                PRINT_TABLE_JRECORD(type);
+                break;
+            }
+
+            *count = cpu_to_le32(TABLE_JRECORDS_PER_SECTOR);
+            memcpy(content, &s->table[chunk],
+                   sizeof(uint32_t) * TABLE_JRECORDS_PER_SECTOR);
+            chunk += TABLE_JRECORDS_PER_SECTOR;
+            num_chunks -= TABLE_JRECORDS_PER_SECTOR;
+            PRINT_TABLE_JRECORD(type);
+
+            /* First field of the next TABLE_JRECORD. */
+            type = content + TABLE_JRECORDS_PER_SECTOR;
+        }
+    } else {
+        /* Update both the table and the bitmap. It may use multiple journal
+         * sectors. Each sector is self-contained, including a TABLE_JRECORD
+         * and a BITMAP_JRECORD. The two records on the same sector cover the
+         * same range of virtual disk data.  The purpose is to update the
+         * corresponding parts of the bitmap and the table in one atomic
+         * operation. */
+        const int64_t first_chunk = acb->sector_num / s->chunk_size;
+        const int64_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+            / s->chunk_size;
+        int num_chunks = last_chunk - first_chunk + 1;
+
+        /* Buf space for complete journal sectors. */
+        buf_size = (num_chunks / MIXED_JRECORDS_PER_SECTOR) * 512;
+
+        /* Buf space for the last partial journal sectors. */
+        int rc = num_chunks % MIXED_JRECORDS_PER_SECTOR;
+        if (rc > 0) {
+            buf_size += BITMAP_JRECORD_SIZE + TABLE_JRECORD_HDR_SIZE
+                        + sizeof(uint32_t) * rc;
+        }
+
+        buf = alloc_journal_records(acb, update_bitmap, buf_size, &journal_sec);
+        if (!buf) {
+            return NULL; /* Wake up later in ujnl_free_journal_sectors(). */
+        }
+
+        uint32_t *type = (uint32_t *)buf; /*TABLE_JRECORD*/
+        int64_t chunk = first_chunk;
+        int64_t sector_num = acb->sector_num;
+        uint32_t nb_sectors;
+
+        /* Determine the number of data sectors whose bitmap change fits in
+         * the first journal sector. */
+        if (buf_size <= 512) {
+            nb_sectors = acb->nb_sectors; /* All fit in one journal sector. */
+        } else {
+            nb_sectors = (first_chunk + MIXED_JRECORDS_PER_SECTOR)
+                * s->chunk_size - acb->sector_num;
+        }
+
+        while (1) {
+            /* Start a new journal sector. */
+            uint64_t *epoch = (uint64_t *) (type + 1);
+            uint32_t *count = (uint32_t *) (epoch + 1);
+            uint32_t *offset = count + 1;
+            uint32_t *content = offset + 1;
+
+            *type = cpu_to_le32(TABLE_JRECORD);
+            *offset = cpu_to_le32(chunk);
+            *epoch = cpu_to_le64(s->journal_epoch);
+            s->journal_epoch++;
+
+            if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
+                /* This is the last journal sector. */
+                *count = cpu_to_le32(num_chunks);
+                memcpy(content, &s->table[chunk],
+                       sizeof(uint32_t) * num_chunks);
+                PRINT_TABLE_JRECORD(type);
+
+                /* A BITMAP_JRECORD follows a TABLE_JRECORD so that they are
+                 * updated in one atomic operation. */
+                type = content + num_chunks;    /* BITMAP_JRECORD. */
+                uint32_t *p_nb_sectors = type + 1;
+                int64_t *p_sector_num = (int64_t *) (type + 2);
+                *type = cpu_to_le32(BITMAP_JRECORD);
+                *p_nb_sectors = cpu_to_le32(nb_sectors);
+                *p_sector_num = cpu_to_le64(sector_num);
+                QDEBUG("JOURNAL: record BITMAP_JRECORD sector_num=%" PRId64
+                       " nb_sectors=%u\n", sector_num, nb_sectors);
+                break;
+            }
+
+            *count = cpu_to_le32(MIXED_JRECORDS_PER_SECTOR);
+            memcpy(content, &s->table[chunk],
+                   sizeof(uint32_t) * MIXED_JRECORDS_PER_SECTOR);
+            PRINT_TABLE_JRECORD(type);
+
+            /* A BITMAP_JRECORD follows a TABLE_JRECORD so that they are
+             * updated in one atomic operation. */
+            type = content + MIXED_JRECORDS_PER_SECTOR; /* BITMAP_JRECORD */
+            uint32_t *p_nb_sectors = type + 1;
+            int64_t *p_sector_num = (int64_t *) (type + 2);
+            *type = cpu_to_le32(BITMAP_JRECORD);
+            *p_nb_sectors = cpu_to_le32(nb_sectors);
+            *p_sector_num = cpu_to_le64(sector_num);
+            QDEBUG("JOURNAL: record BITMAP_JRECORD sector_num=%" PRId64
+                   " nb_sectors=%u\n", sector_num, nb_sectors);
+
+            /* Prepare for the next journal sector. */
+            type = (uint32_t *) (p_sector_num + 1);
+            chunk += MIXED_JRECORDS_PER_SECTOR;
+            sector_num = chunk * s->chunk_size;
+            num_chunks -= MIXED_JRECORDS_PER_SECTOR;
+            if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
+                /* Data sectors covered by the last journal sector. */
+                nb_sectors = (acb->sector_num + acb->nb_sectors)
+                    - chunk * s->chunk_size;
+            } else {
+                nb_sectors = s->chunk_size * MIXED_JRECORDS_PER_SECTOR;
+            }
+        }
+    }
+
+    if (p_journal_sec) {
+        *p_journal_sec = journal_sec;
+    }
+    if (p_buf_size) {
+        *p_buf_size = buf_size;
+    }
+    return buf;
+}
+
+static void write_metadata_to_journal(FvdAIOCB * acb, bool update_bitmap)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    ASSERT((s->table || s->fresh_bitmap)
+           && (!update_bitmap || (s->fresh_bitmap && acb->type == OP_WRITE))
+           && (update_bitmap || s->table)
+           && (acb->type == OP_WRITE || acb->type == OP_STORE_COMPACT));
+
+    if (acb->type == OP_WRITE) {
+        /* Save update_bitmap as it may be needed later if this request is
+         * queued. */
+        acb->write.update_bitmap = update_bitmap;
+    }
+
+    if (s->metadata_err_prohibit_write) {
+        write_metadata_to_journal_cb(acb, -EIO);  /* Fail the request now. */
+        return;
+    }
+
+    if (s->journal_size <= 0) {
+        write_metadata_to_journal_cb(acb, 0);     /* Journal is disabled. */
+        return;
+    }
+
+    int64_t journal_sec = -1;
+    size_t buf_size = -1;
+    uint8_t *buf = create_journal_records(acb, update_bitmap,
+                                          &journal_sec, &buf_size);
+
+    /* Depending on the cache mode, either write metadata changes to the
+     * journal immediately, or buffer them in memory first. */
+    if (s->use_bjnl) {
+        /* Done for now. The buffer will be written to the journal later. */
+        write_metadata_to_journal_cb(acb, 0);
+    } else if (buf) {
+        int nb_sectors = (buf_size + 511) / 512;
+        ujnl_write_metadata_to_journal_now(acb, update_bitmap, buf,
+                                           journal_sec, nb_sectors);
+    }
+}
+
+static int flush_metadata_to_disk(BlockDriverState * bs,
+                                  bool update_journal_epoch,
+                                  bool update_base_img_prefetched)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (bs->read_only) {
+        return 0;
+    }
+
+    /* Clean DIRTY_TABLE bit and write the table to disk. */
+    if (s->table) {
+        int i;
+        int table_entries = ROUND_UP(s->virtual_disk_size,
+                                     s->chunk_size * 512) / (s->chunk_size *
+                                                             512);
+        for (i = 0; i < table_entries; i++) {
+            CLEAN_DIRTY(s->table[i]);
+        }
+
+        int nb = (int)(s->table_size / 512);
+        QDEBUG("JOURNAL: flush table (%d sectors) to disk\n", nb);
+
+        if (bdrv_write(s->fvd_metadata, s->table_offset, (uint8_t *) s->table,
+                       nb) < 0) {
+            goto fail;
+        }
+    }
+
+    /* Write fresh_bitmap to disk. */
+    if (s->fresh_bitmap) {
+        /* Ensure copy-on-read and prefetching data are stable. */
+        if (bdrv_flush(s->fvd_data)) {
+            goto fail;
+        }
+
+        if (s->fvd_data != s->fvd_metadata && s->table) {
+            /* Ensure table is stable before updating bitmap. */
+            if (bdrv_flush(s->fvd_metadata)) {
+                goto fail;
+            }
+        }
+
+        int nb = (int)(s->bitmap_size / 512);
+        QDEBUG("JOURNAL: flush bitmap (%d sectors) to disk\n", nb);
+
+        if (bdrv_write(s->fvd_metadata, s->bitmap_offset,
+                       s->fresh_bitmap, nb) < 0) {
+            goto fail;
+        }
+        if (s->fresh_bitmap != s->stale_bitmap) {
+            memcpy(s->stale_bitmap, s->fresh_bitmap, s->bitmap_size);
+        }
+    }
+
+    if (update_journal_epoch || update_base_img_prefetched) {
+        /* Update the header. */
+        FvdHeader header;
+        if (read_fvd_header(s, &header)) {
+            goto fail;
+        }
+        if (update_base_img_prefetched) {
+            header.base_img_fully_prefetched = true;
+        }
+        if (update_journal_epoch) {
+            header.stable_journal_epoch = s->journal_epoch++;
+        }
+        if (update_fvd_header(s, &header)) {
+            goto fail;
+        }
+    }
+
+    /* Perform a final flush to ensure all metadata are stable. */
+    if (!bdrv_flush(s->fvd_metadata)) {
+        return 0;
+    }
+
+fail:
+    s->metadata_err_prohibit_write = true;
+    return -EIO;
+}
+
+#ifdef FVD_DEBUG
+static bool emulate_host_crash = true;
+#else
+static bool emulate_host_crash = false;
+#endif
+
+static void flush_metadata_to_disk_on_exit(BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (bs->read_only || !s->fvd_metadata) {
+        return;
+    }
+
+    /* If (emulate_host_crash==true), do not flush metadata to disk
+     * so that recovery has to rely on the journal. */
+    if (s->journal_size <= 0 || !emulate_host_crash) {
+        if (!flush_metadata_to_disk(bs, true, false) && !s->dirty_image) {
+            update_clean_shutdown_flag(s, true);
+        }
+    }
 }
 
 void fvd_emulate_host_crash(bool cond)
 {
     emulate_host_crash = cond;
 }
+
+#ifdef ENABLE_QDEBUG
+static void print_table_jrecord(uint32_t * type)
+{
+    int32_t i;
+    uint64_t *p_epoch = (uint64_t *) (type + 1);
+    uint32_t *p_count = (uint32_t *) (p_epoch + 1);
+    uint32_t *p_offset = p_count + 1;
+    uint32_t *content = p_offset + 1;
+
+    uint64_t epoch = le64_to_cpu(*p_epoch);
+    uint32_t count = le32_to_cpu(*p_count);
+    uint32_t offset = le32_to_cpu(*p_offset);
+
+    QDEBUG("JOURNAL: record TABLE_JRECORD epoch=%" PRIu64
+           " chunk_start=%u " "nb_chunks=%u\n", epoch, offset, count);
+    for (i = 0; i < count; i++) {
+        QDEBUG("\tMap chunk %u to %u\n", offset + i, READ_TABLE(content[i]));
+    }
+}
+#endif
+
+/* Only used for unbuffered journal update. */
+static void ujnl_write_metadata_to_journal_now(FvdAIOCB * acb,
+                                               bool update_bitmap,
+                                               uint8_t *buf,
+                                               int64_t journal_sec,
+                                               int nb_journal_sectors)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    QDEBUG("JOURNAL: acb%llu-%p  write_metadata_to_journal journal_sec=%"
+           PRId64 " nb_journal_sectors=%d\n", acb->uuid, acb, journal_sec,
+           nb_journal_sectors);
+
+    acb->jcb.iov.iov_base = buf;
+    acb->jcb.iov.iov_len = 512 * nb_journal_sectors;
+    qemu_iovec_init_external(&acb->jcb.qiov, &acb->jcb.iov, 1);
+    acb->jcb.hd_acb = bdrv_aio_writev(s->fvd_metadata,
+                                      s->journal_offset + journal_sec,
+                                      &acb->jcb.qiov, nb_journal_sectors,
+                                      write_metadata_to_journal_cb, acb);
+    if (!acb->jcb.hd_acb) {
+        write_metadata_to_journal_cb(acb, -1);
+    }
+}
+
+/* Only used for unbuffered journal update. */
+static void ujnl_free_journal_sectors(BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (s->journal_size <= 0) {
+        return;
+    }
+
+    s->ujnl.active_writes--;
+    ASSERT(s->ujnl.active_writes >= 0);
+
+    if (s->ujnl.active_writes > 0 || QLIST_EMPTY(&s->ujnl.wait4_recycle)) {
+        return;
+    }
+
+    /* Some requests are waiting for the journal to be recycled in order to
+     * get free journal sectors. */
+    recycle_journal(bs);
+
+    /* Restart requests in the ujnl.wait4_recycle list.  First make a copy of
+     * the head and then empty the head. */
+    FvdAIOCB *acb = QLIST_FIRST(&s->ujnl.wait4_recycle);
+    QLIST_INIT(&s->ujnl.wait4_recycle);
+    FvdAIOCB *next;
+
+    /* Restart all dependent requests. Cannot use QLIST_FOREACH here, because
+     * the next link might not be the same any more after the callback. */
+    while (acb) {
+        next = acb->jcb.ujnl_next_wait4_recycle.le_next;
+        acb->jcb.ujnl_next_wait4_recycle.le_prev = NULL;
+        QDEBUG("WRITE: acb%llu-%p  restart_write_metadata_to_journal "
+               "after recycle_journal\n", acb->uuid, acb);
+        if (acb->type == OP_WRITE) {
+            write_metadata_to_journal(acb, acb->write.update_bitmap);
+        } else {
+            write_metadata_to_journal(acb, false);
+        }
+        acb = next;
+    }
+}
+
+/* Only used for unbuffered journal update. */
+static int64_t ujnl_allocate_journal_sectors(BlockDriverState * bs,
+                                             FvdAIOCB * acb, int nb_sectors)
+{
+    BDRVFvdState *s = bs->opaque;
+    ASSERT(nb_sectors <= s->journal_size);
+
+    if (!QLIST_EMPTY(&s->ujnl.wait4_recycle)) {
+        /* Waiting for journal recycle to finish. */
+        ASSERT(s->ujnl.active_writes > 0);
+        QDEBUG("WRITE: acb%llu-%p  wait4_journal_recycle active_writes=%d\n",
+               acb->uuid, acb, s->ujnl.active_writes);
+        QLIST_INSERT_HEAD(&s->ujnl.wait4_recycle, acb,
+                          jcb.ujnl_next_wait4_recycle);
+        return -1;
+    }
+
+    int64_t journal_sec;
+    if (s->next_journal_sector + nb_sectors <= s->journal_size) {
+        journal_sec = s->next_journal_sector;
+        s->next_journal_sector += nb_sectors;
+        s->ujnl.active_writes++;
+        return journal_sec;
+    }
+
+    /* No free journal sector is available. Check if the journal can be
+     * recycled now. */
+    if (s->ujnl.active_writes == 0) {
+        recycle_journal(bs);
+        s->next_journal_sector = nb_sectors;
+        s->ujnl.active_writes = 1;
+        return 0; /* Use the first sector. */
+    }
+
+    /* Waiting for journal recycle to finish. The request will be woken up
+     * later in ujnl_free_journal_sectors(). */
+    QLIST_INSERT_HEAD(&s->ujnl.wait4_recycle, acb, jcb.ujnl_next_wait4_recycle);
+    QDEBUG("WRITE: acb%llu-%p  wait4_journal_recycle active_writes=%d\n",
+           acb->uuid, acb, s->ujnl.active_writes);
+    return -1;
+}
diff --git a/block/fvd-write.c b/block/fvd-write.c
index f0580d4..623ec83 100644
--- a/block/fvd-write.c
+++ b/block/fvd-write.c
@@ -15,6 +15,7 @@  static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap);
 static int do_aio_write(struct FvdAIOCB *acb);
 static void restart_dependent_writes(struct FvdAIOCB *acb);
 static void free_write_resource(struct FvdAIOCB *acb);
+static void ujnl_free_journal_sectors(BlockDriverState * bs);
 static inline BlockDriverAIOCB *store_data(int soft_write,
                 FvdAIOCB * parent_acb, BlockDriverState * bs,
                 int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
diff --git a/block/fvd.c b/block/fvd.c
index 74845e7..2402a94 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -37,6 +37,7 @@ 
 #include "block/fvd-store.c"
 #include "block/fvd-load.c"
 #include "block/fvd-journal.c"
+#include "block/fvd-journal-buf.c"
 #include "block/fvd-prefetch.c"
 #include "block/fvd-update.c"
 
@@ -65,3 +66,21 @@  static void bdrv_fvd_init(void)
 }
 
 block_init(bdrv_fvd_init);
+
+/* Since bdrv_close may not be properly invoked on a VM shutdown, we use a
+ * destructor to flush metadata to disk. This only affects performance and
+ * does not affect correctness.  See Section 3.3.4 of the FVD-cow paper for
+ * the rationale. */
+extern QTAILQ_HEAD(, BlockDriverState) bdrv_states;
+static void __attribute__ ((destructor)) flush_fvd_bitmap_to_disk(void)
+{
+    BlockDriverState *bs;
+    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+        if (bs->drv == &bdrv_fvd) {
+            flush_metadata_to_disk_on_exit(bs);
+#ifdef FVD_DEBUG
+            dump_resource_summary(bs->opaque);
+#endif
+        }
+    }
+}
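
As a quick aside (not part of the patch), the sector-packing constants in
fvd-journal.c above can be sanity-checked with a tiny standalone program; the
macros below simply mirror the definitions in the patch. With 4-byte table
entries, a table-only journal sector holds 123 entries and a mixed sector
holds 119 entries plus one bitmap_update record, both filling the 512-byte
sector exactly.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors of the constants defined in block/fvd-journal.c. */
    #define BITMAP_JRECORD_SIZE    (2 * sizeof(uint32_t) + sizeof(int64_t))
    #define TABLE_JRECORD_HDR_SIZE (3 * sizeof(uint32_t) + sizeof(uint64_t))
    #define TABLE_JRECORDS_PER_SECTOR \
        ((512 - TABLE_JRECORD_HDR_SIZE) / sizeof(uint32_t))
    #define MIXED_JRECORDS_PER_SECTOR \
        ((512 - TABLE_JRECORD_HDR_SIZE - BITMAP_JRECORD_SIZE) / sizeof(uint32_t))

    int main(void)
    {
        /* table_update only: 20-byte header + 123 entries * 4 bytes = 512. */
        assert(TABLE_JRECORD_HDR_SIZE
               + TABLE_JRECORDS_PER_SECTOR * sizeof(uint32_t) == 512);

        /* Mixed sector: 20-byte header + 119 entries * 4 bytes + one 16-byte
         * bitmap_update record = 512, so both records land in one atomic
         * sector write. */
        assert(TABLE_JRECORD_HDR_SIZE + BITMAP_JRECORD_SIZE
               + MIXED_JRECORDS_PER_SECTOR * sizeof(uint32_t) == 512);

        printf("table-only: %zu entries/sector, mixed: %zu entries/sector\n",
               TABLE_JRECORDS_PER_SECTOR, MIXED_JRECORDS_PER_SECTOR);
        return 0;
    }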