Patchwork [RFC,V8,03/24] qcow2: Add journal.

login
register
mail settings
Submitter Benoît Canet
Date June 20, 2013, 2:26 p.m.
Message ID <1371738392-9594-4-git-send-email-benoit@irqsave.net>
Download mbox | patch
Permalink /patch/252955/
State New
Headers show

Comments

Benoît Canet - June 20, 2013, 2:26 p.m.
This commit adds the code required to manage one or more journals in a qcow2 file.
The primary user of this journal will be qcow2-log-store.c.
The journal is asynchronous and requires its users to issue flushes in order
to make sure that entries have reached stable storage.

Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
 block/Makefile.objs   |    1 +
 block/qcow2-journal.c |  587 +++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.h         |   29 +++
 3 files changed, 617 insertions(+)
 create mode 100644 block/qcow2-journal.c

Patch

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 5f0358a..ee894b5 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,5 +1,6 @@ 
 block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-obj-y += qcow2-journal.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-y += vhdx.o
diff --git a/block/qcow2-journal.c b/block/qcow2-journal.c
new file mode 100644
index 0000000..693de37
--- /dev/null
+++ b/block/qcow2-journal.c
@@ -0,0 +1,587 @@ 
+/*
+ * QCOW2 journal
+ *
+ * Copyright (C) Nodalink, SARL. 2013
+ *
+ * Author:
+ *   Benoît Canet <benoit.canet@irqsave.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "block/qcow2.h"
+
+/* This function puts a journal back into its initial, not started state
+ *
+ * @journal: the journal to initialize
+ * @sector:  the on disk sector where the journal starts
+ * @size:    the size of the journal
+ */
+static void qcow2_journal_reset(QCowJournal *journal,
+                                uint64_t sector,
+                                uint64_t size)
+{
+    journal->sector = sector;
+    journal->size = size;
+    /* fill the write buffer and the read cache with QCOW_LOG_NONE */
+    memset(journal->write_buf, QCOW_LOG_NONE, JOURNAL_CLUSTER_SIZE * 2);
+    memset(journal->read_cache, QCOW_LOG_NONE, JOURNAL_CLUSTER_SIZE * 2);
+    /* appending restarts at the beginning of the first cluster */
+    journal->index = 0;
+    journal->offset_in_buf = 0;
+    journal->flushed = true;
+    /* so we will load the cache at first hit */
+    journal->read_index = -1;
+    /* the journal needs to be resumed */
+    journal->started = false;
+}
+
+/* This function sets the journal to the correct state after replaying it
+ *
+ * @bs:      the BlockDriverState, the journal is read from bs->file
+ * @journal: the journal to resume
+ * @offset:  the offset inside the journal
+ * @ret:     0 on success, -errno on error
+ */
+int qcow2_journal_resume(BlockDriverState *bs,
+                         QCowJournal *journal,
+                         uint64_t offset)
+{
+    uint64_t disk_offset;
+    int ret;
+
+    /* flush read cache and mark the journal as resumed */
+    journal->read_index = -1;
+    journal->started = true;
+
+    /* reset state */
+    journal->index = offset / JOURNAL_CLUSTER_SIZE;
+    journal->offset_in_buf = offset % JOURNAL_CLUSTER_SIZE;
+
+    /* compute on disk offset of the current cluster */
+    disk_offset = journal->sector * BDRV_SECTOR_SIZE +
+                  journal->index * JOURNAL_CLUSTER_SIZE;
+
+    /* read the current cluster in buffer; bdrv_pread() may report a byte
+     * count on success, so it is normalized to the documented 0 */
+    ret = bdrv_pread(bs->file, disk_offset, journal->write_buf,
+                     JOURNAL_CLUSTER_SIZE);
+
+    return ret < 0 ? ret : 0;
+}
+
+uint64_t qcow2_journal_round_to_erase_blocks(uint64_t size)
+{   /* always grows: an already aligned size gains one more erase block */
+    return (1 + size / SSD_ERASE_BLOCK_SIZE) * SSD_ERASE_BLOCK_SIZE;
+}
+
+/* This function resets a journal content to none
+ *
+ * To be called at the end of the incarnate/freeze coroutine
+ *
+ * @bs:           the BlockDriverState, the journal is written to bs->file
+ * @journal:      the QCowJournal to reset
+ * @in_coroutine: true if called from a coroutine
+ * @ret:          0 on success, -errno on error
+ */
+static int qcow2_journal_set_to_none(BlockDriverState *bs,
+                                     QCowJournal *journal,
+                                     bool in_coroutine)
+{
+    int ret = 0;
+    uint8_t *blank;
+    uint64_t blk, disk_offset;
+
+    /* one erase block filled with QCOW_LOG_NONE, reused for every write */
+    blank = qemu_blockalign(bs, SSD_ERASE_BLOCK_SIZE);
+    memset(blank, QCOW_LOG_NONE, SSD_ERASE_BLOCK_SIZE);
+
+    /* erase the journal one erase block at a time, stop at first failure */
+    for (blk = 0; blk < (journal->size / SSD_ERASE_BLOCK_SIZE); blk++) {
+        disk_offset = journal->sector * BDRV_SECTOR_SIZE +
+                      blk * SSD_ERASE_BLOCK_SIZE;
+        /* function will be called from the incarnate coroutine ->
+         * don't write on disk when vm is paused
+         */
+        if (in_coroutine) {
+            co_sleep_ns(vm_clock, 1);
+        }
+        ret = bdrv_pwrite(bs->file,
+                          disk_offset,
+                          blank,
+                          SSD_ERASE_BLOCK_SIZE);
+
+        if (ret < 0) {
+            break;
+        }
+    }
+
+    qemu_vfree(blank);
+    return ret < 0 ? ret : 0;
+}
+
+/* This function is used to recycle a QCowJournal
+ *
+ * The in memory state and buffers are reset while the on disk space stays
+ * allocated.
+ * The journal is then marked as started and its whole on disk area erased.
+ *
+ * @bs:      the BlockDriverState, the journal is written to bs->file
+ * @journal: the journal to recycle
+ * @ret:     0 on success, -errno on error
+ */
+int qcow2_journal_recycle(BlockDriverState *bs, QCowJournal *journal)
+{
+    /* same on disk location, fresh in memory state */
+    qcow2_journal_reset(journal, journal->sector, journal->size);
+    /* the reset left the journal stopped: mark it as started */
+    journal->started = true;
+    /* writes 0xFF so QCOW_LOG_NONE terminates the journal */
+    return qcow2_journal_set_to_none(bs, journal, true);
+}
+
+/* This function is used to allocate a journal's buffers
+ *
+ * Sector and size are set to zero so the journal is treated as unallocated
+ *
+ * @journal: the journal to initialize
+ */
+void qcow2_journal_init(BlockDriverState *bs,
+                        QCowJournal *journal)
+{
+    /* two clusters each: an end entry sometimes spills on the second
+     * cluster of the write buffer, and entry reads must not overflow the cache
+     */
+    journal->write_buf = qemu_blockalign(bs, JOURNAL_CLUSTER_SIZE * 2);
+    journal->read_cache = qemu_blockalign(bs, JOURNAL_CLUSTER_SIZE * 2);
+
+    journal->sector = 0;
+    journal->size = 0;
+}
+
+/* This function frees the buffers of a journal and clears its pointers
+ * @journal: the journal to cleanup
+ */
+void qcow2_journal_cleanup(QCowJournal *journal)
+{
+    qemu_vfree(journal->write_buf);
+    qemu_vfree(journal->read_cache);
+    journal->write_buf = journal->read_cache = NULL; /* no stale pointers */
+}
+
+/* This function tells if the on disk space of a journal has been allocated
+ *
+ * @journal: the journal to test
+ * @ret:     true if allocated (non zero start sector) else false
+ */
+bool qcow2_journal_is_allocated(QCowJournal *journal)
+{
+    return journal->sector != 0;
+}
+
+/* This function is used to allocate the on disk space of a journal
+ *
+ * @journal: the journal to allocate; left unallocated when erasing fails
+ * @size:    the on disk size of the journal
+ * @ret:     0 on success, -errno on error
+ */
+int qcow2_journal_disk_allocate(BlockDriverState *bs,
+                                QCowJournal *journal,
+                                uint64_t size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t offset;
+    int ret = 0;
+
+    /* allocate journal disk space */
+    offset = qcow2_alloc_clusters(bs, size);
+    if (offset < 0) {
+        return offset;
+    }
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        goto deallocate_exit;
+    }
+
+    qcow2_journal_reset(journal, offset / BDRV_SECTOR_SIZE, size);
+
+    ret = qcow2_journal_set_to_none(bs, journal, false);
+    if (ret < 0) {
+        /* the journal must not keep pointing at the clusters freed below */
+        journal->sector = 0;
+        journal->size = 0;
+        goto deallocate_exit;
+    }
+
+    return 0;
+
+deallocate_exit:
+    qcow2_free_clusters(bs, offset, size);
+    return ret;
+}
+
+
+/* This function frees the clusters holding a journal on disk
+ *
+ * @journal: the journal to deallocate
+ */
+void qcow2_journal_disk_deallocate(BlockDriverState *bs,
+                                   QCowJournal *journal)
+{
+    /* the journal location is kept as a sector number: convert to bytes */
+    const uint64_t byte_offset = journal->sector * BDRV_SECTOR_SIZE;
+    qcow2_free_clusters(bs, byte_offset, journal->size);
+}
+
+/* Add a QCOW_LOG_END entry pointing to the next cluster of the journal
+ * Note: the caller must ensure it lands in the last 256 bytes of the cluster
+ */
+static void qcow2_journal_add_end_entry(QCowJournal *journal)
+{
+    QCowJournalEntry entry;
+    void *dst = journal->write_buf + journal->offset_in_buf;
+    size_t len = JOURNAL_CLUSTER_SIZE - journal->offset_in_buf;
+
+    memset(&entry, QCOW_LOG_NONE, sizeof(entry));
+    entry.size = len;
+    entry.type = QCOW_LOG_END;
+    /* pad up to the cluster end without reading past the local entry */
+    memset(dst, QCOW_LOG_NONE, len);
+    memcpy(dst, &entry, len < sizeof(entry) ? len : sizeof(entry));
+}
+
+/* Start a new cluster: advance the cluster index and empty the write buffer
+ *
+ * @journal: the journal to operate on
+ */
+static void qcow2_journal_reset_buffer_and_inc(QCowJournal *journal)
+{
+    journal->index += 1;
+    journal->offset_in_buf = 0;
+    memset(journal->write_buf, QCOW_LOG_NONE, JOURNAL_CLUSTER_SIZE);
+}
+
+/* This function writes the journal write buffer to disk
+ *
+ * note: caller code must be sure that there is at max only 256 bytes of free
+ *       space in the buffer when deciding to end writing in this cluster
+ * note: this function will never return -1 (full) when end_cluster == false
+ *       This is used to flush the current cluster.
+ * NOTE(review): -1 has the same value as -EPERM on common platforms
+ *
+ * @bs:          the BlockDriverState, the journal is written to bs->file
+ * @journal:     the journal to flush the buffer to disk
+ * @end_cluster: true if we must prepare to write in a new cluster
+ * @ret:         0 on success, -errno on error, -1 if journal is full
+ */
+static int qcow2_journal_write_buffer(BlockDriverState *bs,
+                                      QCowJournal *journal,
+                                      bool end_cluster)
+{
+    uint64_t disk_offset, used_bytes;
+    int ret;
+
+    if (end_cluster) {
+        /* Terminate the buffer with a QCOW_LOG_END entry whose size lets a
+         * walk in the journal jump to the next cluster.
+         */
+        qcow2_journal_add_end_entry(journal);
+
+        /* starting a new cluster is impossible once the journal is full */
+        used_bytes = (journal->index + 1) * JOURNAL_CLUSTER_SIZE;
+        if (used_bytes >= journal->size) {
+            return -1;
+        }
+    }
+
+    /* the whole cluster buffer goes to its on disk location */
+    disk_offset = journal->sector * BDRV_SECTOR_SIZE +
+                  journal->index * JOURNAL_CLUSTER_SIZE;
+    ret = bdrv_pwrite(bs->file,
+                      disk_offset,
+                      journal->write_buf,
+                      JOURNAL_CLUSTER_SIZE);
+
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* prepare the journal state to buffer a new series of entries */
+    if (end_cluster) {
+        qcow2_journal_reset_buffer_and_inc(journal);
+    }
+
+    journal->flushed = true;
+
+    return 0;
+}
+
+/* This function is used to append an entry to a journal
+ *
+ * Note: Appending an entry to the journal is an asynchronous process.
+ *       The journal must be flushed at some point to ensure that data have
+ *       reached stable storage.
+ *
+ * @journal:    the journal to append the entry to
+ * @entry:      the journal entry to write
+ * @ret:        offset of the entry in the journal, -errno on error, -1 if full
+ */
+int64_t qcow2_journal_append(BlockDriverState *bs, QCowJournal *journal,
+                             QCowJournalEntry *entry)
+{
+    uint64_t where;
+    int ret;
+
+    assert(entry->size <= 254);
+
+    /* A regular entry takes at most 254 bytes and the QCOW_LOG_END entry
+     * takes 2 bytes and can skip 256 bytes: when this entry plus an end entry
+     * would reach the end of the cluster, the buffer is written to disk and
+     * a new cluster is started.
+     */
+    if ((journal->offset_in_buf + entry->size + QCOW_LOG_END_SIZE) >=
+        JOURNAL_CLUSTER_SIZE) {
+        ret = qcow2_journal_write_buffer(bs, journal, true);
+        /* error or journal full */
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    /* the entry lands at the current position of the current cluster */
+    where = journal->index * JOURNAL_CLUSTER_SIZE +
+            journal->offset_in_buf;
+
+    /* write entry to the journal buffer */
+    memcpy(journal->write_buf + journal->offset_in_buf, entry, entry->size);
+    journal->offset_in_buf += entry->size;
+    journal->flushed = false;
+
+    return where;
+}
+
+/* This function is used to read a journal entry from disk given its offset
+ *
+ * @journal: the journal to read the entry from
+ * @offset:  the offset of the entry in the journal
+ * @entry:   the journal entry to read the data into
+ * @ret:     size of the entry on success, -errno on error
+ */
+static int qcow2_journal_read_from_disk(BlockDriverState *bs,
+                                        QCowJournal *journal,
+                                        uint64_t offset,
+                                        QCowJournalEntry *entry)
+{
+    int ret = 0;
+    uint64_t index;
+
+    index = offset / JOURNAL_CLUSTER_SIZE;
+
+    /* If this cluster is not cached read it.
+     * Cache will help if we iterate, otherwise for dedup access it will be
+     * totally random.
+     */
+    if (index != journal->read_index) {
+        uint64_t read_offset = journal->sector * BDRV_SECTOR_SIZE +
+                               index * JOURNAL_CLUSTER_SIZE;
+        /* A failed read may leave the cache partially overwritten: forget
+         * the cached cluster first so stale data is never served for it.
+         */
+        journal->read_index = -1;
+        ret = bdrv_pread(bs->file,
+                         read_offset,
+                         journal->read_cache,
+                         JOURNAL_CLUSTER_SIZE);
+        if (ret < 0) {
+            return ret;
+        }
+        journal->read_index = index;
+    }
+
+    /* The maximum size of an entry is 254 bytes and the journal
+     * read_cache has an extra cluster to avoid read overflow
+     */
+    memcpy(entry,
+           journal->read_cache + offset % JOURNAL_CLUSTER_SIZE,
+           sizeof(QCowJournalEntry));
+
+    return entry->size;
+}
+
+/* This function tells if an offset lies in the cluster being buffered */
+static bool qcow2_journal_is_offset_in_buf(QCowJournal *journal,
+                                           uint64_t offset)
+{
+    return offset / JOURNAL_CLUSTER_SIZE == journal->index;
+}
+
+/* This function copies a journal entry out of the write buffer */
+static int qcow2_journal_read_from_buf(QCowJournal *journal,
+                                       uint64_t offset,
+                                       QCowJournalEntry *entry)
+{
+    uint64_t in_buf = offset % JOURNAL_CLUSTER_SIZE;
+
+    memcpy(entry, journal->write_buf + in_buf, sizeof(*entry));
+    return entry->size;
+}
+
+/* This function is used to read a journal entry given its offset
+ *
+ * While the journal is started, entries of the cluster being filled are
+ * served from the write buffer; any other entry is read from disk.
+ * As the size of the entry just read is returned it can be used as an
+ * iterator to walk in the journal.
+ * @journal: the journal to read the entry from
+ * @offset:  the offset of the entry in the journal
+ * @entry:   the journal entry to read the data into
+ * @ret:     size of the entry on success, -errno on error (-EFBIG past end)
+ */
+int qcow2_journal_read(BlockDriverState *bs, QCowJournal *journal,
+                       uint64_t offset, QCowJournalEntry *entry)
+{
+    if (offset >= journal->size) {
+        return -EFBIG;
+    }
+
+    if (!journal->started ||
+        !qcow2_journal_is_offset_in_buf(journal, offset)) {
+        return qcow2_journal_read_from_disk(bs, journal, offset, entry);
+    }
+    return qcow2_journal_read_from_buf(journal, offset, entry);
+}
+
+/* This function is used to flush the current journal page to disk
+ *
+ * Note: it does not issue a bdrv_flush, it's up to the caller to do so.
+ * Note: qcow2_journal_write_buffer never returns -1 (full) when it receives
+ *       end_cluster == false
+ * @journal: the journal to flush
+ * @ret:     0 on success, -errno on error
+ */
+int qcow2_journal_flush(BlockDriverState *bs, QCowJournal *journal)
+{
+    int ret = 0;
+
+    /* only a dirty buffer is written, and its cluster is not ended */
+    if (!journal->flushed) {
+        ret = qcow2_journal_write_buffer(bs, journal, false);
+    }
+    return ret;
+}
+
+/* This function flushes and stops a journal
+ *
+ * journal->started is left untouched when the flush fails.
+ *
+ * @journal: the journal to stop
+ * @ret:     0 on success, -errno on error
+ */
+int qcow2_journal_stop(BlockDriverState *bs, QCowJournal *journal)
+{
+    const int ret = qcow2_journal_flush(bs, journal);
+
+    if (ret >= 0) {
+        journal->started = false;
+        return 0;
+    }
+
+    /* the flush error is handed back to the caller */
+    return ret;
+}
+
+/* This function converts a hash info to a journal entry
+ * @entry:     the already allocated destination journal entry
+ * @hash_info: the QCowHashInfo to take data from
+ */
+void qcow2_journal_entry_from_hash_info(QCowJournalEntry *entry,
+                                        QCowHashInfo *hash_info)
+{
+    entry->type = QCOW_LOG_HASH;
+    entry->size = 2 + sizeof(QCowHashInfo);
+    /* plain copy first, then store the two sector fields as big endian */
+    memcpy(&entry->u.hash_info, hash_info, sizeof(*hash_info));
+    entry->u.hash_info.physical_sect = cpu_to_be64(hash_info->physical_sect);
+    entry->u.hash_info.first_logical_sect =
+        cpu_to_be64(hash_info->first_logical_sect);
+}
+
+/* This function converts a journal entry to a hash info
+ *
+ * @hash_info: the already allocated QCowHashInfo to fill
+ * @entry:     the QCowJournalEntry to take data from
+ * @ret:       0 on success, -EINVAL on invalid entry
+ */
+int qcow2_hash_info_from_journal_entry(QCowHashInfo *hash_info,
+                                       QCowJournalEntry *entry)
+{
+    const bool is_hash = entry->type == QCOW_LOG_HASH;
+
+    /* only hash entries having exactly the expected size are accepted */
+    if (!is_hash || entry->size != (2 + sizeof(QCowHashInfo))) {
+        return -EINVAL;
+    }
+
+    /* plain copy, then bring the two sector fields back to cpu endianness */
+    memcpy(hash_info, &entry->u.hash_info, sizeof(QCowHashInfo));
+    hash_info->physical_sect = be64_to_cpu(hash_info->physical_sect);
+    hash_info->first_logical_sect = be64_to_cpu(hash_info->first_logical_sect);
+
+    return 0;
+}
+
+/* This function returns the size of a journal dump: two 64 bit integers */
+size_t qcow2_journal_dump_size(void)
+{
+    return 2 * sizeof(uint64_t);
+}
+
+/* This function dumps the required journal information in a buffer
+ *
+ * @buf:     the buffer to do the dump into, no alignment required
+ * @journal: the journal to dump
+ * @ret:     the size of the dump
+ */
+size_t qcow2_journal_dump(uint8_t *buf, QCowJournal *journal)
+{
+    /* big endian sector then size; memcpy keeps unaligned buffers safe */
+    const uint64_t fields[2] = { cpu_to_be64(journal->sector),
+                                 cpu_to_be64(journal->size) };
+
+    memcpy(buf, fields, sizeof(fields));
+    return qcow2_journal_dump_size();
+}
+
+/* This function parses a journal dump
+ * @journal: the journal to parse the dump into
+ * @buf:     the buffer to read the journal info from, may be unaligned
+ */
+void qcow2_journal_parse(QCowJournal *journal, uint8_t *buf)
+{
+    uint64_t fields[2];
+
+    /* memcpy instead of a pointer cast: buf carries no alignment guarantee */
+    memcpy(fields, buf, sizeof(fields));
+    qcow2_journal_reset(journal,
+                        be64_to_cpu(fields[0]),  /* sector */
+                        be64_to_cpu(fields[1])); /* size */
+}
diff --git a/block/qcow2.h b/block/qcow2.h
index 953edfe..adde631 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -606,4 +606,33 @@  int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
     void **table);
 int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
 
+/* qcow2-journal.c functions */
+int qcow2_journal_resume(BlockDriverState *bs,
+                         QCowJournal *journal,
+                         uint64_t offset);
+uint64_t qcow2_journal_round_to_erase_blocks(uint64_t size);
+int qcow2_journal_recycle(BlockDriverState *bs, QCowJournal *journal);
+void qcow2_journal_init(BlockDriverState *bs,
+                        QCowJournal *journal);
+void qcow2_journal_cleanup(QCowJournal *journal);
+bool qcow2_journal_is_allocated(QCowJournal *journal);
+int qcow2_journal_disk_allocate(BlockDriverState *bs,
+                                QCowJournal *journal,
+                                uint64_t size);
+void qcow2_journal_disk_deallocate(BlockDriverState *bs,
+                                   QCowJournal *journal);
+int64_t qcow2_journal_append(BlockDriverState *bs, QCowJournal *journal,
+                             QCowJournalEntry *entry);
+int qcow2_journal_read(BlockDriverState *bs, QCowJournal *journal,
+                       uint64_t offset, QCowJournalEntry *entry);
+int qcow2_journal_flush(BlockDriverState *bs, QCowJournal *journal);
+int qcow2_journal_stop(BlockDriverState *bs, QCowJournal *journal);
+void qcow2_journal_entry_from_hash_info(QCowJournalEntry *entry,
+                                        QCowHashInfo *hash_info);
+int qcow2_hash_info_from_journal_entry(QCowHashInfo *hash_info,
+                                       QCowJournalEntry *entry);
+size_t qcow2_journal_dump_size(void);
+size_t qcow2_journal_dump(uint8_t *buf, QCowJournal *journal);
+void qcow2_journal_parse(QCowJournal *journal, uint8_t *buf);
+
 #endif