diff mbox

[07/26] FVD: extend FVD header fvd.h to be more complete

Message ID 1298673486-3573-7-git-send-email-ctang@us.ibm.com
State New
Headers show

Commit Message

Chunqiang Tang Feb. 25, 2011, 10:37 p.m. UTC
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.

This patch makes FVD's header file fvd.h more complete by adding type
definitions for BDRVFvdState, FvdAIOCB, etc.

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/fvd.h |  337 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 337 insertions(+), 0 deletions(-)
diff mbox

Patch

diff --git a/block/fvd.h b/block/fvd.h
index f2da330..b83b7aa 100644
--- a/block/fvd.h
+++ b/block/fvd.h
@@ -168,4 +168,341 @@  typedef struct __attribute__ ((__packed__)) FvdHeader {
 } FvdHeader;
 
 typedef struct BDRVFvdState {
+    BlockDriverState *fvd_metadata;
+    BlockDriverState *fvd_data;
+    uint64_t virtual_disk_size;  /* in bytes. */
+    uint64_t bitmap_offset;      /* in sectors. */
+    uint64_t bitmap_size;        /* in bytes. */
+    uint64_t data_offset;        /* in sectors. Begin of real data. */
+    uint64_t base_img_sectors;
+    uint64_t block_size;         /* in sectors. */
+    bool copy_on_read;
+    uint64_t max_outstanding_copy_on_read_data;    /* in bytes. */
+    uint64_t outstanding_copy_on_read_data;        /* in bytes. */
+    bool data_region_prepared;
+    QLIST_HEAD(WriteLocks, FvdAIOCB) write_locks; /* All writes. */
+    QLIST_HEAD(CopyLocks, FvdAIOCB) copy_locks; /* copy-on-read and CoW. */
+
+    /* Keep two copies of bitmap to reduce the overhead of updating the
+     * on-disk bitmap, i.e., copy-on-read and prefetching do not update the
+     * on-disk bitmap. See Section 3.3.4 of the FVD-cow paper. */
+    uint8_t *fresh_bitmap;
+    uint8_t *stale_bitmap;
+
+    /******** Begin: for compact image. *************************************/
+    uint32_t *table;    /* Mapping table stored in memory in little endian. */
+    uint64_t table_size;        /* in bytes. */
+    uint64_t used_storage;        /* in sectors. */
+    uint64_t avail_storage;        /* in sectors. */
+    uint64_t chunk_size;          /* in sectors. */
+    uint64_t storage_grow_unit;   /* in sectors. */
+    uint64_t table_offset;        /* in sectors. */
+    char *add_storage_cmd;
+    uint32_t *leaked_chunks;
+    uint32_t num_leaked_chunks;
+    uint32_t next_avail_leaked_chunk;
+    uint32_t chunks_relocated;    /* Affect bdrv_has_zero_init(). */
+    /******** End: for compact image. ***************************************/
+
+    /******** Begin: for journal. *******************************************/
+    uint64_t journal_offset;       /* in sectors. */
+    uint64_t journal_size;         /* in sectors. */
+    uint64_t journal_epoch;
+    uint64_t next_journal_sector;  /* in sectors. */
+    bool dirty_image;
+    bool metadata_err_prohibit_write;
+
+    /* There are two different ways of writing metadata changes to the
+     * journal. If cache=writethrough, metadata changes are written to the
+     * journal immediately. If (cache!=writethrough||IN_QEMU_TOOL), metadata
+     * changes are buffered in memory (bjnl.buf below), and later
+     * written to the journal either triggered by bdrv_aio_flush() or by a
+     * timeout (bjnl.clean_buf_timer below). */
+    bool use_bjnl;      /* 'bjnl' stands for buffered journal update. */
+    union { /* Anonymous; presumably use_bjnl tells which member is live. */
+        /* 'ujnl' stands for unbuffered journal update. */
+        struct {
+            int active_writes;
+            /* Journal writes waiting for journal recycle to finish.
+             * See JournalCB.ujnl_next_wait4_recycle. */
+            QLIST_HEAD(JournalRecycle, FvdAIOCB) wait4_recycle;
+        } ujnl;
+
+        /* 'bjnl' stands for buffered journal update. */
+        struct {
+            uint8_t *buf;
+            size_t buf_size;
+            size_t def_buf_size;
+            size_t buf_used;
+            bool buf_contains_bitmap_update;
+            QEMUTimer *clean_buf_timer;
+            bool timer_scheduled;
+            uint64_t clean_buf_period;
+            /* See JournalCB.bjnl_next_queued_buf. */
+            QTAILQ_HEAD(CleanBuf, FvdAIOCB) queued_bufs;
+        } bjnl;
+    };
+    /******** End: for journal. ********************************************/
+
+    /******** Begin: for prefetching. ***********************************/
+    struct FvdAIOCB **prefetch_acb;
+    int prefetch_state;    /* PREFETCH_STATE_RUNNING, FINISHED, or DISABLED. */
+    int num_prefetch_slots;
+    int num_filled_prefetch_slots;
+    int next_prefetch_read_slot;
+    bool prefetch_read_active;
+    bool pause_prefetch_requested;
+    int64_t prefetch_start_delay;      /* in seconds. */
+    uint64_t unclaimed_prefetch_region_start;
+    uint64_t prefetch_read_time;                     /* in milliseconds. */
+    uint64_t prefetch_write_time;                    /* in milliseconds. */
+    uint64_t prefetch_data_read;                     /* in bytes. */
+    uint64_t prefetch_data_written;                  /* in bytes. */
+    double prefetch_read_throughput;                 /* in bytes/millisecond. */
+    double prefetch_write_throughput;                /* in bytes/millisecond. */
+    double prefetch_min_read_throughput;             /* in bytes/millisecond. */
+    double prefetch_min_write_throughput;            /* in bytes/millisecond. */
+    uint64_t prefetch_read_throughput_measure_time;  /* in milliseconds. */
+    uint64_t prefetch_write_throughput_measure_time; /* in milliseconds. */
+    uint64_t prefetch_throttle_time;                 /* in milliseconds. */
+    uint64_t sectors_per_prefetch;
+    QEMUTimer *prefetch_timer;
+    /******** End: for prefetching. ***********************************/
+
+#ifdef FVD_DEBUG
+    int64_t total_copy_on_read_data;  /* in bytes. */
+    int64_t total_prefetch_data;      /* in bytes. */
+#endif
 } BDRVFvdState;
+
+/* Beginning of data type definitions. */
+struct FvdAIOCB; /* Forward declaration; the full type is defined below. */
+
+typedef struct JournalCB { /* Journal-write state; embedded as FvdAIOCB.jcb. */
+    BlockDriverAIOCB *hd_acb;
+    QEMUIOVector qiov;
+    struct iovec iov;
+    bool bitmap_updated;
+    union { /* Anonymous: one list link per journal mode. */
+        QLIST_ENTRY(FvdAIOCB) ujnl_next_wait4_recycle; /* In BDRVFvdState.ujnl.wait4_recycle. */
+        QTAILQ_ENTRY(FvdAIOCB) bjnl_next_queued_buf; /* In BDRVFvdState.bjnl.queued_bufs. */
+    };
+} JournalCB;
+
+/* CopyLock is used by AIOWriteCB and AIOCopyCB; embedded as FvdAIOCB.copy_lock. */
+typedef struct CopyLock {
+    QLIST_ENTRY(FvdAIOCB) next; /* Presumably links into BDRVFvdState.copy_locks - confirm. */
+    int64_t begin; /* Start of the locked range; units/inclusivity not shown here - confirm. */
+    int64_t end;
+     QLIST_HEAD(DependentWritesHead, FvdAIOCB) dependent_writes; /* Presumably linked via AIOWriteCB.next_dependent_write - confirm. */
+} CopyLock;
+
+typedef struct ChildAIOReadCB { /* AIOReadCB embeds two: read_backing and read_fvd. */
+    BlockDriverAIOCB *hd_acb;
+    struct iovec iov;
+    QEMUIOVector qiov;
+    int64_t sector_num;
+    int nb_sectors;
+    int done; /* Presumably a completion flag - confirm against the users. */
+} ChildAIOReadCB;
+
+typedef struct AIOReadCB { /* Read request state; stored as FvdAIOCB.read. */
+    QEMUIOVector *qiov;
+    int ret;
+    ChildAIOReadCB read_backing; /* One child read per source. */
+    ChildAIOReadCB read_fvd;
+} AIOReadCB;
+
+/* For copy-on-read and prefetching; stored as FvdAIOCB.copy. */
+typedef struct AIOCopyCB {
+    BlockDriverAIOCB *hd_acb;
+    struct iovec iov;
+    QEMUIOVector qiov;
+    uint8_t *buf;
+    int64_t buffered_sector_begin; /* Sector range held in buf, presumably - confirm. */
+    int64_t buffered_sector_end;
+    int64_t last_prefetch_op_start_time; /* For prefetch only. */
+} AIOCopyCB;
+
+typedef struct AIOWriteCB { /* Write request state; stored as FvdAIOCB.write. */
+    BlockDriverAIOCB *hd_acb;
+    QEMUIOVector *qiov;
+    uint8_t *cow_buf;
+    QEMUIOVector *cow_qiov;
+    int64_t cow_start_sector;
+    int ret;
+    union { /* Anonymous: two names for the same flag storage. */
+        bool update_table;
+        bool update_bitmap;
+    };
+
+    /* See BDRVFvdState.write_locks */
+    QLIST_ENTRY(FvdAIOCB) next_write_lock;
+
+    /* See FvdAIOCB.copy_lock.dependent_writes (CopyLock.dependent_writes). */
+    QLIST_ENTRY(FvdAIOCB) next_dependent_write;
+} AIOWriteCB;
+
+/* Per-child state for AIOStoreCompactCB and AIOLoadCompactCB. */
+typedef struct CompactChildCB {
+    struct FvdAIOCB *acb;
+    BlockDriverAIOCB *hd_acb;
+} CompactChildCB;
+
+/* For storing data to a compact image; stored as FvdAIOCB.store. */
+typedef struct AIOStoreCompactCB {
+    CompactChildCB one_child; /* Presumably avoids an allocation for the single-child case - confirm. */
+    CompactChildCB *children;
+    int update_table;
+    int num_children;
+    int finished_children;
+    struct FvdAIOCB *parent_acb;
+    int ret;
+    int soft_write; /* True if the store is caused by copy-on-read or prefetch. */
+    QEMUIOVector *orig_qiov;
+} AIOStoreCompactCB;
+
+/* For loading data from a compact image; stored as FvdAIOCB.load. */
+typedef struct AIOLoadCompactCB {
+    CompactChildCB *children;
+    CompactChildCB one_child; /* Presumably avoids an allocation for the single-child case - confirm. */
+    int num_children;
+    int finished_children;
+    struct FvdAIOCB *parent_acb;
+    int ret;
+    QEMUIOVector *orig_qiov;
+} AIOLoadCompactCB;
+
+typedef struct AIOFlushCB { /* Flush request state; stored as FvdAIOCB.flush. */
+    BlockDriverAIOCB *data_acb;
+    BlockDriverAIOCB *metadata_acb;
+    int num_finished; /* Presumably counts completed sub-flushes (data, metadata) - confirm. */
+    int ret;
+} AIOFlushCB;
+
+typedef struct AIOCleanJournalBufCB { /* NOTE(review): not a member of the FvdAIOCB union in this header - confirm how it is used. */
+    uint8_t *buf;
+} AIOCleanJournalBufCB;
+
+typedef struct AIOWrapperCB { /* Stored as FvdAIOCB.wrapper. */
+    QEMUBH *bh;
+} AIOWrapperCB;
+
+typedef enum { OP_READ = 1, OP_WRITE, OP_COPY, OP_STORE_COMPACT,
+    OP_LOAD_COMPACT, OP_WRAPPER, OP_FLUSH, OP_BJNL_BUF_WRITE, OP_BJNL_FLUSH
+} op_type; /* Request kind kept in FvdAIOCB.type; values start at 1, so 0 is not a valid kind. */
+
+#ifdef FVD_DEBUG
+/* For debugging memory leaks; only compiled when FVD_DEBUG is defined. */
+typedef struct alloc_tracer_t {
+    int64_t magic;
+    int alloc_tracer;
+    const char *alloc_file;
+    int alloc_line;
+    size_t size;
+} alloc_tracer_t;
+#endif
+
+typedef struct FvdAIOCB { /* One control block type shared by every FVD request kind. */
+    BlockDriverAIOCB common;
+    op_type type;        /* Presumably selects the live union member below - confirm. */
+    int64_t sector_num;
+    int nb_sectors;
+    JournalCB jcb;       /* For AIOWriteCB and AIOStoreCompactCB. */
+    CopyLock copy_lock;  /* For AIOWriteCB and AIOCopyCB. */
+    bool cancel_in_progress;
+
+    /* Use a union so that all requests can efficiently share one big AIOPool.*/
+    union { /* Anonymous: members are accessed directly, e.g. acb->read. */
+        AIOWrapperCB wrapper;
+        AIOReadCB read;
+        AIOWriteCB write;
+        AIOCopyCB copy;
+        AIOLoadCompactCB load;
+        AIOStoreCompactCB store;
+        AIOFlushCB flush;
+    };
+
+#ifdef FVD_DEBUG
+    int64_t magic;
+    alloc_tracer_t tracer; /* For debugging memory leak. */
+    /* Uniquely identifies a request across all processing activities. */
+    unsigned long long int uuid;
+#endif
+} FvdAIOCB;
+
+static BlockDriver bdrv_fvd; /* Tentative definition; internal linkage. */
+static QEMUOptionParameter fvd_create_options[]; /* NOTE(review): static array of incomplete type is not strict ISO C (C99 6.9.2p3); relies on a compiler extension. */
+static QEMUOptionParameter fvd_update_options[]; /* Same caveat as fvd_create_options. */
+
+/* Function prototypes. All are static, so each file including this header needs its own definitions. */
+static int fvd_create(const char *filename, QEMUOptionParameter * options);
+static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename);
+static int fvd_open(BlockDriverState * bs, const char *filename, int flags);
+static void fvd_close(BlockDriverState * bs);
+static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
+                            int nb_sectors, int *pnum);
+static int fvd_flush(BlockDriverState * bs);
+static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
+            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+            BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
+            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+            BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
+            BlockDriverCompletionFunc * cb, void *opaque);
+static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi);
+static int fvd_update (BlockDriverState * bs, QEMUOptionParameter * options);
+static int fvd_has_zero_init(BlockDriverState * bs);
+
+/* Default configurations. NOTE(review): sizes here are in bytes and throughputs in KB/s, while BDRVFvdState keeps sectors and bytes/millisecond - conversion presumably happens at use; confirm. */
+#define BYTES_PER_PREFETCH                      1048576     /* bytes */
+#define PREFETCH_THROTTLING_TIME                30000       /* milliseconds */
+#define NUM_PREFETCH_SLOTS                      2
+#define PREFETCH_MIN_MEASURE_READ_TIME          100         /* milliseconds */
+#define PREFETCH_MIN_MEASURE_WRITE_TIME         100         /* milliseconds */
+#define PREFETCH_MIN_READ_THROUGHPUT            5120        /* KB/s */
+#define PREFETCH_MIN_WRITE_THROUGHPUT           5120        /* KB/s */
+#define PREFETCH_MAX_READ_THROUGHPUT            1000000000L /* KB/s */
+#define PREFETCH_MAX_WRITE_THROUGHPUT           1000000000L /* KB/s */
+#define PREFETCH_PERF_CALC_ALPHA                0.8
+#define MAX_OUTSTANDING_COPY_ON_READ_DATA       2000000     /* bytes */
+#define MODERATE_BITMAP_SIZE                    4194304L    /* bytes */
+#define CHUNK_SIZE                              1048576LL   /* bytes */
+#define JOURNAL_SIZE                            16777216LL  /* bytes */
+#define STORAGE_GROW_UNIT                       104857600LL /* bytes */
+#define JOURNAL_BUF_SIZE                        (64*1024)  /* bytes */
+#define JOURNAL_CLEAN_BUF_PERIOD                5000        /* milliseconds */
+
+/* Values of BDRVFvdState.prefetch_state. */
+#define PREFETCH_STATE_RUNNING  1
+#define PREFETCH_STATE_FINISHED 2
+#define PREFETCH_STATE_DISABLED 3
+
+/* For convenience. Note: several of these macros evaluate their arguments more than once. */
+#define IN_QEMU_TOOL            (rt_clock == NULL) /* a trick */
+#define ROUND_UP(x, base)       ((((x)+(base)-1) / (base)) * (base)) /* base is evaluated three times. */
+#define ROUND_DOWN(x, base)     ((((x) / (base)) * (base)))
+#define BOOL(x)                 ((x) ? "true" : "false")
+#define EMPTY_TABLE             ((uint32_t)0xFFFFFFFF) /* Same bytes in either byte order. */
+#define DIRTY_TABLE             ((uint32_t)0x80000000) /* Flag bit within a table entry. */
+#define READ_TABLE(entry)       (le32_to_cpu(entry) & ~DIRTY_TABLE) /* Entry value with the dirty bit masked off. */
+# define FVDAIOCB_MAGIC         ((uint64_t)0x3A8FCE89325B976DULL)
+# define FVD_ALLOC_MAGIC        ((uint64_t)0x4A7dCEF9925B976DULL)
+#define IS_EMPTY(entry)         ((entry) == EMPTY_TABLE) /* Compares the raw (unswapped) entry. */
+#define IS_DIRTY(entry)         (le32_to_cpu(entry) & DIRTY_TABLE)
+#define WRITE_TABLE(entry,id)   ((entry) = cpu_to_le32(id))
+#define READ_TABLE2(entry) \
+    ((entry)==EMPTY_TABLE ? EMPTY_TABLE : (le32_to_cpu(entry) & ~DIRTY_TABLE)) /* Like READ_TABLE, but an empty entry stays EMPTY_TABLE. */
+
+#define CLEAN_DIRTY(entry) /* Clear the dirty bit of a non-empty entry; entry is evaluated more than once. */ \
+    do {  \
+        if (!IS_EMPTY(entry)) /* EMPTY_TABLE has the dirty bit set; masking it would yield 0x7FFFFFFF. */  \
+            entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);  \
+    } while (0)
+
+#define CLEAN_DIRTY2(entry) /* Like CLEAN_DIRTY, but asserts the entry is not empty instead of skipping it. */ \
+    do { \
+        ASSERT(!IS_EMPTY(entry)); /* ASSERT is defined outside this chunk. */ \
+        entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);  \
+    } while (0)