@@ -1,5 +1,6 @@
block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-obj-y += irow.o
block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-obj-y += qed-check.o
block-obj-y += parallels.o blkdebug.o blkverify.o
new file mode 100644
@@ -0,0 +1,2257 @@
+/* IROW (Improved ROW) Disk Format
+ */
+
+/*
+ * iROW (improved Redirect-on-Write) is a disk format supporting high-efficiency VM disk snapshots.
+ * iROW uses a bitmap to reduce the amount of metadata, so that both the performance of the key
+ * VM disk snapshot operations and the VM disk I/O performance are improved at the same time.
+ *
+ *The iROW VM disk image consists of a meta file and several snapshots.
+ *
+ *A snapshot consists of 2 files: a bitmap file (btmp file) and a VM disk data file (irvd file).
+ *The current state of the iROW VM disk also occupies a snapshot.
+ *
+ *The meta file consists of the meta header and the snapshot information. The meta header is used to
+ *store basic information about the VM disk image. The snapshot information sequentially stores every
+ *snapshot's name, id and other related information.
+ *
+ *The btmp file consists of a bitmap and the VM state data. The bitmap is used to indicate whether the
+ *clusters exist in corresponding irvd file. Each cluster in the VM disk image is mapped to a bit in the bitmap.
+ *
+ *The irvd file is used to store the actual data of the VM disk image. The smallest unit of storage is a cluster.
+ *iROW does not decide the address of the data clusters. It just writes each cluster to the offset in the
+ *irvd file that equals the cluster's virtual address. Because the host machine's file system supports sparse
+ *files, the iROW VM disk image still grows gradually with the actual disk usage.
+ *
+ */
+
+#include "qemu-common.h"
+#include "include/block/block_int.h"
+#include "include/qemu/module.h"
+#include "block/irow.h"
+
+#include <linux/falloc.h>
+
+BDRVIrowState **birows_cache = NULL; /* lazily opened states of ancestor snapshots, indexed by snapshot index; sized by irow_init_birows_cache() */
+ClusterCache *cluster_cache = NULL; /* one cached cluster of data; cluster_num == -1 marks the cache as empty */
+
+/*
+ * Return log2(size) when size is a non-zero power of two, -1 otherwise.
+ */
+static int get_bits_from_size(size_t size)
+{
+    int bits;
+
+    if (size == 0) {
+        return -1;
+    }
+    /* Strip trailing zero bits; any set bit seen before the value reaches 1
+     * means size was not a power of two. */
+    for (bits = 0; size != 1; size >>= 1, bits++) {
+        if (size & 1) {
+            return -1;
+        }
+    }
+    return bits;
+}
+
+/*
+ * Format probe. Returns 100 when buf holds at least sizeof(IRowMeta) bytes
+ * and starts with the IROW magic and version (both stored big-endian),
+ * 0 otherwise. filename is not consulted.
+ */
+static int irow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const IRowMeta *hdr = (const void *)buf;
+
+    if (buf_size < sizeof(IRowMeta)) {
+        return 0;
+    }
+    if (be32_to_cpu(hdr->magic) != IROW_MAGIC) {
+        return 0;
+    }
+    if (be32_to_cpu(hdr->version) != IROW_VERSION) {
+        return 0;
+    }
+    return 100;
+}
+
+/*
+ * Release the in-memory bitmap and the btmp block device of s.
+ * Both pointers are reset to NULL, so calling this twice is harmless.
+ */
+static void irow_close_btmp(BDRVIrowState *s) {
+    if (s->bitmap) {
+        /* irow_open_btmp() obtains the bitmap from qemu_memalign(), so it
+         * must be returned with qemu_vfree(), not g_free(). */
+        qemu_vfree(s->bitmap);
+        s->bitmap = NULL;
+    }
+
+    if (s->irow_btmp) {
+        bdrv_delete(s->irow_btmp);
+        s->irow_btmp = NULL;
+    }
+}
+
+/*
+ * Delete the irvd (data file) block device of s, if one is open, and
+ * clear the pointer.
+ */
+static void irow_close_irvd(BDRVIrowState *s) {
+    if (s->irow_irvd == NULL) {
+        return;
+    }
+    bdrv_delete(s->irow_irvd);
+    s->irow_irvd = NULL;
+}
+
+/*
+ * Free an array of nb_snapshots IRowSnapshot entries together with every
+ * string each entry owns. A NULL array is accepted and ignored.
+ */
+static void irow_close_snapshots2(IRowSnapshot *snapshots, int nb_snapshots) {
+    int idx;
+
+    if (snapshots == NULL) {
+        return;
+    }
+
+    for (idx = 0; idx < nb_snapshots; idx++) {
+        IRowSnapshot *entry = &snapshots[idx];
+
+        /* g_free() ignores NULL, so unset fields need no special casing. */
+        g_free(entry->btmp_file);
+        entry->btmp_file = NULL;
+
+        g_free(entry->irvd_file);
+        entry->irvd_file = NULL;
+
+        g_free(entry->father_btmp_file);
+        entry->father_btmp_file = NULL;
+
+        g_free(entry->id_str);
+        entry->id_str = NULL;
+
+        g_free(entry->name);
+        entry->name = NULL;
+    }
+    g_free(snapshots);
+}
+
+/*
+ * Free the snapshot table owned by birows and clear the pointer to it.
+ */
+static void irow_close_snapshots(BDRVIrowState *birows) {
+    IRowSnapshot *table = birows->snapshots;
+
+    irow_close_snapshots2(table, birows->nb_snapshots);
+    birows->snapshots = NULL;
+}
+
+/*
+ * Release everything that belongs to the meta file of s: the cached file
+ * name strings, the meta block device and the snapshot table. Every
+ * released pointer is reset to NULL.
+ */
+static void irow_close_meta(BDRVIrowState *s) {
+    /* File name strings; g_free() ignores NULL. */
+    g_free(s->meta_file);
+    s->meta_file = NULL;
+
+    g_free(s->current_btmp_file);
+    s->current_btmp_file = NULL;
+
+    g_free(s->father_btmp_file);
+    s->father_btmp_file = NULL;
+
+    g_free(s->irvd_file);
+    s->irvd_file = NULL;
+
+    g_free(s->opened_btmp_file);
+    s->opened_btmp_file = NULL;
+
+    if (s->irow_meta != NULL) {
+        bdrv_delete(s->irow_meta);
+        s->irow_meta = NULL;
+    }
+    if (s->snapshots != NULL) {
+        irow_close_snapshots(s);
+    }
+}
+
+/*
+ * Tear down a complete IROW state: meta file, bitmap file and data file,
+ * in that order.
+ */
+static void irow_close_state(BDRVIrowState *s) {
+    irow_close_meta(s);
+    irow_close_btmp(s);
+    irow_close_irvd(s);
+}
+
+/*
+ * Return 1 when every byte of the bitmap is 0xff (all clusters present in
+ * this state), 0 as soon as a byte with a clear bit is found.
+ */
+static int irow_check_bitmap(BDRVIrowState *birows) {
+    uint64_t pos = 0;
+
+    while (pos < birows->bitmap_size) {
+        if (birows->bitmap[pos] != 0xff) {
+            return 0;
+        }
+        pos++;
+    }
+    return 1;
+}
+
+/*
+ * Flush the in-memory bitmap to the btmp file when it is dirty and size the
+ * btmp file to bitmap_size + vm_state_size.
+ *
+ * When the bitmap turns out to be completely set, complete_image is raised.
+ * A pending vmstate_is_saved flag triggers the same truncate.
+ *
+ * Returns 0 on success (or when nothing had to be done), -1 when writing
+ * the bitmap fails, or the bdrv_truncate() result.
+ */
+static int irow_update_btmp(BDRVIrowState *birows) {
+    int ret = 0;
+
+    if (birows->bitmap_is_dirty) {
+        if (bdrv_pwrite(birows->irow_btmp, 0, birows->bitmap, birows->bitmap_size) != birows->bitmap_size) {
+            fprintf(stderr, "Failed to write the IROW bitmap data to %s\n", birows->opened_btmp_file);
+            return -1;
+        }
+        birows->bitmap_is_dirty = 0;
+        if (irow_check_bitmap(birows)) {
+            birows->complete_image = 1;
+        }
+        ret = bdrv_truncate(birows->irow_btmp, birows->bitmap_size + birows->vm_state_size);
+        if (ret < 0) {
+            /* Report the failure now; previously the vmstate branch below
+             * could overwrite it with the result of a second truncate. */
+            return ret;
+        }
+    }
+    if (birows->vmstate_is_saved) {
+        birows->vmstate_is_saved = 0;
+        ret = bdrv_truncate(birows->irow_btmp, birows->bitmap_size + birows->vm_state_size);
+    }
+
+    return ret;
+}
+
+/*
+ * Rewrite the meta file of birows.
+ *
+ * current_btmp: when non-NULL, becomes the new "current btmp" file name
+ *               stored in the meta header.
+ * change_copy_on_demand_state: when non-zero, the copy_on_demand flag in
+ *               the meta header is toggled.
+ *
+ * When birows->snapshots_is_dirty is set, every snapshot header is
+ * rewritten after the meta header and the flag is cleared. Finally the meta
+ * file is truncated to header + nb_snapshots snapshot headers.
+ *
+ * The date/clock/size fields of a snapshot are stored exactly as held in
+ * memory, which is also how irow_open_snapshots() reads them back.
+ *
+ * Returns 0 when nothing had to be done or on success, -1 on I/O errors or
+ * missing snapshot file names, or the bdrv_truncate() result.
+ */
+static int irow_update_meta(BDRVIrowState *birows, const char *current_btmp, int change_copy_on_demand_state) {
+    int i, ret = 0;
+    uint32_t copy_on_demand;
+    IRowMeta meta;
+    IRowSnapshotHeader snap_header;
+    IRowSnapshot *snap_ptr;
+
+    if(change_copy_on_demand_state == 0 && birows->snapshots_is_dirty == 0 && current_btmp == NULL)
+        goto end;
+
+    if(bdrv_pread (birows->irow_meta, 0, &meta, sizeof(meta)) != sizeof(meta)) {
+        fprintf (stderr, "Failed to read the meta data from %s\n", birows->meta_file);
+        ret = -1;
+        goto end;
+    }
+    if(change_copy_on_demand_state) {
+        /* The flag is big-endian on disk: decode, toggle, encode. */
+        copy_on_demand = meta.copy_on_demand;
+        be32_to_cpus(&copy_on_demand);
+        copy_on_demand = copy_on_demand ? 0 : 1;
+        meta.copy_on_demand = cpu_to_be32(copy_on_demand);
+    }
+    if(current_btmp != NULL) {
+        /* The field is zeroed first and one byte is held back, so the
+         * stored name is always NUL-terminated. */
+        memset(meta.current_btmp, 0, MAX_FILE_NAME_LENGTH);
+        strncpy(meta.current_btmp, current_btmp, MAX_FILE_NAME_LENGTH - 1);
+    }
+
+    if(birows->snapshots_is_dirty) {
+        meta.nb_snapshots = cpu_to_be32(birows->nb_snapshots);
+        for(i = 0; i < birows->nb_snapshots; i++) {
+            /* snap_header starts out all-zero; every string copy below
+             * leaves the last byte of its field untouched so the stored
+             * strings stay NUL-terminated. */
+            memset(&snap_header, 0, sizeof(snap_header));
+            snap_ptr = birows->snapshots + i;
+            snap_header.snap_magic = cpu_to_be32(IROW_SNAPHEADER_MAGIC);
+            snap_header.date_sec = snap_ptr->date_sec;
+            snap_header.date_nsec = snap_ptr->date_nsec;
+            snap_header.vm_clock_nsec = snap_ptr->vm_clock_nsec;
+            snap_header.vm_state_size = snap_ptr->vm_state_size;
+            snap_header.nb_children = snap_ptr->nb_children;
+            snap_header.is_deleted = snap_ptr->is_deleted;
+            if(snap_ptr->id_str != NULL)
+                strncpy(snap_header.id_str, snap_ptr->id_str, 128 - 1);
+            if(snap_ptr->name != NULL)
+                strncpy(snap_header.name, snap_ptr->name, 256 - 1);
+            if(snap_ptr->btmp_file == NULL) {
+                fprintf(stderr, "Void btmp filename\n");
+                ret = -1;
+                goto end;
+            }
+            strncpy(snap_header.btmp_file, snap_ptr->btmp_file, MAX_FILE_NAME_LENGTH - 1);
+            if(snap_ptr->irvd_file == NULL) {
+                fprintf(stderr, "Void irvd filename\n");
+                ret = -1;
+                goto end;
+            }
+            strncpy(snap_header.irvd_file, snap_ptr->irvd_file, MAX_FILE_NAME_LENGTH - 1);
+            if(snap_ptr->father_btmp_file != NULL)
+                strncpy(snap_header.father_btmp_file, snap_ptr->father_btmp_file, MAX_FILE_NAME_LENGTH - 1);
+
+            if(bdrv_pwrite(birows->irow_meta, sizeof(meta) + i * sizeof(IRowSnapshotHeader), &snap_header, sizeof(snap_header)) != sizeof(snap_header)) {
+                fprintf (stderr, "Failed to write the snapshot #%d info to %s\n", i, birows->meta_file);
+                ret = -1;
+                goto end;
+            }
+        }
+        birows->snapshots_is_dirty = 0;
+    }
+
+    if(bdrv_pwrite(birows->irow_meta, 0, &meta, sizeof(meta)) != sizeof(meta)) {
+        fprintf (stderr, "Failed to write the meta data to %s\n", birows->meta_file);
+        ret = -1;
+        goto end;
+    }
+
+    ret = bdrv_truncate(birows->irow_meta, sizeof(meta) + (birows->nb_snapshots) * sizeof(IRowSnapshotHeader));
+
+end:
+    return ret;
+}
+
+/*
+ * Destroy a state created by irow_open_previous_state().
+ * Such a state only borrows the meta block device of the state it was
+ * derived from, so the pointer is dropped before the generic teardown to
+ * keep irow_close_meta() from deleting it.
+ */
+static void irow_close_previous_state(BDRVIrowState *birows) {
+    birows->irow_meta = NULL;
+
+    irow_close_state(birows);
+
+    g_free(birows);
+}
+
+/*
+ * Close every cached ancestor state and release the global birows_cache
+ * array. The number of slots visited is birows->nb_snapshots.
+ */
+static void irow_free_birows_cache(BDRVIrowState *birows) {
+    int slot;
+
+    if (birows_cache == NULL) {
+        return;
+    }
+
+    for (slot = 0; slot < birows->nb_snapshots; slot++) {
+        if (birows_cache[slot] == NULL) {
+            continue;
+        }
+        irow_close_previous_state(birows_cache[slot]);
+    }
+    g_free(birows_cache);
+    birows_cache = NULL;
+}
+
+/*
+ * Close an IROW image: drop the cached ancestor states first, then the
+ * state stored in bs->opaque.
+ */
+static void irow_close(BlockDriverState *bs) {
+    BDRVIrowState *state = bs->opaque;
+
+    irow_free_birows_cache(state);
+    irow_close_state(state);
+}
+
+/*
+ * Load the snapshot table of birows from its meta file.
+ *
+ * birows->nb_snapshots headers are read starting at IROW_SNAPSHOT_OFFSET
+ * and converted into freshly allocated IRowSnapshot entries. id_str, name
+ * and father_btmp_file stay NULL when the stored string is empty; an empty
+ * btmp or irvd file name is an error.
+ *
+ * Returns 0 on success. On failure returns -1 with the partially built
+ * table freed and birows->snapshots reset to NULL.
+ */
+static int irow_open_snapshots(BDRVIrowState *birows) {
+    int i, ret = 0;
+    IRowSnapshotHeader snap_header;
+    IRowSnapshot *snap_ptr;
+    int64_t offset;
+
+    birows->snapshots = g_malloc0(sizeof(IRowSnapshot) * birows->nb_snapshots);
+    offset = IROW_SNAPSHOT_OFFSET;
+    for(i = 0; i < birows->nb_snapshots; i++) {
+        if(bdrv_pread(birows->irow_meta, offset, &snap_header, sizeof(snap_header)) != sizeof(snap_header)) {
+            fprintf(stderr, "Failed to read snapshot #%d info from %s\n", i, birows->meta_file);
+            ret = -1;
+            goto fail;
+        }
+        snap_ptr = birows->snapshots + i;
+        snap_ptr->date_sec = snap_header.date_sec;
+        snap_ptr->date_nsec = snap_header.date_nsec;
+        snap_ptr->vm_clock_nsec = snap_header.vm_clock_nsec;
+        snap_ptr->vm_state_size = snap_header.vm_state_size;
+        snap_ptr->nb_children = snap_header.nb_children;
+        snap_ptr->is_deleted = snap_header.is_deleted;
+
+        /* Every destination is zero-filled and one byte larger than what is
+         * copied, so the in-memory strings are always NUL-terminated even
+         * if the on-disk field is completely filled. */
+        if(snap_header.id_str[0] != '\0') {
+            snap_ptr->id_str = g_malloc0(128);
+            strncpy(snap_ptr->id_str, snap_header.id_str, 128 - 1);
+        }
+        if(snap_header.name[0] != '\0') {
+            snap_ptr->name = g_malloc0(256);
+            strncpy(snap_ptr->name, snap_header.name, 256 - 1);
+        }
+        /* Test the first character: comparing the array itself against
+         * '\0' is a pointer comparison that can never be true. */
+        if(snap_header.btmp_file[0] == '\0') {
+            fprintf(stderr, "Invalid btmp file name. (snapshot #%d)\n", i);
+            ret = -1;
+            goto fail;
+        }
+        snap_ptr->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        strncpy(snap_ptr->btmp_file, snap_header.btmp_file, MAX_FILE_NAME_LENGTH - 1);
+        if(snap_header.irvd_file[0] == '\0') {
+            fprintf(stderr, "Invalid irvd file name. (snapshot #%d)\n", i);
+            ret = -1;
+            goto fail;
+        }
+        snap_ptr->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        strncpy(snap_ptr->irvd_file, snap_header.irvd_file, MAX_FILE_NAME_LENGTH - 1);
+        if(snap_header.father_btmp_file[0] != '\0') {
+            snap_ptr->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+            strncpy(snap_ptr->father_btmp_file, snap_header.father_btmp_file, MAX_FILE_NAME_LENGTH - 1);
+        }
+        offset += sizeof(snap_header);
+    }
+    birows->snapshots_is_dirty = 0;
+
+    return ret;
+fail:
+    irow_close_snapshots(birows);
+    return ret;
+}
+
+/*
+ * Open the meta file `filename`, validate its header and populate birows
+ * (geometry, bitmap size, file names, snapshot table) as well as
+ * bs->total_sectors and bs->backing_file.
+ *
+ * The global cluster_cache is created on first use, sized to one cluster.
+ *
+ * Returns 0 on success, a negative value on failure. Resources acquired
+ * before a failure stay attached to birows; irow_open() releases them by
+ * calling irow_close().
+ */
+static int irow_open_meta(BlockDriverState *bs, BDRVIrowState *birows, const char *filename, int flags) {
+    int ret = 0;
+    IRowMeta meta;
+
+    birows->irow_meta = bdrv_new ("");
+    ret = bdrv_file_open(&birows->irow_meta, filename, flags);
+    if (ret < 0) {
+        fprintf (stderr, "Failed to open %s\n", filename);
+        goto end;
+    }
+    if (bdrv_pread (birows->irow_meta, 0, &meta, sizeof(meta)) != sizeof(meta)) {
+        fprintf (stderr, "Failed to read the IROW meta data from %s\n", filename);
+        ret = -1;
+        goto end;
+    }
+    /* All header integers are stored big-endian. */
+    be32_to_cpus(&meta.magic);
+    be32_to_cpus(&meta.version);
+    be32_to_cpus(&meta.copy_on_demand);
+    be32_to_cpus(&meta.cluster_size);
+    be32_to_cpus(&meta.cluster_bits);
+    be64_to_cpus(&meta.total_clusters);
+    be32_to_cpus(&meta.sectors_per_cluster);
+    be64_to_cpus(&meta.disk_size);
+    be32_to_cpus(&meta.nb_snapshots);
+
+    /* Cross-check the redundant header fields against each other. */
+    if(meta.magic != IROW_MAGIC || meta.version != IROW_VERSION) {
+        fprintf (stderr, "Invalid magic number or version number!\n");
+        ret = -1;
+        goto end;
+    }
+    if((meta.cluster_bits < MIN_CLUSTER_BITS) || (meta.cluster_bits > MAX_CLUSTER_BITS)) {
+        fprintf (stderr, "Invalid cluster_bits!\n");
+        ret = -1;
+        goto end;
+    }
+    if(meta.cluster_bits != get_bits_from_size(meta.cluster_size)) {
+        fprintf (stderr, "cluster_size and cluster_bits do not match!\n");
+        ret = -1;
+        goto end;
+    }
+    if(meta.total_clusters != ((meta.disk_size + meta.cluster_size - 1) >> meta.cluster_bits)) {
+        fprintf (stderr, "total_clusters and disk_size do not match!\n");
+        ret = -1;
+        goto end;
+    }
+    if(meta.sectors_per_cluster != (meta.cluster_size >> BDRV_SECTOR_BITS)) {
+        fprintf (stderr, "Invalid sectors_per_cluster!\n");
+        ret = -1;
+        goto end;
+    }
+    birows->copy_on_demand = meta.copy_on_demand;
+    birows->cluster_size = meta.cluster_size;
+    birows->cluster_bits = meta.cluster_bits;
+    birows->total_clusters = meta.total_clusters;
+    birows->sectors_per_cluster = meta.sectors_per_cluster;
+    birows->disk_size = meta.disk_size;
+    bs->total_sectors = meta.disk_size / BDRV_SECTOR_SIZE;
+    /* One bit per cluster, rounded up to whole bytes. */
+    birows->bitmap_size = (birows->total_clusters + 7) >> 3;
+    birows->nb_snapshots = meta.nb_snapshots;
+
+    /* Zero-filled buffers plus a copy limit of length - 1 guarantee that
+     * the stored names are NUL-terminated even when the source is longer
+     * or lacks a terminator. */
+    birows->meta_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    strncpy(birows->meta_file, filename, MAX_FILE_NAME_LENGTH - 1);
+    birows->current_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    strncpy(birows->current_btmp_file, meta.current_btmp, MAX_FILE_NAME_LENGTH - 1);
+    strncpy(bs->backing_file, meta.backing_file, sizeof(bs->backing_file) - 1);
+    bs->backing_file[sizeof(bs->backing_file) - 1] = '\0';
+
+    if(cluster_cache == NULL) {
+        cluster_cache = g_malloc0(sizeof(ClusterCache));
+        if(cluster_cache != NULL) {
+            cluster_cache->cache = qemu_memalign(512, birows->cluster_size);
+            if(cluster_cache->cache != NULL)
+                memset(cluster_cache->cache, 0, birows->cluster_size);
+            else {
+                fprintf(stderr, "Failed to create father cache\n");
+                ret = -1;
+                goto end;
+            }
+            /* -1 marks the cache as holding no cluster. */
+            cluster_cache->cluster_num = -1;
+        } else {
+            fprintf(stderr, "Failed to create father cache\n");
+            ret = -1;
+            goto end;
+        }
+    }
+
+    if(irow_open_snapshots(birows) < 0) {
+        fprintf(stderr, "Failed to read snapshots info from %s\n", birows->meta_file);
+        ret = -1;
+        goto end;
+    }
+
+end:
+    return ret;
+}
+
+/*
+ * Open the bitmap file `filename` and load its first bitmap_size bytes into
+ * a newly allocated birows->bitmap. The dirty / vmstate flags are cleared
+ * and complete_image reflects whether every bitmap byte is 0xff.
+ *
+ * Returns the bdrv_file_open() result, or -1 when the bitmap cannot be read.
+ */
+static int irow_open_btmp(BDRVIrowState *birows, const char *filename, int flags) {
+    int ret;
+
+    birows->irow_btmp = bdrv_new ("");
+    ret = bdrv_file_open(&birows->irow_btmp, filename, flags);
+    if (ret >= 0) {
+        birows->bitmap = qemu_memalign(512, birows->bitmap_size);
+        if (bdrv_pread(birows->irow_btmp, 0, birows->bitmap, birows->bitmap_size) != birows->bitmap_size) {
+            fprintf(stderr, "Failed to read bitmap from %s\n", filename);
+            return -1;
+        }
+        birows->bitmap_is_dirty = 0;
+        birows->vmstate_is_saved = 0;
+        birows->complete_image = irow_check_bitmap(birows) ? 1 : 0;
+    }
+    return ret;
+}
+
+/*
+ * Open the irvd data file `filename` as birows->irow_irvd and return the
+ * bdrv_file_open() result.
+ */
+static int irow_open_vd(BDRVIrowState *birows, const char *filename, int flags) {
+    birows->irow_irvd = bdrv_new ("");
+    return bdrv_file_open(&birows->irow_irvd, filename, flags);
+}
+
+/*
+ * Open the bitmap file and the data file named by birows->opened_btmp_file
+ * and birows->irvd_file. The bitmap file is opened first; the data file
+ * name is only examined once that succeeded.
+ *
+ * Returns 0 on success, -1 when a name is missing/empty or an open fails.
+ */
+static int irow_open_data(BDRVIrowState *birows, int flags) {
+    if (birows->opened_btmp_file == NULL || birows->opened_btmp_file[0] == '\0') {
+        fprintf (stderr, "Void btmp file name\n");
+        return -1;
+    }
+    if (irow_open_btmp(birows, birows->opened_btmp_file, flags) < 0) {
+        fprintf (stderr, "Failed to open %s\n", birows->opened_btmp_file);
+        return -1;
+    }
+
+    if (birows->irvd_file == NULL || birows->irvd_file[0] == '\0') {
+        fprintf (stderr, "Void irvd file name\n");
+        return -1;
+    }
+    if (irow_open_vd(birows, birows->irvd_file, flags) < 0) {
+        fprintf (stderr, "Failed to open %s\n", birows->irvd_file);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Return the index of the first snapshot whose btmp file name equals btmp,
+ * or -1 when no snapshot matches. Entries without a btmp name are skipped.
+ */
+static int irow_find_snapshot_by_btmp(BDRVIrowState *birows, const char *btmp) {
+    int idx = 0;
+
+    while (idx < birows->nb_snapshots) {
+        const char *candidate = birows->snapshots[idx].btmp_file;
+
+        if (candidate != NULL && strcmp(candidate, btmp) == 0) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/*
+ * Copy the file names and the VM state size of snapshot `snapshot_index`
+ * into birows (opened_btmp_file, irvd_file, father_btmp_file,
+ * vm_state_size). The strings are fresh allocations owned by birows.
+ *
+ * Returns 0 on success, -1 when the index is outside the snapshot table,
+ * the table is missing, or the snapshot lacks a btmp/irvd file name.
+ */
+static int irow_load_info_from_snapshot(BDRVIrowState *birows, int snapshot_index) {
+    IRowSnapshot *snap;
+    int ret = 0;
+
+    /* Reject indexes past the end of the table as well as negative ones
+     * (-1 is what irow_find_snapshot_by_btmp() reports for "not found"). */
+    if(snapshot_index < 0 || snapshot_index >= birows->nb_snapshots || birows->snapshots == NULL) {
+        fprintf (stderr, "Invalid snapshot index.\n");
+        ret = -1;
+        goto end;
+    }
+    snap = birows->snapshots + snapshot_index;
+    if(snap->btmp_file == NULL) {
+        fprintf (stderr, "Void btmp file name in snap info\n");
+        ret = -1;
+        goto end;
+    }
+    if(snap->irvd_file == NULL) {
+        fprintf (stderr, "Void irvd file name in snap info\n");
+        ret = -1;
+        goto end;
+    }
+    /* Zero-filled destinations and a copy limit of length - 1 keep the
+     * copies NUL-terminated. */
+    birows->opened_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    birows->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    strncpy(birows->opened_btmp_file, snap->btmp_file, MAX_FILE_NAME_LENGTH - 1);
+    strncpy(birows->irvd_file, snap->irvd_file, MAX_FILE_NAME_LENGTH - 1);
+    if(snap->father_btmp_file) {
+        birows->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        strncpy(birows->father_btmp_file, snap->father_btmp_file, MAX_FILE_NAME_LENGTH - 1);
+    }
+    birows->vm_state_size = snap->vm_state_size;
+end:
+    return ret;
+}
+
+/*
+ * Build a state for snapshot `snap_index` of the image described by birows
+ * and open that snapshot's bitmap and data files.
+ *
+ * The new state copies the geometry of birows and borrows its meta block
+ * device (it does not own it); it gets its own snapshot table.
+ *
+ * Returns the new state, or NULL on failure. Release it with
+ * irow_close_previous_state().
+ */
+static BDRVIrowState *irow_open_previous_state(BDRVIrowState *birows, int snap_index) {
+    BDRVIrowState *new_birows = g_malloc0(sizeof(BDRVIrowState));
+
+    new_birows->cluster_size = birows->cluster_size;
+    new_birows->cluster_bits = birows->cluster_bits;
+    new_birows->total_clusters = birows->total_clusters;
+    new_birows->sectors_per_cluster = birows->sectors_per_cluster;
+    new_birows->disk_size = birows->disk_size;
+    new_birows->bitmap_size = birows->bitmap_size;
+    /* Bounded copy into a zero-filled buffer: never overruns and is always
+     * NUL-terminated. */
+    new_birows->current_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    strncpy(new_birows->current_btmp_file, birows->current_btmp_file, MAX_FILE_NAME_LENGTH - 1);
+
+    new_birows->nb_snapshots = birows->nb_snapshots;
+    new_birows->irow_meta = birows->irow_meta;
+    /* Without a snapshot table the lookup below would index a NULL array. */
+    if(irow_open_snapshots(new_birows) < 0) {
+        goto fail;
+    }
+
+    if(irow_load_info_from_snapshot(new_birows, snap_index) < 0) {
+        goto fail;
+    }
+    new_birows->open_flags = birows->open_flags;
+    if(irow_open_data(new_birows, new_birows->open_flags) < 0) {
+        goto fail;
+    }
+
+    return new_birows;
+
+fail:
+    irow_close_previous_state(new_birows);
+    return NULL;
+}
+
+/*
+ * Allocate the global birows_cache with one zeroed slot per snapshot.
+ * Returns 0 when the resulting pointer is non-NULL, -1 otherwise.
+ */
+static int irow_init_birows_cache(BDRVIrowState *birows) {
+    birows_cache = g_malloc0(sizeof(BDRVIrowState *) * birows->nb_snapshots);
+    return (birows_cache != NULL) ? 0 : -1;
+}
+
+/*
+ * Open an IROW image: read the meta file bs->filename, locate the snapshot
+ * that represents the current state, open its bitmap and data files and
+ * set up the ancestor-state cache.
+ *
+ * Returns 0 on success. On any failure the image is closed again and -1 is
+ * returned.
+ */
+static int irow_open(BlockDriverState *bs, int flags) {
+    BDRVIrowState *state = bs->opaque;
+    int current_idx;
+
+    state->open_flags = flags;
+
+    if (irow_open_meta(bs, state, bs->filename, flags) < 0) {
+        fprintf (stderr, "Failed to open %s\n", bs->filename);
+        goto fail;
+    }
+
+    current_idx = irow_find_snapshot_by_btmp(state, state->current_btmp_file);
+    if (irow_load_info_from_snapshot(state, current_idx) < 0) {
+        fprintf (stderr, "Failed to load filename from snapshot\n");
+        goto fail;
+    }
+
+    if (irow_open_data(state, flags) < 0) {
+        goto fail;
+    }
+
+    if (irow_init_birows_cache(state) < 0) {
+        fprintf (stderr, "Failed to create birows_cache\n");
+        goto fail;
+    }
+
+    return 0;
+
+fail:
+    irow_close (bs);
+    return -1;
+}
+
+/*
+ * Return the bitmap bit (0 or 1) of cluster_index. Bit n of byte m covers
+ * cluster m * 8 + n. No bounds checking is performed.
+ */
+static int irow_get_bit(BDRVIrowState *birows, int64_t cluster_index) {
+    const int64_t byte_pos = cluster_index >> 3;
+    const int64_t shift = cluster_index & 0x7;
+
+    return (birows->bitmap[byte_pos] >> shift) & 1;
+}
+
+/*
+ * Set the bitmap bit of cluster_index and mark the bitmap dirty if the bit
+ * was clear before. A cluster_cache entry for the same cluster number is
+ * invalidated first.
+ */
+static void irow_set_bit(BDRVIrowState *birows, int64_t cluster_index) {
+    const int64_t byte_pos = cluster_index >> 3;
+    const int64_t shift = cluster_index & 0x7;
+
+    if (cluster_cache != NULL && cluster_index == cluster_cache->cluster_num) {
+        cluster_cache->cluster_num = -1;
+    }
+
+    if (((birows->bitmap[byte_pos] >> shift) & 1) == 0) {
+        birows->bitmap[byte_pos] |= (1 << shift);
+        birows->bitmap_is_dirty = 1;
+    }
+}
+
+static int irow_read_missing_clusters2(BlockDriverState *bs, BDRVIrowState *birows, int64_t start_cluster, int64_t nb_clusters, uint8_t *buf, uint8_t *buf_bitmap, uint64_t buf_start) { /* Resolve clusters [start_cluster, start_cluster + nb_clusters) against the state `birows`.
+ * Clusters whose bit is clear in birows->bitmap get buf_bitmap[buf_start + i] = 1 and are fetched from
+ * the father snapshot (recursively, via a state lazily opened into birows_cache) or, when birows has no
+ * father, from bs->backing_hd. Clusters whose bit is set are read from birows->irow_irvd, but only when
+ * birows is not the current state (opened_btmp_file differs from current_btmp_file).
+ * Data of cluster start_cluster + i is placed at buf + (buf_start + i) * cluster_size.
+ * The walk batches consecutive missing / present clusters into runs and handles a run when it ends.
+ * Returns 0 on success, -1 on a read failure or when a father state cannot be opened.
+ * NOTE(review): the index returned by irow_find_snapshot_by_btmp() is used without checking for -1,
+ * which would index birows_cache out of bounds -- confirm a father is always found. */
+ int64_t continuous_missing_clusters, continuous_appearing_clusters, i, cluster_index, buf_index;
+ int64_t backing_len, backing_sector_num, backing_nb_sectors;
+ uint8_t *backing_buf;
+ int snap_index, ret = 0;
+ BlockDriver *drv; /* NOTE(review): assigned below but never read */
+
+ continuous_missing_clusters = 0;
+ continuous_appearing_clusters = 0;
+ for(i = 0; i < nb_clusters; i++) {
+ if(irow_get_bit(birows, start_cluster + i) == 0) {
+ buf_bitmap[buf_start + i] = 1; /* cluster is absent from this state */
+ continuous_missing_clusters += 1;
+ if(continuous_appearing_clusters != 0) { /* a run of present clusters just ended at i - 1 */
+ if(strcmp(birows->current_btmp_file, birows->opened_btmp_file) != 0) { /* only read when this is not the current state */
+ cluster_index = start_cluster + i - continuous_appearing_clusters;
+ buf_index = buf_start + i - continuous_appearing_clusters;
+ if(cluster_cache != NULL) {
+ if(cluster_cache->cache != NULL) {
+ if(cluster_index == cluster_cache->cluster_num) { /* first cluster of the run is served from the cache */
+ memcpy(buf + buf_index * birows->cluster_size, cluster_cache->cache, birows->cluster_size);
+ cluster_index += 1;
+ buf_index += 1;
+ continuous_appearing_clusters -= 1;
+ if(continuous_appearing_clusters == 0) {
+ continue; /* run fully satisfied; the counter is already 0 here */
+ }
+ }
+ }
+ }
+ drv = birows->irow_irvd->drv;
+ if(bdrv_read(birows->irow_irvd,
+ cluster_index * birows->sectors_per_cluster,
+ buf + buf_index * birows->cluster_size,
+ continuous_appearing_clusters * birows->sectors_per_cluster) < 0) {
+ fprintf(stderr, "Failed to read clusters from %s\n", birows->irvd_file);
+ ret = -1;
+ goto end;
+ }
+ if(cluster_cache != NULL) {
+ if(cluster_cache->cache != NULL) { /* remember the last cluster of the run just read */
+ memcpy(cluster_cache->cache, buf + (buf_start + i - 1) * birows->cluster_size, birows->cluster_size);
+ cluster_cache->cluster_num = start_cluster + i - 1;
+ }
+ }
+ }
+ continuous_appearing_clusters = 0;
+ }
+ } else {
+ continuous_appearing_clusters += 1;
+ if(continuous_missing_clusters != 0) { /* a run of missing clusters just ended at i - 1 */
+ if(birows->father_btmp_file != NULL) {
+ snap_index = irow_find_snapshot_by_btmp(birows, birows->father_btmp_file);
+ if(birows_cache[snap_index] == NULL) { /* open the father state on first use */
+ birows_cache[snap_index] = irow_open_previous_state(birows, snap_index);
+ if(birows_cache[snap_index] == NULL) {
+ ret = -1;
+ goto end;
+ }
+ }
+ ret = irow_read_missing_clusters2(bs, /* NOTE(review): ret is not checked here; a later successful call overwrites a failure */
+ birows_cache[snap_index],
+ start_cluster + i - continuous_missing_clusters,
+ continuous_missing_clusters,
+ buf,
+ buf_bitmap,
+ buf_start + i - continuous_missing_clusters);
+
+ } else {
+ if(bs->backing_hd) { /* no father snapshot: fall back to the backing image, if any */
+ backing_len = bdrv_getlength(bs->backing_hd) / 512;
+ backing_sector_num = (start_cluster + i - continuous_missing_clusters) * birows->sectors_per_cluster;
+ backing_nb_sectors = continuous_missing_clusters * birows->sectors_per_cluster;
+ backing_buf = buf + (buf_start + i - continuous_missing_clusters) * birows->cluster_size;
+ if(backing_sector_num < backing_len) {
+ if(backing_nb_sectors > backing_len - backing_sector_num) { /* clamp the read to the end of the backing image */
+ backing_nb_sectors = backing_len - backing_sector_num;
+ }
+ if(bdrv_read(bs->backing_hd, backing_sector_num, backing_buf, backing_nb_sectors)<0) {
+ fprintf(stderr, "failed to read base image: %s\n", bs->backing_file);
+ ret = -1;
+ goto end;
+ }
+ }
+ }
+ }
+ continuous_missing_clusters = 0;
+ }
+ }
+ }
+ if(continuous_missing_clusters != 0) { /* trailing run of missing clusters; i == nb_clusters here */
+ if(birows->father_btmp_file != NULL) {
+ snap_index = irow_find_snapshot_by_btmp(birows, birows->father_btmp_file);
+ if(birows_cache[snap_index] == NULL) {
+ birows_cache[snap_index] = irow_open_previous_state(birows, snap_index);
+ if(birows_cache[snap_index] == NULL) {
+ ret = -1;
+ goto end;
+ }
+ }
+ ret = irow_read_missing_clusters2(bs,
+ birows_cache[snap_index],
+ start_cluster + i - continuous_missing_clusters,
+ continuous_missing_clusters,
+ buf,
+ buf_bitmap,
+ buf_start + i - continuous_missing_clusters);
+
+ } else {
+ if(bs->backing_hd) {
+ backing_len = bdrv_getlength(bs->backing_hd) / 512;
+ backing_sector_num = (start_cluster + i - continuous_missing_clusters) * birows->sectors_per_cluster;
+ backing_nb_sectors = continuous_missing_clusters * birows->sectors_per_cluster;
+ backing_buf = buf + (buf_start + i - continuous_missing_clusters) * birows->cluster_size;
+ if(backing_sector_num < backing_len) {
+ if(backing_nb_sectors > backing_len - backing_sector_num) {
+ backing_nb_sectors = backing_len - backing_sector_num;
+ }
+ if(bdrv_read(bs->backing_hd, backing_sector_num, backing_buf, backing_nb_sectors)<0) {
+ fprintf(stderr, "failed to read base image: %s\n", bs->backing_file);
+ ret = -1;
+ goto end;
+ }
+ }
+ }
+ }
+ continuous_missing_clusters = 0;
+ }
+
+ if(continuous_appearing_clusters != 0) { /* trailing run of present clusters */
+ if(strcmp(birows->current_btmp_file, birows->opened_btmp_file) != 0) {
+ cluster_index = start_cluster + i - continuous_appearing_clusters;
+ buf_index = buf_start + i - continuous_appearing_clusters;
+ if(cluster_cache != NULL) {
+ if(cluster_cache->cache != NULL) {
+
+ if(cluster_index == cluster_cache->cluster_num) {
+ memcpy(buf + buf_index * birows->cluster_size, cluster_cache->cache, birows->cluster_size);
+ cluster_index += 1;
+ buf_index += 1;
+ continuous_appearing_clusters -= 1;
+ if(continuous_appearing_clusters == 0) {
+ goto end;
+ }
+ }
+ }
+ }
+ drv = birows->irow_irvd->drv;
+ if(bdrv_read(birows->irow_irvd,
+ cluster_index * birows->sectors_per_cluster,
+ buf + buf_index * birows->cluster_size,
+ continuous_appearing_clusters * birows->sectors_per_cluster) < 0) {
+ fprintf(stderr, "Failed to read clusters from %s\n", birows->irvd_file);
+ ret = -1; /* NOTE(review): unlike the in-loop copy of this code there is no goto end, so the cache below is still refreshed from buf */
+ }
+ if(cluster_cache != NULL) {
+ if(cluster_cache->cache != NULL) {
+ memcpy(cluster_cache->cache, buf + (buf_start + i - 1) * birows->cluster_size, birows->cluster_size);
+ cluster_cache->cluster_num = start_cluster + i - 1;
+ }
+ }
+ }
+ continuous_appearing_clusters = 0;
+ }
+
+end:
+ return ret;
+}
+
+/*
+ * Entry point for fetching clusters that the current state of bs lacks.
+ *
+ * is_read != 0: resolve the whole range first_cluster..last_cluster into
+ *               buf / buf_bitmap starting at slot 0.
+ * is_read == 0: resolve only first_cluster into slot 0 and, when it
+ *               differs, last_cluster into slot 1.
+ *
+ * Returns -1 when either cluster number is not below total_clusters,
+ * otherwise the result of irow_read_missing_clusters2().
+ */
+static int irow_read_missing_clusters(BlockDriverState *bs, int64_t first_cluster, int64_t last_cluster, uint8_t *buf, uint8_t *buf_bitmap, int is_read) {
+    BDRVIrowState *birows = bs->opaque;
+    int64_t span;
+    int ret;
+
+    if (first_cluster >= birows->total_clusters) {
+        fprintf (stderr, "Invalid first_cluster!\n");
+        return -1;
+    }
+    if (last_cluster >= birows->total_clusters) {
+        fprintf (stderr, "Invalid last_cluster!\n");
+        return -1;
+    }
+
+    if (is_read) {
+        span = last_cluster - first_cluster + 1;
+        return irow_read_missing_clusters2(bs, birows, first_cluster, span, buf, buf_bitmap, 0);
+    }
+
+    ret = irow_read_missing_clusters2(bs, birows, first_cluster, 1, buf, buf_bitmap, 0);
+    if (ret < 0 || first_cluster == last_cluster) {
+        return ret;
+    }
+    return irow_read_missing_clusters2(bs, birows, last_cluster, 1, buf, buf_bitmap, 1);
+}
+
+/*
+ * Write nb_clusters whole clusters from buf into the irvd file of birows,
+ * starting at cluster cluster_index.
+ *
+ * Returns -1 when the first or the last cluster of the range is not below
+ * total_clusters, otherwise the bdrv_write() result.
+ */
+static int irow_write_clusters(BDRVIrowState *birows, int64_t cluster_index, const uint8_t *buf, int nb_clusters) {
+    int ret = 0;
+
+    if(cluster_index >= birows->total_clusters) {
+        fprintf (stderr, "Invalid cluster_index!\n");
+        ret = -1;
+        goto end;
+    }
+    if((cluster_index + nb_clusters -1) >= birows->total_clusters) {
+        fprintf (stderr, "Invalid cluster_index or nb_clusters!\n");
+        ret = -1;
+        goto end;
+    }
+    /* The unused local `drv` that used to be fetched here was dropped. */
+    ret = bdrv_write(birows->irow_irvd, birows->sectors_per_cluster * cluster_index, buf, birows->sectors_per_cluster * nb_clusters);
+
+end:
+    return ret;
+}
+
+/* Number of the first sector that belongs to cluster cluster_index. */
+static int64_t first_sector_in_cluster(BDRVIrowState *birows, int64_t cluster_index) {
+    int64_t first = cluster_index * birows->sectors_per_cluster;
+
+    return first;
+}
+
+/* Number of the last sector that belongs to cluster cluster_index. */
+static int64_t last_sector_in_cluster(BDRVIrowState *birows, int64_t cluster_index) {
+    int64_t last = (cluster_index + 1) * birows->sectors_per_cluster - 1;
+
+    return last;
+}
+
+static int irow_assert_clusters(BlockDriverState *bs, ClusterBuffer *cbuf, int64_t sector_num, int nb_sectors, int op_type) { /* Make the clusters touched by sectors [sector_num, sector_num + nb_sectors) usable for op_type.
+ * Reads (IROW_READ / IROW_AIO_READ): clusters missing from the current state are fetched into cbuf->buf and
+ * flagged in cbuf->read_from_father; with copy_on_demand set, each run of fetched clusters is also written
+ * to the current irvd file and its bitmap bits are set.
+ * Writes (IROW_WRITE / IROW_AIO_WRITE): a first and/or last cluster that the request covers only partly is
+ * fetched if missing, written to the current irvd file and marked present. Fully covered requests need nothing.
+ * Returns 0 on success, -1 when fetching or writing a cluster fails. Other op_type values do nothing. */
+ BDRVIrowState *birows = bs->opaque;
+ int64_t nb_clusters, i, first_cluster, last_cluster, continuous_cluster, cluster_offset;
+ uint8_t *buffer_offset;// *zero_buf = NULL;
+ int ret = 0;
+
+ first_cluster = sector_num / birows->sectors_per_cluster;
+ last_cluster = (sector_num + nb_sectors - 1) / birows->sectors_per_cluster;
+ nb_clusters = last_cluster - first_cluster + 1;
+
+ switch(op_type) {
+ case IROW_READ:
+ case IROW_AIO_READ:
+ if(irow_read_missing_clusters(bs, first_cluster, last_cluster, cbuf->buf, cbuf->read_from_father, 1) < 0) {
+ ret = -1;
+ goto end;
+ }
+
+ if(birows->copy_on_demand) {
+ continuous_cluster = 0;
+ for(i = 0; i < nb_clusters + 1; i++) { /* one extra iteration: slot nb_clusters acts as terminator and must read 0 (irow_read allocates nb_clusters + 1 zeroed flags) */
+ if(cbuf->read_from_father[i] == 0) {
+ if(continuous_cluster == 0)
+ continue;
+ cluster_offset = first_cluster + i - continuous_cluster; /* first cluster of the run that just ended */
+ buffer_offset = cbuf->buf + (i - continuous_cluster) * birows->cluster_size;
+ if(irow_write_clusters(birows, cluster_offset, buffer_offset, continuous_cluster) < 0) {
+ ret = -1;
+ goto end;
+ }
+ continuous_cluster = 0;
+ } else {
+ continuous_cluster += 1;
+ irow_set_bit(birows, first_cluster + i); /* bit is set before the run is written out */
+ }
+ }
+ }
+ break;
+ case IROW_WRITE:
+ case IROW_AIO_WRITE:
+ if(sector_num == first_sector_in_cluster(birows, first_cluster)) { /* request starts on a cluster boundary */
+ if((sector_num + nb_sectors - 1) == last_sector_in_cluster(birows, last_cluster)) {
+ break; /* whole clusters only: nothing to pre-fill */
+ } else {
+ if(irow_read_missing_clusters(bs, last_cluster, last_cluster, cbuf->buf, cbuf->read_from_father, 0) < 0) { /* only the tail cluster is partial; it lands in slot 0 */
+ ret = -1;
+ goto end;
+ }
+ if(cbuf->read_from_father[0] == 1) {
+ if(irow_write_clusters(birows, last_cluster , cbuf->buf, 1) < 0) {
+ ret = -1;
+ goto end;
+ }
+ irow_set_bit(birows, last_cluster);
+ }
+ break;
+ }
+ } else {
+ if((sector_num + nb_sectors - 1) == last_sector_in_cluster(birows, last_cluster)) { /* only the head cluster is partial */
+ if(irow_read_missing_clusters(bs, first_cluster, first_cluster, cbuf->buf, cbuf->read_from_father, 0) < 0) {
+ ret = -1;
+ goto end;
+ }
+ if(cbuf->read_from_father[0] == 1) {
+ if(irow_write_clusters(birows, first_cluster , cbuf->buf, 1) < 0) {
+ ret = -1;
+ goto end;
+ }
+ irow_set_bit(birows, first_cluster);
+ }
+ break;
+ } else {
+ if(irow_read_missing_clusters(bs, first_cluster, last_cluster, cbuf->buf, cbuf->read_from_father, 0) < 0) { /* head goes to slot 0, tail (if a different cluster) to slot 1 */
+ ret = -1;
+ goto end;
+ }
+ if(cbuf->read_from_father[0] == 1) {
+ if(irow_write_clusters(birows, first_cluster, cbuf->buf, 1) < 0) {
+ ret = -1;
+ goto end;
+ }
+ irow_set_bit(birows, first_cluster);
+ }
+ if(cbuf->read_from_father[1] == 1) { /* NOTE(review): slot 1 is not filled when first_cluster == last_cluster; presumably the caller zero-initialises the flags -- confirm */
+ if(irow_write_clusters(birows, last_cluster, cbuf->buf + birows->cluster_size, 1) < 0) {
+ ret = -1;
+ goto end;
+ }
+ irow_set_bit(birows, last_cluster);
+ }
+ break;
+ }
+ }
+ }
+
+end:
+ return ret;
+}
+
+/*
+ * Synchronous read of nb_sectors sectors starting at sector_num into buf.
+ *
+ * The covered clusters are first read whole from the current irvd file into
+ * a temporary buffer (the first cluster may come from cluster_cache) and
+ * the requested sectors are copied to buf. Unless the image is complete,
+ * clusters missing from the current state are then fetched through
+ * irow_assert_clusters() and their sectors overwrite what was copied.
+ *
+ * Returns 0 on success, a negative value on failure.
+ */
+static int irow_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) {
+
+    BDRVIrowState *s = bs->opaque;
+    int64_t first_cluster, last_cluster, nb_clusters, sector_index, cluster_index, buf_offset, temp_buf_offset, temp_buf_index;
+    int first_cluster_copied = 0;
+    ClusterBuffer cbuf;
+    int remain_sectors, cbuf_offset, len, ret = 0;
+    uint8_t *temp_buf = NULL;
+
+    first_cluster = sector_num / s->sectors_per_cluster;
+    last_cluster = (sector_num + nb_sectors - 1) / s->sectors_per_cluster;
+    nb_clusters = last_cluster - first_cluster + 1;
+    /* Byte offset of the first requested sector inside its cluster. */
+    temp_buf_offset = (sector_num & (s->sectors_per_cluster - 1)) * BDRV_SECTOR_SIZE;
+    temp_buf_index = 0;
+    cbuf.buf = NULL;
+    cbuf.read_from_father = NULL;
+
+    if(first_cluster >= s->total_clusters) {
+        fprintf (stderr, "Invalid sector_num.\n");
+        ret = -1;
+        goto end;
+    }
+    if(last_cluster >= s->total_clusters) {
+        fprintf (stderr, "Invalid nb_sectors.\n");
+        ret = -1;
+        goto end;
+    }
+
+    temp_buf = qemu_memalign(512, nb_clusters * s->cluster_size);
+    /* Check the allocation before touching it (the memset used to come
+     * first). */
+    if(temp_buf == NULL) {
+        fprintf (stderr, "Failed to create temp_buf.\n");
+        ret = -1;
+        goto end;
+    }
+    memset(temp_buf, 0, nb_clusters * s->cluster_size);
+    if(cluster_cache != NULL) {
+        if(cluster_cache->cache != NULL) {
+            if(first_cluster == cluster_cache->cluster_num) {
+                /* Serve the first cluster from the cache and shrink the
+                 * range that still has to be read from disk. */
+                memcpy(temp_buf, cluster_cache->cache, s->cluster_size);
+                first_cluster_copied = 1;
+                first_cluster += 1;
+                nb_clusters -= 1;
+                temp_buf_index += 1;
+            }
+        }
+    }
+
+    if(nb_clusters != 0) {
+        ret = bdrv_read(s->irow_irvd, first_cluster * s->sectors_per_cluster, temp_buf + temp_buf_index * s->cluster_size, nb_clusters * s->sectors_per_cluster);
+        if(ret < 0) {
+            goto end;
+        }
+    }
+
+    memcpy(buf, temp_buf + temp_buf_offset, nb_sectors * BDRV_SECTOR_SIZE);
+
+    if(nb_clusters != 0) {
+        if(first_cluster_copied) {
+            /* Restore the full range for the bookkeeping below. */
+            first_cluster -= 1;
+            nb_clusters += 1;
+        }
+        if(cluster_cache != NULL) {
+            if(cluster_cache->cache != NULL) {
+                /* Cache the last cluster, but only if the current state
+                 * really holds it. */
+                if(irow_get_bit(s, last_cluster)) {
+                    memcpy(cluster_cache->cache, temp_buf + (nb_clusters - 1) * s->cluster_size, s->cluster_size);
+                    cluster_cache->cluster_num = last_cluster;
+                }
+            }
+        }
+
+        if(s->complete_image != 1) {
+            cbuf.buf = qemu_memalign(512, nb_clusters * s->cluster_size);
+            memset(cbuf.buf, 0, nb_clusters * s->cluster_size);
+            /* One flag per cluster plus a zero terminator slot. */
+            cbuf.read_from_father = g_malloc0(nb_clusters + 1);
+
+            if(irow_assert_clusters(bs, &cbuf, first_sector_in_cluster(s, first_cluster), nb_clusters * s->sectors_per_cluster, IROW_READ) < 0) {
+                fprintf (stderr, "irow_assert_clusters() failed.\n");
+                ret = -1;
+                goto end;
+            }
+
+            /* Result intentionally not checked, as before: the data that
+             * was read stays valid even if the bitmap flush fails. */
+            irow_update_btmp(s);
+
+            sector_index = sector_num;
+            remain_sectors = nb_sectors;
+            buf_offset = 0;
+
+            /* Walk the request cluster by cluster and overlay the sectors
+             * of every cluster that had to be fetched from an ancestor. */
+            while(remain_sectors > 0) {
+                cluster_index = sector_index / s->sectors_per_cluster;
+                len = last_sector_in_cluster(s, cluster_index) - sector_index + 1;
+                if(len > remain_sectors)
+                    len = remain_sectors;
+
+                if(cbuf.read_from_father[cluster_index - first_cluster] == 1) {
+                    cbuf_offset = (sector_index & (s->sectors_per_cluster - 1)) + (cluster_index - first_cluster) * s->sectors_per_cluster;
+                    memcpy(buf + buf_offset, cbuf.buf + cbuf_offset * BDRV_SECTOR_SIZE, len * BDRV_SECTOR_SIZE);
+                }
+                sector_index = first_sector_in_cluster(s, cluster_index + 1);
+                remain_sectors -= len;
+                buf_offset += len * BDRV_SECTOR_SIZE;
+            }
+        }
+
+    }
+
+end:
+    /* Buffers obtained from qemu_memalign() are released with
+     * qemu_vfree(); only the flag array comes from g_malloc0(). */
+    if(cbuf.buf != NULL) {
+        qemu_vfree(cbuf.buf);
+        cbuf.buf = NULL;
+    }
+    if(cbuf.read_from_father != NULL) {
+        g_free(cbuf.read_from_father);
+        cbuf.read_from_father = NULL;
+    }
+    if(temp_buf != NULL) {
+        qemu_vfree(temp_buf);
+        temp_buf = NULL;
+    }
+    return ret;
+}
+
+/*
+ * Synchronous write of nb_sectors sectors from buf starting at sector_num.
+ *
+ * For an incomplete image (s->complete_image != 1) irow_assert_clusters() is
+ * called first with a two-cluster scratch buffer.  Afterwards every touched
+ * cluster is marked with irow_set_bit(), the data is written to the current
+ * irvd file at the same sector offset, and irow_update_btmp() is called.
+ *
+ * Returns a negative value on failure and a non-negative value otherwise.
+ */
+static int irow_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) {
+    BDRVIrowState *s = bs->opaque;
+    int64_t first_cluster, last_cluster, current_cluster;
+    ClusterBuffer cbuf;
+    int ret = 0;
+
+    /* must be set before the first "goto end": the cleanup code frees both */
+    cbuf.buf = NULL;
+    cbuf.read_from_father = NULL;
+
+    first_cluster = sector_num / s->sectors_per_cluster;
+    last_cluster = (sector_num + nb_sectors - 1) / s->sectors_per_cluster;
+
+    if (first_cluster >= s->total_clusters) {
+        fprintf (stderr, "Invalid sector_num!\n");
+        ret = -1;
+        goto end;
+    }
+    if (last_cluster >= s->total_clusters) {
+        fprintf (stderr, "Invalid nb_sectors!\n");
+        ret = -1;
+        goto end;
+    }
+
+    if (s->complete_image != 1) {
+        cbuf.buf = qemu_memalign(512, 2 * s->cluster_size);
+        memset(cbuf.buf, 0, 2 * s->cluster_size);
+        cbuf.read_from_father = g_malloc0(2);
+        if (irow_assert_clusters(bs, &cbuf, sector_num, nb_sectors, IROW_WRITE) < 0) {
+            ret = -1;
+            goto end;
+        }
+    }
+
+    for (current_cluster = first_cluster; current_cluster <= last_cluster; current_cluster++) {
+        irow_set_bit(s, current_cluster);
+    }
+
+    ret = bdrv_write(s->irow_irvd, sector_num, buf, nb_sectors);
+    if (ret < 0) {
+        goto end;
+    }
+
+    if (irow_update_btmp(s) < 0) {
+        fprintf (stderr, "Failed to update btmp file. (%s)\n", s->opened_btmp_file);
+        ret = -1;
+        goto end;
+    }
+
+end:
+    /* cbuf.buf comes from qemu_memalign() and needs qemu_vfree() */
+    qemu_vfree(cbuf.buf);
+    g_free(cbuf.read_from_father);
+    return ret;
+}
+
+/*
+ * Build "<prefix>-<body>.<suffix>" in dest.
+ *
+ * Fails with -1 (after printing a message) when the result, including its
+ * terminating NUL, would not fit in MAX_FILE_NAME_LENGTH bytes; returns 0
+ * once dest has been filled.
+ */
+static int irow_generate_filename(char *dest, const char *prefix, const char *body, const char *suffix) {
+    /* the three parts plus the '-' and '.' separators */
+    size_t name_len = strlen(prefix) + strlen(body) + strlen(suffix) + 2;
+
+    if (name_len >= MAX_FILE_NAME_LENGTH) {
+        fprintf(stderr, "Invalid filename length, max is %d\n", MAX_FILE_NAME_LENGTH);
+        return -1;
+    }
+    snprintf(dest, name_len + 1, "%s-%s.%s", prefix, body, suffix);
+    return 0;
+}
+
+/*
+ * Create the meta file named cs->meta_file.
+ *
+ * Validates disk size, cluster size and file names, then writes an IRowMeta
+ * header followed by one IRowSnapshotHeader describing the initial
+ * "current state" snapshot (id "0").  As side effects cs->cluster_bits,
+ * cs->btmp_file and cs->irvd_file are filled in for the later creation steps.
+ *
+ * Returns 0 on success, -1 on failure.  The file descriptor is closed on
+ * every path once it has been opened.
+ */
+static int irow_create_meta(IRowCreateState *cs) {
+    IRowMeta meta;
+    IRowSnapshotHeader snap_header;
+    uint32_t cluster_size, copy_on_demand;
+    uint64_t disk_size;
+    qemu_timeval tv;
+    int fd = -1, cluster_bits, ret = 0;
+
+    if (cs->disk_size == 0) {
+        fprintf(stderr, "Invalid disk_size\n");
+        ret = -1;
+        goto end;
+    }
+    disk_size = cs->disk_size;
+
+    if (cs->cluster_size == 0) {
+        fprintf(stderr, "Invalid cluster_size\n");
+        ret = -1;
+        goto end;
+    }
+    cluster_size = cs->cluster_size;
+
+    cluster_bits = get_bits_from_size(cluster_size);
+    cs->cluster_bits = cluster_bits;
+    if ((cluster_bits < MIN_CLUSTER_BITS) || (cluster_bits > MAX_CLUSTER_BITS)) {
+        fprintf(stderr, "Cluster size must be a power of two between %d and %dk\n",
+                1 << MIN_CLUSTER_BITS,
+                1 << (MAX_CLUSTER_BITS - 10));
+        ret = -1;
+        goto end;
+    }
+    copy_on_demand = cs->copy_on_demand;
+    if (cs->meta_file[0] == '\0') {
+        fprintf(stderr, "Void meta file name\n");
+        ret = -1;
+        goto end;
+    }
+    /* reject a name that would be stored without its terminating NUL */
+    if (cs->backing_file != NULL && strlen(cs->backing_file) >= MAX_FILE_NAME_LENGTH) {
+        fprintf(stderr, "Invalid filename length, max is %d\n", MAX_FILE_NAME_LENGTH);
+        ret = -1;
+        goto end;
+    }
+    fd = open(cs->meta_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0) {
+        fprintf(stderr, "Can not open %s\n", cs->meta_file);
+        ret = -1;
+        goto end;
+    }
+    memset(&meta, 0, sizeof(meta));
+    meta.magic = cpu_to_be32(IROW_MAGIC);
+    meta.version = cpu_to_be32(IROW_VERSION);
+    meta.copy_on_demand = cpu_to_be32(copy_on_demand);
+    meta.cluster_size = cpu_to_be32(cluster_size);
+    meta.cluster_bits = cpu_to_be32(cluster_bits);
+    /* number of clusters, rounding a partial trailing cluster up */
+    meta.total_clusters = cpu_to_be64((disk_size + cluster_size -1) >> cluster_bits);
+    meta.sectors_per_cluster = cpu_to_be32(cluster_size >> BDRV_SECTOR_BITS);
+    meta.disk_size = cpu_to_be64(disk_size);
+    meta.nb_snapshots = cpu_to_be32(1);
+
+    if (irow_generate_filename(meta.current_btmp, cs->meta_file, cs->time_value, "btmp") < 0) {
+        ret = -1;
+        goto end;
+    }
+
+    if (irow_generate_filename(cs->irvd_file, cs->meta_file, cs->time_value, "irvd") < 0) {
+        ret = -1;
+        goto end;
+    }
+
+    if (cs->backing_file != NULL) {
+        strncpy(meta.backing_file, cs->backing_file, MAX_FILE_NAME_LENGTH);
+    }
+
+    strncpy(cs->btmp_file, meta.current_btmp, MAX_FILE_NAME_LENGTH);
+
+    memset(&snap_header, 0, sizeof(snap_header));
+
+    snap_header.snap_magic = cpu_to_be32(IROW_SNAPHEADER_MAGIC);
+    sprintf(snap_header.id_str, "0");
+    sprintf(snap_header.name, "current state");
+    strncpy(snap_header.btmp_file, cs->btmp_file, MAX_FILE_NAME_LENGTH);
+    strncpy(snap_header.irvd_file, cs->irvd_file, MAX_FILE_NAME_LENGTH);
+    qemu_gettimeofday(&tv);
+    /* NOTE(review): unlike the meta fields above, the date fields are stored
+     * without a byte-order conversion - confirm against the reader. */
+    snap_header.date_sec = tv.tv_sec;
+    snap_header.date_nsec = tv.tv_usec * 1000;
+    snap_header.nb_children = 0;
+    snap_header.is_deleted = 0;
+
+    /* a short write leaves a truncated header, so treat it as a failure too */
+    if (write(fd, &meta, sizeof(meta)) != (ssize_t)sizeof(meta)) {
+        ret = -1;
+        goto end;
+    }
+    if (write(fd, &snap_header, sizeof(snap_header)) != (ssize_t)sizeof(snap_header)) {
+        ret = -1;
+        goto end;
+    }
+
+end:
+    if (fd >= 0 && close(fd) != 0) {
+        ret = -1;
+    }
+    return ret;
+}
+
+/*
+ * Create the bitmap file cs->btmp_file and fill it with an all-zero bitmap
+ * holding one bit per cluster of the disk (rounded up to whole bytes).
+ *
+ * Returns 0 on success, -1 on failure.  The file descriptor is closed on
+ * every path once it has been opened.
+ */
+static int irow_create_btmp(IRowCreateState *cs) {
+    char *bitmap = NULL;
+    int fd = -1, bitmap_size, ret = 0;
+
+    if (cs->btmp_file[0] == '\0') {
+        fprintf(stderr, "Void btmp file name\n");
+        ret = -1;
+        goto end;
+    }
+    fd = open(cs->btmp_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0) {
+        fprintf(stderr, "Can not open %s\n", cs->btmp_file);
+        ret = -1;
+        goto end;
+    }
+
+    /* clusters = ceil(disk_size / cluster_size); bytes = ceil(clusters / 8) */
+    bitmap_size = (((cs->disk_size + cs->cluster_size - 1) >> cs->cluster_bits) + 7) >> 3;
+    bitmap = g_malloc0(bitmap_size);
+
+    /* a short write would leave a truncated bitmap behind */
+    if (write(fd, bitmap, bitmap_size) != bitmap_size) {
+        ret = -1;
+        goto end;
+    }
+
+end:
+    if (fd >= 0 && close(fd) != 0) {
+        ret = -1;
+    }
+    g_free(bitmap);
+    return ret;
+}
+
+/*
+ * Create the data file cs->irvd_file and size it to cs->disk_size bytes.
+ *
+ * fallocate() is attempted first and its result is ignored; ftruncate()
+ * then sets the file length.  Returns 0 on success, -1 on failure.
+ */
+static int irow_create_vd(IRowCreateState *cs) {
+    int status = 0;
+    int fd;
+
+    if (cs->irvd_file[0] == '\0') {
+        fprintf(stderr, "Void irvd file name\n");
+        return -1;
+    }
+
+    fd = open(cs->irvd_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0) {
+        fprintf(stderr, "Can not open %s\n", cs->irvd_file);
+        return -1;
+    }
+
+    if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, cs->disk_size) < 0) {
+        /* a failed preallocation is not treated as an error */
+    }
+
+    if (ftruncate(fd, cs->disk_size) != 0) {
+        fprintf(stderr, "Can not truncate %s to %" PRId64 " bytes\n", cs->irvd_file, cs->disk_size);
+        status = -1;
+    }
+
+    /* the descriptor is closed whether or not the truncate worked */
+    if (close(fd) != 0) {
+        status = -1;
+    }
+    return status;
+}
+
+/*
+ * Allocate a zeroed IRowCreateState together with its five file-name
+ * buffers (MAX_FILE_NAME_LENGTH bytes each) and store the current time, as
+ * two hexadecimal numbers (seconds, microseconds), in cs->time_value.
+ *
+ * The result is released with irow_create_state_delete().
+ */
+static IRowCreateState *irow_create_state_new(void) {
+    IRowCreateState *cs = g_malloc0(sizeof(IRowCreateState));
+    qemu_timeval tv;
+
+    cs->meta_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    cs->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    cs->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    cs->time_value = g_malloc0(MAX_FILE_NAME_LENGTH);
+    cs->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+
+    qemu_gettimeofday(&tv);
+    /* the casts make the arguments match %lx whatever the width of the
+     * timeval members is; the size bounds the write to the buffer */
+    snprintf(cs->time_value, MAX_FILE_NAME_LENGTH, "%lx%lx",
+             (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec);
+    return cs;
+}
+
+/*
+ * Free an IRowCreateState and the five name buffers it owns.
+ * cs itself must not be NULL; individual buffers may be.
+ */
+static void irow_create_state_delete(IRowCreateState *cs) {
+    /* g_free() accepts NULL, so no per-field guard is required */
+    g_free(cs->meta_file);
+    g_free(cs->btmp_file);
+    g_free(cs->irvd_file);
+    g_free(cs->time_value);
+    g_free(cs->father_btmp_file);
+    g_free(cs);
+}
+
+/*
+ * bdrv_create callback: create a new iROW image whose meta file is filename.
+ *
+ * Recognised options: size, cluster size (default 65536 when absent or 0),
+ * backing file and "copy_on_demand".  The meta file, the bitmap file and the
+ * data file are created in that order.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int irow_create(const char *filename, QEMUOptionParameter *options) {
+    IRowCreateState *cs = irow_create_state_new();
+    int ret = 0;
+
+    cs->cluster_size = 65536;
+    cs->copy_on_demand = 0;
+    cs->backing_file = NULL;
+
+    /* an over-long name would otherwise be stored without a terminating NUL
+     * and later be handed to strlen() by irow_generate_filename() */
+    if (strlen(filename) >= MAX_FILE_NAME_LENGTH) {
+        fprintf(stderr, "Invalid filename length, max is %d\n", MAX_FILE_NAME_LENGTH);
+        ret = -1;
+        goto end;
+    }
+    pstrcpy(cs->meta_file, MAX_FILE_NAME_LENGTH, filename);
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            cs->disk_size = options->value.n;
+        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+            if (options->value.n) {
+                cs->cluster_size = options->value.n;
+            }
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            cs->backing_file = options->value.s;
+        } else if (!strcmp(options->name, "copy_on_demand")) {
+            cs->copy_on_demand = options->value.n;
+        }
+        options++;
+    }
+
+    if (irow_create_meta(cs) < 0) {
+        fprintf(stderr, "Fail to create meta file of %s\n", filename);
+        ret = -1;
+        goto end;
+    }
+
+    if (irow_create_btmp(cs) < 0) {
+        fprintf(stderr, "Fail to create bitmap file of %s\n", filename);
+        ret = -1;
+        goto end;
+    }
+
+    if (irow_create_vd(cs) < 0) {
+        fprintf(stderr, "Fail to create virtual machine disk file of %s\n", filename);
+        ret = -1;
+        goto end;
+    }
+
+end:
+    irow_create_state_delete(cs);
+    return ret;
+}
+
+/*
+ * Flush callback: forwards the flush to the current irvd data file and
+ * returns bdrv_flush()'s result.
+ * NOTE(review): only irow_irvd is flushed here; whether the btmp file needs
+ * flushing as well cannot be told from this function - confirm.
+ */
+static int coroutine_fn irow_flush(BlockDriverState *bs) {
+    BDRVIrowState *state = bs->opaque;
+    BlockDriverState *data_file = state->irow_irvd;
+
+    return bdrv_flush(data_file);
+}
+
+/*
+ * Per-request state of an asynchronous iROW request.
+ * irow_aio_cancel() casts a BlockDriverAIOCB pointer to this type, so
+ * "common" has to stay the first member.
+ */
+typedef struct IRowAIOCB {
+ BlockDriverAIOCB common;
+ /* first sector of the request, as passed to irow_aio_setup() */
+ int64_t sector_num;
+ /* caller's I/O vector for the request */
+ QEMUIOVector *qiov;
+ /* request length in sectors */
+ int nb_sectors;
+ /* request submitted to the irvd file, or NULL when there is none */
+ BlockDriverAIOCB *irvd_aiocb;
+
+} IRowAIOCB;
+
+/*
+ * Cancel callback of irow_aio_pool: cancels the request submitted to the
+ * irvd file, if any, and then releases the iROW request itself.
+ * NOTE(review): if cancelling the inner request runs irow_aio_readv_cb(),
+ * which also releases the request, it would be released twice - confirm.
+ */
+static void irow_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    IRowAIOCB *request = (IRowAIOCB *)blockacb;
+    BlockDriverAIOCB *inner = request->irvd_aiocb;
+
+    if (inner != NULL) {
+        bdrv_aio_cancel(inner);
+    }
+    qemu_aio_release(request);
+}
+
+/*
+ * AIOCB description handed to qemu_aio_get() in irow_aio_setup(): requests
+ * are IRowAIOCB-sized and cancelled through irow_aio_cancel().
+ */
+static AIOCBInfo irow_aio_pool = {
+ .aiocb_size = sizeof(IRowAIOCB),
+ .cancel = irow_aio_cancel,
+};
+
+
+/*
+ * Obtain an IRowAIOCB from irow_aio_pool and record the request parameters
+ * in it; the inner irvd request pointer starts out as NULL.
+ *
+ * Returns the new request, or NULL if qemu_aio_get() returned NULL.
+ */
+static IRowAIOCB *irow_aio_setup(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    IRowAIOCB *request = qemu_aio_get(&irow_aio_pool, bs, cb, opaque);
+
+    if (request != NULL) {
+        request->irvd_aiocb = NULL;
+        request->sector_num = sector_num;
+        request->qiov = qiov;
+        request->nb_sectors = nb_sectors;
+    }
+    return request;
+}
+
+/*
+ * Completion callback of the irvd read started by irow_aio_readv().
+ *
+ * opaque is the IRowAIOCB of the request, ret the result of the irvd read.
+ * For an incomplete image (complete_image != 1) irow_assert_clusters() is
+ * called; the clusters it flags in cbuf.read_from_father are copied from
+ * cbuf.buf over the corresponding part of the caller's I/O vector.  Finally
+ * the caller's completion function is invoked with the (possibly updated)
+ * result and the request is released.
+ */
+static void irow_aio_readv_cb(void *opaque, int ret) {
+    IRowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVIrowState *birows = bs->opaque;
+    int64_t first_cluster, last_cluster, nb_clusters, sector_index, cluster_index, buf_offset;
+    ClusterBuffer cbuf;
+    uint8_t *buf = NULL;
+    int remain_sectors, cbuf_offset, len;
+
+    /* must be set before the first "goto end": the cleanup code frees both */
+    cbuf.buf = NULL;
+    cbuf.read_from_father = NULL;
+
+    if (ret < 0) {
+        fprintf(stderr, "aio_readv failed\n");
+        goto end;
+    }
+    first_cluster = acb->sector_num / birows->sectors_per_cluster;
+    last_cluster = (acb->sector_num + acb->nb_sectors - 1) / birows->sectors_per_cluster;
+
+    if (first_cluster >= birows->total_clusters) {
+        fprintf (stderr, "Invalid sector_num.\n");
+        ret = -1;
+        goto end;
+    }
+    if (last_cluster >= birows->total_clusters) {
+        fprintf (stderr, "Invalid nb_sectors.\n");
+        ret = -1;
+        goto end;
+    }
+
+    if (birows->complete_image != 1) {
+        nb_clusters = last_cluster - first_cluster + 1;
+        cbuf.buf = qemu_memalign(512, nb_clusters * birows->cluster_size);
+        memset(cbuf.buf, 0, nb_clusters * birows->cluster_size);
+        cbuf.read_from_father = g_malloc0(nb_clusters + 1);
+        if (irow_assert_clusters(bs, &cbuf, acb->sector_num, acb->nb_sectors, IROW_AIO_READ) < 0) {
+            fprintf (stderr, "irow_assert_clusters() failed.\n");
+            ret = -1;
+            goto end;
+        }
+        /* NOTE(review): result ignored as in the original code - confirm
+         * that a failed bitmap update must not fail the read. */
+        irow_update_btmp(birows);
+
+        /* work on a linear copy of the vector, then copy it back */
+        buf = g_malloc(acb->qiov->size);
+        qemu_iovec_to_buf(acb->qiov, 0, buf, acb->qiov->size);
+
+        sector_index = acb->sector_num;
+        remain_sectors = acb->nb_sectors;
+        buf_offset = 0;
+        while (remain_sectors > 0) {
+            cluster_index = sector_index / birows->sectors_per_cluster;
+            len = last_sector_in_cluster(birows, cluster_index) - sector_index + 1;
+            if (len > remain_sectors) {
+                len = remain_sectors;
+            }
+            if (cbuf.read_from_father[cluster_index - first_cluster] == 1) {
+                cbuf_offset = (sector_index & (birows->sectors_per_cluster - 1)) + (cluster_index - first_cluster) * birows->sectors_per_cluster;
+                memcpy(buf + buf_offset, cbuf.buf + cbuf_offset * BDRV_SECTOR_SIZE, len * BDRV_SECTOR_SIZE);
+            }
+            sector_index = first_sector_in_cluster(birows, cluster_index + 1);
+            remain_sectors -= len;
+            buf_offset += len * BDRV_SECTOR_SIZE;
+        }
+
+        qemu_iovec_from_buf(acb->qiov, 0, buf, acb->qiov->size);
+    }
+
+ end:
+    g_free(buf);
+    /* cbuf.buf comes from qemu_memalign() and needs qemu_vfree() */
+    qemu_vfree(cbuf.buf);
+    g_free(cbuf.read_from_father);
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * bdrv_aio_readv callback: start an asynchronous read of the current irvd
+ * file into qiov; irow_aio_readv_cb() post-processes the data and completes
+ * the request towards the caller.
+ *
+ * The read is submitted through the block layer's bdrv_aio_readv() rather
+ * than through the irvd driver's callback pointer, which is not guaranteed
+ * to be set for every driver.
+ *
+ * Returns the request handle, or NULL if the request could not be started.
+ */
+static BlockDriverAIOCB *irow_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque) {
+    IRowAIOCB *acb;
+    BDRVIrowState *birows = bs->opaque;
+
+    acb = irow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+    acb->irvd_aiocb = bdrv_aio_readv(birows->irow_irvd, sector_num, qiov, nb_sectors, irow_aio_readv_cb, acb);
+    if (acb->irvd_aiocb == NULL) {
+        qemu_aio_release(acb);
+        return NULL;
+    }
+    return &acb->common;
+}
+
+/*
+ * bdrv_aio_writev callback: start an asynchronous write of qiov to the
+ * current irvd file.
+ *
+ * For an incomplete image (complete_image != 1) irow_assert_clusters() is
+ * called first with a two-cluster scratch buffer.  Every touched cluster is
+ * then marked with irow_set_bit() and irow_update_btmp() is called before
+ * the write is submitted, so that a failure can still be reported by
+ * returning NULL: once the write is in flight the caller's completion
+ * function will run, and NULL must not be returned any more.
+ *
+ * Returns the handle of the irvd request, or NULL on failure.
+ */
+static BlockDriverAIOCB *irow_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque) {
+    BDRVIrowState *s = bs->opaque;
+    int64_t first_cluster, last_cluster, current_cluster;
+    ClusterBuffer cbuf;
+    BlockDriverAIOCB *ret = NULL;
+
+    /* must be set before the first "goto end": the cleanup code frees both */
+    cbuf.buf = NULL;
+    cbuf.read_from_father = NULL;
+
+    first_cluster = sector_num / s->sectors_per_cluster;
+    last_cluster = (sector_num + nb_sectors - 1) / s->sectors_per_cluster;
+
+    if (first_cluster >= s->total_clusters) {
+        fprintf (stderr, "Invalid sector_num!\n");
+        goto end;
+    }
+    if (last_cluster >= s->total_clusters) {
+        fprintf (stderr, "Invalid nb_sectors!\n");
+        goto end;
+    }
+    if (s->complete_image != 1) {
+        cbuf.buf = qemu_memalign(512, 2 * s->cluster_size);
+        /* zeroed like the scratch buffer in irow_write() */
+        memset(cbuf.buf, 0, 2 * s->cluster_size);
+        cbuf.read_from_father = g_malloc0(2);
+        if (irow_assert_clusters(bs, &cbuf, sector_num, nb_sectors, IROW_AIO_WRITE) < 0) {
+            fprintf (stderr, "irow_assert_clusters() failed.\n");
+            goto end;
+        }
+    }
+
+    for (current_cluster = first_cluster; current_cluster <= last_cluster; current_cluster++) {
+        irow_set_bit(s, current_cluster);
+    }
+
+    if (irow_update_btmp(s) < 0) {
+        fprintf (stderr, "Failed to update btmp file. (%s)\n", s->opened_btmp_file);
+        goto end;
+    }
+
+    /* submitted last: nothing after this point may turn ret into NULL */
+    ret = bdrv_aio_writev(s->irow_irvd, sector_num, qiov, nb_sectors, cb, opaque);
+
+end:
+    /* cbuf.buf comes from qemu_memalign() and needs qemu_vfree() */
+    qemu_vfree(cbuf.buf);
+    g_free(cbuf.read_from_father);
+    return ret;
+}
+
+/*
+ * bdrv_aio_flush callback: starts an asynchronous flush of the current irvd
+ * file with the caller's completion function and returns that request.
+ */
+static BlockDriverAIOCB *irow_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque) {
+    BDRVIrowState *state = bs->opaque;
+
+    return bdrv_aio_flush(state->irow_irvd, cb, opaque);
+}
+
+/*
+ * Write the smallest id >= 1 that no existing snapshot uses into id_str, as
+ * a decimal string of at most id_str_size bytes including the NUL.
+ * Existing ids are parsed with strtoul(); snapshots without an id string
+ * are skipped.
+ */
+static void irow_new_snapshot_id(BDRVIrowState *birows, char *id_str, int id_str_size) {
+    unsigned int id;
+    int i;
+
+    for (id = 1; id < 0xffffffff; id++) {
+        int in_use = 0;
+
+        for (i = 0; i < birows->nb_snapshots; i++) {
+            const char *used = birows->snapshots[i].id_str;
+
+            if (used != NULL && id == strtoul(used, NULL, 10)) {
+                in_use = 1;
+                break;
+            }
+        }
+        if (!in_use) {
+            break;
+        }
+    }
+    /* id is unsigned, so it is printed with %u */
+    snprintf(id_str, id_str_size, "%u", id);
+}
+
+/*
+ * Return the index of the first snapshot whose id string equals id_str, or
+ * -1 when there is none.  Snapshots without an id string never match.
+ */
+static int irow_find_snapshot_by_id(BDRVIrowState *birows, const char *id_str) {
+    int idx = 0;
+
+    while (idx < birows->nb_snapshots) {
+        const char *candidate = birows->snapshots[idx].id_str;
+
+        if (candidate != NULL && strcmp(candidate, id_str) == 0) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/*
+ * Return the index of the first snapshot whose name equals name, or -1 when
+ * there is none.  Snapshots without a name never match.
+ */
+static int irow_find_snapshot_by_name(BDRVIrowState *birows, const char *name) {
+    int idx = 0;
+
+    while (idx < birows->nb_snapshots) {
+        const char *candidate = birows->snapshots[idx].name;
+
+        if (candidate != NULL && strcmp(candidate, name) == 0) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/*
+ * Return the index of the first snapshot that is marked deleted
+ * (is_deleted == 1) and has no children, or -1 when there is none.
+ */
+static int irow_find_free_snapshot(BDRVIrowState *birows) {
+    int idx = 0;
+
+    while (idx < birows->nb_snapshots) {
+        const IRowSnapshot *entry = &birows->snapshots[idx];
+
+        if (entry->nb_children == 0 && entry->is_deleted == 1) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/*
+ * Add value to snap->nb_children.  When the snapshot then has no children
+ * and is marked deleted (is_deleted == 1), the same adjustment is applied
+ * to its father snapshot, looked up by snap->father_btmp_file, and so on up
+ * the chain.
+ *
+ * Returns 0 on success, -1 if a father snapshot could not be found anywhere
+ * along the chain.
+ */
+static int irow_update_nb_children(BDRVIrowState *birows, IRowSnapshot *snap, int value) {
+    IRowSnapshot *father_snap;
+    int snap_index, ret = 0;
+
+    snap->nb_children += value;
+    if (snap->nb_children == 0 && snap->is_deleted == 1) {
+        if (snap->father_btmp_file) {
+            snap_index = irow_find_snapshot_by_btmp(birows, snap->father_btmp_file);
+            if (snap_index < 0) {
+                fprintf(stderr, "Failed to find father snapshot\n");
+                ret = -1;
+                goto end;
+            }
+            father_snap = birows->snapshots + snap_index;
+            /* pass a failure further up the chain on to the caller */
+            ret = irow_update_nb_children(birows, father_snap, value);
+        }
+    }
+
+end:
+    return ret;
+}
+
+/*
+ * Update the in-memory snapshot table for a newly taken snapshot.
+ *
+ * The entry that currently describes the running state (found through
+ * birows->current_btmp_file) takes over id, name, dates and VM state size
+ * from sn_info and gains one child.  A new entry with id "0" and name
+ * "current state" is appended for the files named in cs.  The global
+ * birows_cache array is resized to the new snapshot count and zeroed, and
+ * the snapshot table is marked dirty.
+ *
+ * Returns 0 on success, -1 if the current entry could not be found.
+ */
+static int irow_snapshot_add(BDRVIrowState *birows, IRowCreateState *cs, QEMUSnapshotInfo *sn_info) {
+    /* sizes of the id and name buffers used for snapshot entries */
+    enum { SNAP_ID_SIZE = 128, SNAP_NAME_SIZE = 256 };
+    IRowSnapshot *new_snap, *snap;
+    qemu_timeval tv;
+    int snap_index;
+
+    /* grow the table first: both pointers below refer to the new array */
+    birows->snapshots = g_realloc(birows->snapshots, (birows->nb_snapshots + 1) * sizeof(IRowSnapshot));
+
+    snap_index = irow_find_snapshot_by_btmp(birows, birows->current_btmp_file);
+    if (snap_index < 0) {
+        return -1;
+    }
+    snap = birows->snapshots + snap_index;
+
+    new_snap = birows->snapshots + birows->nb_snapshots;
+    memset(new_snap, 0, sizeof(IRowSnapshot));
+
+    snap->date_sec = sn_info->date_sec;
+    snap->date_nsec = sn_info->date_nsec;
+    snap->vm_clock_nsec = sn_info->vm_clock_nsec;
+    snap->vm_state_size = sn_info->vm_state_size;
+    irow_update_nb_children(birows, snap, 1);
+
+    /* pstrcpy() always NUL-terminates, which strncpy() does not guarantee */
+    if (snap->id_str == NULL) {
+        snap->id_str = g_malloc0(SNAP_ID_SIZE);
+    } else {
+        memset(snap->id_str, 0, SNAP_ID_SIZE);
+    }
+    pstrcpy(snap->id_str, SNAP_ID_SIZE, sn_info->id_str);
+
+    if (snap->name == NULL) {
+        snap->name = g_malloc0(SNAP_NAME_SIZE);
+    } else {
+        memset(snap->name, 0, SNAP_NAME_SIZE);
+    }
+    pstrcpy(snap->name, SNAP_NAME_SIZE, sn_info->name);
+
+    new_snap->id_str = g_malloc0(SNAP_ID_SIZE);
+    pstrcpy(new_snap->id_str, SNAP_ID_SIZE, "0");
+    new_snap->name = g_malloc0(SNAP_NAME_SIZE);
+    pstrcpy(new_snap->name, SNAP_NAME_SIZE, "current state");
+    new_snap->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    pstrcpy(new_snap->btmp_file, MAX_FILE_NAME_LENGTH, cs->btmp_file);
+    new_snap->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    pstrcpy(new_snap->irvd_file, MAX_FILE_NAME_LENGTH, cs->irvd_file);
+    if (cs->father_btmp_file != NULL) {
+        new_snap->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        pstrcpy(new_snap->father_btmp_file, MAX_FILE_NAME_LENGTH, cs->father_btmp_file);
+    }
+    qemu_gettimeofday(&tv);
+    new_snap->date_sec = tv.tv_sec;
+    new_snap->date_nsec = tv.tv_usec * 1000;
+
+    birows->nb_snapshots += 1;
+    /* NOTE(review): the cache array is zeroed without releasing whatever its
+     * entries pointed to - confirm that nothing is leaked here. */
+    birows_cache = g_realloc(birows_cache, sizeof(BDRVIrowState *) * birows->nb_snapshots);
+    memset(birows_cache, 0, sizeof(BDRVIrowState *) * birows->nb_snapshots);
+    birows->snapshots_is_dirty = 1;
+
+    return 0;
+}
+
+/* Return a new zero-filled buffer of size bytes with src strncpy()'d into it. */
+static char *irow_snapshot_dup_field(const char *src, size_t size) {
+    char *copy = g_malloc0(size);
+
+    strncpy(copy, src, size);
+    return copy;
+}
+
+/*
+ * Deep-copy the snapshot entry src into dst.  Every string that is present
+ * in src is duplicated into a freshly allocated buffer; string members that
+ * are NULL in src leave the matching dst member untouched.  The scalar
+ * members are copied by value.
+ */
+static void irow_snapshot_copy(IRowSnapshot *dst, IRowSnapshot *src) {
+    if (src->id_str) {
+        dst->id_str = irow_snapshot_dup_field(src->id_str, 128);
+    }
+    if (src->name) {
+        dst->name = irow_snapshot_dup_field(src->name, 256);
+    }
+    if (src->btmp_file) {
+        dst->btmp_file = irow_snapshot_dup_field(src->btmp_file, MAX_FILE_NAME_LENGTH);
+    }
+    if (src->irvd_file) {
+        dst->irvd_file = irow_snapshot_dup_field(src->irvd_file, MAX_FILE_NAME_LENGTH);
+    }
+    if (src->father_btmp_file) {
+        dst->father_btmp_file = irow_snapshot_dup_field(src->father_btmp_file, MAX_FILE_NAME_LENGTH);
+    }
+
+    dst->date_sec = src->date_sec;
+    dst->date_nsec = src->date_nsec;
+    dst->vm_clock_nsec = src->vm_clock_nsec;
+    dst->vm_state_size = src->vm_state_size;
+    dst->nb_children = src->nb_children;
+    dst->is_deleted = src->is_deleted;
+}
+
+/*
+ * bdrv_snapshot_create callback.
+ *
+ * The files of the running state become the snapshot described by sn_info,
+ * and a fresh pair of btmp/irvd files becomes the new running state.  The
+ * new pair is either taken over from a deleted snapshot without children
+ * (whose table entry is dropped) or newly created.  A missing id in sn_info
+ * is generated; duplicate ids or names are rejected.
+ *
+ * Returns a negative value on failure and a non-negative value otherwise.
+ */
+static int irow_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) {
+    BDRVIrowState *s = bs->opaque;
+    IRowCreateState *cs = NULL;
+    IRowSnapshot *free_snap, *old_snap, *snap;
+    int snap_index, offset, ret = 0;
+
+    if (sn_info->id_str[0] == '\0') {
+        irow_new_snapshot_id(s, sn_info->id_str, sizeof(sn_info->id_str));
+    }
+
+    if (irow_find_snapshot_by_id(s, sn_info->id_str) >= 0) {
+        fprintf(stderr, "Duplicated snapshot id\n");
+        ret = -1;
+        goto end;
+    }
+
+    if (irow_find_snapshot_by_name(s, sn_info->name) >= 0) {
+        fprintf(stderr, "Duplicated snapshot name\n");
+        ret = -1;
+        goto end;
+    }
+
+    cs = irow_create_state_new();
+    cs->cluster_bits = s->cluster_bits;
+    cs->cluster_size = s->cluster_size;
+    cs->disk_size = s->disk_size;
+    strncpy(cs->meta_file, s->meta_file, MAX_FILE_NAME_LENGTH);
+    strncpy(cs->father_btmp_file, s->current_btmp_file, MAX_FILE_NAME_LENGTH); // its father file is the old current image
+
+    snap_index = irow_find_free_snapshot(s);
+    if (snap_index >= 0) {
+        /* reuse the files of a deleted, childless snapshot and rebuild the
+         * table without its entry */
+        free_snap = s->snapshots + snap_index;
+        strcpy(cs->btmp_file, free_snap->btmp_file);
+        strcpy(cs->irvd_file, free_snap->irvd_file);
+        old_snap = s->snapshots;
+        s->snapshots = g_malloc0((s->nb_snapshots - 1) * sizeof(IRowSnapshot));
+        offset = 0;
+        for (snap_index = 0; snap_index < s->nb_snapshots; snap_index++) {
+            snap = old_snap + snap_index;
+            if (snap != free_snap) {
+                irow_snapshot_copy(s->snapshots + offset, snap);
+                offset += 1;
+            }
+        }
+
+        irow_close_snapshots2(old_snap, s->nb_snapshots);
+        s->nb_snapshots -= 1;
+    } else {
+        /* irow_generate_filename() reports its own error */
+        if (irow_generate_filename(cs->btmp_file, cs->meta_file, cs->time_value, "btmp") < 0 ||
+            irow_generate_filename(cs->irvd_file, cs->meta_file, cs->time_value, "irvd") < 0) {
+            ret = -1;
+            goto end;
+        }
+
+        if (irow_create_btmp(cs) < 0) {
+            fprintf(stderr, "Failed to create new btmp file (%s)\n", cs->btmp_file);
+            ret = -1;
+            goto end;
+        }
+
+        if (irow_create_vd(cs) < 0) {
+            fprintf(stderr, "Failed to create new irvd file (%s)\n", cs->irvd_file);
+            ret = -1;
+            goto end;
+        }
+    }
+
+    if (irow_snapshot_add(s, cs, sn_info) < 0) {
+        fprintf(stderr, "Failed to add new snapshot in mem\n");
+        ret = -1;
+        goto end;
+    }
+
+    if (irow_update_meta(s, cs->btmp_file, 0) < 0) {
+        fprintf(stderr, "Failed to update meta file (%s)\n", s->meta_file);
+        ret = -1;
+        goto end;
+    }
+
+    s->vm_state_size = sn_info->vm_state_size;
+    irow_update_btmp(s);
+
+    irow_close_btmp(s);
+    irow_close_irvd(s);
+
+    /* switch over to the new current-state files */
+    strncpy(s->current_btmp_file, cs->btmp_file, MAX_FILE_NAME_LENGTH);
+    snap_index = irow_find_snapshot_by_btmp(s, s->current_btmp_file);
+    if (snap_index < 0 || irow_load_info_from_snapshot(s, snap_index) < 0) {
+        ret = -1;
+        goto end;
+    }
+    ret = irow_open_data(s, s->open_flags);
+    if (ret < 0) {
+        /* do not touch the bitmap of files that could not be opened */
+        goto end;
+    }
+    memset(s->bitmap, 0, s->bitmap_size);
+    s->bitmap_is_dirty = 1;
+    if (irow_update_btmp(s) < 0) {
+        fprintf(stderr, "Failed to update btmp file\n");
+        ret = -1;
+        goto end;
+    }
+
+end:
+    if (cs != NULL) {
+        irow_create_state_delete(cs);
+        cs = NULL;
+    }
+    return ret;
+}
+
+/*
+ * Offset of the VM state data inside a btmp file: it starts right behind
+ * the bitmap, i.e. at bitmap_size.
+ */
+static int64_t irow_vm_state_offset(BDRVIrowState *birows) {
+    int64_t offset = birows->bitmap_size;
+
+    return offset;
+}
+
+/*
+ * Read size bytes of VM state, starting pos bytes into the VM state area,
+ * from the btmp file of birows into buf.  Returns bdrv_pread()'s result.
+ */
+static int irow_load_vmstate2(BDRVIrowState *birows, uint8_t *buf, int64_t pos, int size) {
+    int64_t file_offset = irow_vm_state_offset(birows) + pos;
+
+    return bdrv_pread(birows->irow_btmp, file_offset, buf, size);
+}
+
+/*
+ * Write size bytes of VM state from buf into the btmp file of birows,
+ * pos bytes into the VM state area, after setting vmstate_is_saved.
+ * Returns bdrv_pwrite()'s result.
+ */
+static int irow_save_vmstate2(BDRVIrowState *birows, const uint8_t *buf, int64_t pos, int size) {
+    int64_t file_offset;
+
+    /* the flag is set before the write and stays set even if it fails */
+    birows->vmstate_is_saved = 1;
+    file_offset = irow_vm_state_offset(birows) + pos;
+    return bdrv_pwrite(birows->irow_btmp, file_offset, buf, size);
+}
+
+/*
+ * bdrv_snapshot_goto callback: make the snapshot identified by snapshot_id
+ * (matched by id first, then by name) the father of the current state.
+ *
+ * The child counters of the old and the new father are adjusted, the
+ * current entry takes over the target's dates and VM clock, the in-memory
+ * bitmap is cleared and written out, and the meta file is updated.
+ * Requests for the current state itself ("0" / "current state") are a no-op
+ * that returns 0.
+ * NOTE(review): the global cluster_cache used by irow_read() is not touched
+ * here - confirm it cannot serve stale data after the switch.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int irow_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) {
+    BDRVIrowState *s = bs->opaque;
+    IRowSnapshot *target, *current, *old_father;
+    int idx;
+
+    if (!strcmp(snapshot_id, "0") || !strcmp(snapshot_id, "current state")) {
+        fprintf(stderr, "No need to goto current state.\n");
+        return 0;
+    }
+
+    idx = irow_find_snapshot_by_id(s, snapshot_id);
+    if (idx < 0) {
+        idx = irow_find_snapshot_by_name(s, snapshot_id);
+    }
+    if (idx < 0) {
+        fprintf(stderr, "Failed to find snapshot %s\n", snapshot_id);
+        return -1;
+    }
+    target = &s->snapshots[idx];
+
+    if (target->is_deleted) {
+        fprintf(stderr, "Can not go to deleted snapshot %s\n", snapshot_id);
+        return -1;
+    }
+
+    idx = irow_find_snapshot_by_btmp(s, s->current_btmp_file);
+    if (idx < 0) {
+        fprintf(stderr, "Failed to find current state.\n");
+        return -1;
+    }
+    current = &s->snapshots[idx];
+
+    idx = irow_find_snapshot_by_btmp(s, s->father_btmp_file);
+    if (idx < 0) {
+        fprintf(stderr, "Failed to find father snapshot.\n");
+        return -1;
+    }
+    old_father = &s->snapshots[idx];
+
+    /* re-parent the current state onto the target snapshot */
+    strncpy(s->father_btmp_file, target->btmp_file, MAX_FILE_NAME_LENGTH);
+    strncpy(current->father_btmp_file, target->btmp_file, MAX_FILE_NAME_LENGTH);
+
+    irow_update_nb_children(s, old_father, -1);
+    irow_update_nb_children(s, target, 1);
+
+    current->date_sec = target->date_sec;
+    current->date_nsec = target->date_nsec;
+    current->vm_clock_nsec = target->vm_clock_nsec;
+    current->vm_state_size = 0;
+
+    /* drop every bit of the current state's bitmap and write it out */
+    memset(s->bitmap, 0, s->bitmap_size);
+    s->bitmap_is_dirty = 1;
+    if (irow_update_btmp(s) < 0) {
+        fprintf(stderr, "Failed to update btmp file\n");
+        return -1;
+    }
+
+    s->snapshots_is_dirty = 1;
+    if (irow_update_meta(s, NULL, 0) < 0) {
+        fprintf(stderr, "Failed to update meta file\n");
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * bdrv_snapshot_delete callback: mark the snapshot identified by
+ * snapshot_id (matched by id first, then by name) as deleted.
+ *
+ * The entry stays in the table with is_deleted set and "_del" appended to
+ * its name.  If it has no children, its father's child counter is
+ * decremented.  The meta file is then updated.
+ *
+ * Returns 0 on success and -1 on failure; this includes the attempt to
+ * delete the current state and a failed meta file update.
+ */
+static int irow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) {
+    BDRVIrowState *s = bs->opaque;
+    IRowSnapshot *target_snap, *father_snap;
+    int snap_index, ret = 0;
+
+    if (strcmp(snapshot_id, "0") == 0 || strcmp(snapshot_id, "current state") == 0) {
+        fprintf(stderr, "Can not delete current state.\n");
+        /* nothing was deleted, so do not report success */
+        ret = -1;
+        goto end;
+    }
+
+    snap_index = irow_find_snapshot_by_id(s, snapshot_id);
+    if (snap_index < 0) {
+        snap_index = irow_find_snapshot_by_name(s, snapshot_id);
+        if (snap_index < 0) {
+            fprintf(stderr, "Failed to find snapshot %s\n", snapshot_id);
+            ret = -1;
+            goto end;
+        }
+    }
+    target_snap = s->snapshots + snap_index;
+
+    if (target_snap->is_deleted) {
+        fprintf(stderr, "Can not delete deleted snapshot %s\n", snapshot_id);
+        ret = -1;
+        goto end;
+    }
+
+    target_snap->is_deleted = 1;
+    /* assumes the name buffer holds 256 bytes, as allocated elsewhere in
+     * this file - TODO confirm for entries loaded from the meta file */
+    strncat(target_snap->name, "_del", 255-strlen(target_snap->name));
+
+    if (target_snap->nb_children == 0) {
+        if (target_snap->father_btmp_file) {
+            snap_index = irow_find_snapshot_by_btmp(s, target_snap->father_btmp_file);
+            if (snap_index < 0) {
+                fprintf(stderr, "Failed to find father snapshot\n");
+                ret = -1;
+                goto end;
+            }
+            father_snap = s->snapshots + snap_index;
+            irow_update_nb_children(s, father_snap, -1);
+        }
+    }
+
+    s->snapshots_is_dirty = 1;
+    /* same handling as in irow_snapshot_goto() */
+    if (irow_update_meta(s, NULL, 0) < 0) {
+        fprintf(stderr, "Failed to update meta file\n");
+        ret = -1;
+    }
+end:
+    return ret;
+}
+
+/*
+ * bdrv_snapshot_list callback: store a newly allocated array describing all
+ * snapshots that are not marked deleted in *psn_tab and return the number
+ * of entries.  With an empty snapshot table *psn_tab is set to NULL and 0
+ * is returned.
+ *
+ * Any non-zero is_deleted value counts as "deleted" both when sizing and
+ * when filling the array, so the two passes can never disagree about the
+ * number of entries.
+ */
+static int irow_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) {
+    BDRVIrowState *s = bs->opaque;
+    QEMUSnapshotInfo *snap_tab, *snap_info;
+    IRowSnapshot *snap;
+    int i, nb_listed = 0, offset = 0;
+
+    if (s->nb_snapshots == 0) {
+        *psn_tab = NULL;
+        return s->nb_snapshots;
+    }
+
+    for (i = 0; i < s->nb_snapshots; i++) {
+        if (!s->snapshots[i].is_deleted) {
+            nb_listed += 1;
+        }
+    }
+
+    snap_tab = g_malloc0(nb_listed * sizeof(QEMUSnapshotInfo));
+    for (i = 0; i < s->nb_snapshots; i++) {
+        snap = s->snapshots + i;
+        if (snap->is_deleted) {
+            continue;
+        }
+        snap_info = snap_tab + offset;
+        if (snap->id_str != NULL) {
+            pstrcpy(snap_info->id_str, sizeof(snap_info->id_str), snap->id_str);
+        }
+        if (snap->name != NULL) {
+            pstrcpy(snap_info->name, sizeof(snap_info->name), snap->name);
+        }
+        snap_info->vm_state_size = snap->vm_state_size;
+        snap_info->date_sec = snap->date_sec;
+        snap_info->date_nsec = snap->date_nsec;
+        snap_info->vm_clock_nsec = snap->vm_clock_nsec;
+        offset += 1;
+    }
+
+    *psn_tab = snap_tab;
+    return nb_listed;
+}
+
+/*
+ * bdrv_get_info callback: reports the cluster size and the offset of the
+ * VM state area.  Always returns 0.
+ */
+static int irow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) {
+    BDRVIrowState *state = bs->opaque;
+
+    bdi->cluster_size = state->cluster_size;
+    bdi->vm_state_offset = irow_vm_state_offset(state);
+
+    return 0;
+}
+
+/*
+ * bdrv_save_vmstate callback: stores size bytes of VM state at position pos
+ * through irow_save_vmstate2() and returns its result.
+ */
+static int irow_save_vmstate(BlockDriverState *bs, const uint8_t *buf, int64_t pos, int size) {
+    BDRVIrowState *state = bs->opaque;
+
+    return irow_save_vmstate2(state, buf, pos, size);
+}
+
+/*
+ * bdrv_load_vmstate callback: the VM state is read from the father snapshot
+ * of the current state.  The father is looked up by father_btmp_file,
+ * opened with irow_open_previous_state(), read through
+ * irow_load_vmstate2() and closed again.
+ *
+ * Returns the result of the read, or -1 if the father snapshot could not be
+ * found or opened.
+ */
+static int irow_load_vmstate(BlockDriverState *bs, uint8_t *buf, int64_t pos, int size) {
+    BDRVIrowState *birows = bs->opaque;
+    BDRVIrowState *father_state;
+    int father_index, result;
+
+    father_index = irow_find_snapshot_by_btmp(birows, birows->father_btmp_file);
+    if (father_index < 0) {
+        return -1;
+    }
+
+    father_state = irow_open_previous_state(birows, father_index);
+    if (father_state == NULL) {
+        return -1;
+    }
+
+    result = irow_load_vmstate2(father_state, buf, pos, size);
+    irow_close_previous_state(father_state);
+    return result;
+}
+
+/*
+ * bdrv_check callback.  Instead of checking the image it prints the current
+ * copy_on_demand setting and asks on stdin whether to toggle it; on "y" the
+ * setting is flipped and irow_update_meta() is called.  res and fix are not
+ * used.
+ *
+ * Returns 0 after a "y" or "n" answer and 1 if stdin reaches end of file.
+ */
+static int irow_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix){
+    BDRVIrowState *birows = bs->opaque;
+    char user_input[100];
+
+    printf("current copy_on_demand state is ");
+    if (birows->copy_on_demand) {
+        printf("ON\n");
+    } else {
+        printf("OFF\n");
+    }
+    while (1) {
+        printf("do you want to change copy_on_demand state? (y/n)");
+        /* the field width keeps the answer inside user_input */
+        if (scanf("%99s", user_input) == EOF) {
+            return 1;
+        }
+        /* tolower() needs a value representable as unsigned char */
+        user_input[0] = tolower((unsigned char)user_input[0]);
+        if (user_input[0] == 'y') {
+            birows->copy_on_demand = birows->copy_on_demand ? 0 : 1;
+            irow_update_meta(birows, NULL, 1);
+            break;
+        }
+        if (user_input[0] == 'n') {
+            break;
+        }
+    }
+    return 0;
+}
+
+/* bdrv_getlength callback: returns the disk_size recorded in the driver state. */
+static int64_t irow_get_length(BlockDriverState *bs) {
+    BDRVIrowState *state = bs->opaque;
+
+    return state->disk_size;
+}
+
+/*
+ * Creation options accepted by the driver; irow_create() evaluates exactly
+ * these four names.  The list ends with an entry whose name is NULL.
+ */
+static QEMUOptionParameter irow_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ /* irow_create() falls back to 65536 when this is absent or 0 */
+ .name = BLOCK_OPT_CLUSTER_SIZE,
+ .type = OPT_SIZE,
+ .help = "irow cluster size"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ {
+ .name = "copy_on_demand",
+ .type = OPT_FLAG,
+ .help = "copy clusters to current irvd when needed"
+ },
+ { NULL }
+};
+
+/*
+ * Driver description registered by bdrv_irow_init(): format name, size of
+ * the per-image state and the callbacks implemented in this file.
+ */
+static BlockDriver bdrv_irow = {
+ .format_name = "irow",
+ .instance_size = sizeof(BDRVIrowState),
+ .bdrv_probe = irow_probe,
+ .bdrv_open = irow_open,
+ /* synchronous I/O */
+ .bdrv_read = irow_read,
+ .bdrv_write = irow_write,
+ .bdrv_close = irow_close,
+ .bdrv_create = irow_create,
+
+ .bdrv_co_flush_to_disk = irow_flush,
+
+ /* asynchronous I/O */
+ .bdrv_aio_readv = irow_aio_readv,
+ .bdrv_aio_writev = irow_aio_writev,
+ .bdrv_aio_flush = irow_aio_flush,
+
+ /* snapshot handling */
+ .bdrv_snapshot_create = irow_snapshot_create,
+ .bdrv_snapshot_goto = irow_snapshot_goto,
+ .bdrv_snapshot_delete = irow_snapshot_delete,
+ .bdrv_snapshot_list = irow_snapshot_list,
+
+ .bdrv_get_info = irow_get_info,
+ .bdrv_getlength = irow_get_length,
+
+ /* VM state is kept in the btmp file, behind the bitmap */
+ .bdrv_save_vmstate = irow_save_vmstate,
+ .bdrv_load_vmstate = irow_load_vmstate,
+
+ .create_options = irow_create_options,
+ /* NOTE(review): irow_check() is an interactive copy_on_demand toggle, not a consistency check */
+ .bdrv_check = irow_check,
+};
+
+/* Registers the irow driver description with the block layer. */
+static void bdrv_irow_init(void)
+{
+ bdrv_register(&bdrv_irow);
+}
+
+/* hooks bdrv_irow_init() into the block module initialisation */
+block_init(bdrv_irow_init);
new file mode 100644
@@ -0,0 +1,135 @@
+/* IROW(Improved ROW)Disk Format
+ * */
+/*
+ * iROW (improved Redirect-on-Write) is a disk format supporting high-efficiency VM disk snapshots.
+ * iROW uses bitmap to reduce the amount of metadata, so that both the VM disk snapshot key operations
+ * performance and the VM disk I/O performance would be enhanced at the same time.
+ *
+ *The iROW VM disk image consists of a meta file and several snapshots.
+ *
+ *A snapshot consists of 2 files: a bitmap file (btmp file) and a VM disk data file (irvd file).
+ *The current state of the iROW VM disk also occupies a snapshot.
+ *
+ *The meta file consists of the meta header and the snapshots information. The meta header is used to
+ *store basic information of VM disk image. The snapshots information sequentially stores every snapshot’s name,
+ *id and others related information.
+ *
+ *The btmp file consists of a bitmap and the VM state data. The bitmap is used to indicate whether the
+ *clusters exist in corresponding irvd file. Each cluster in the VM disk image is mapped to a bit in the bitmap.
+ *
+ *The irvd file is used to store the actual data of the VM disk image. The smallest unit of storage is cluster.
+ *iROW does not decide the address of the data clusters. It just writes the clusters to the same VM disk image
+ *addresses as the virtual addresses of the clusters. Because the host machine’s file system supports sparse files,
+ *iROW also achieves the gradual growth of the VM disk image size with the actual disk usage.
+ *
+ */
+/* 32-bit magic built from the characters 'I','R','O','W', with 'I' in the most significant byte. */
+#define IROW_MAGIC (('I' << 24) | ('R' << 16) | ('O' << 8) | 'W')
+/* Format version number; presumably matched against IRowMeta.version -- confirm in irow.c. */
+#define IROW_VERSION 1
+
+/* 32-bit magic built from 'S','N','A','P'; presumably the value of IRowSnapshotHeader.snap_magic -- confirm. */
+#define IROW_SNAPHEADER_MAGIC (('S' << 24) | ('N' << 16) | ('A' << 8) | 'P')
+
+/*
+ * Lower and upper bounds for cluster_bits. If cluster_size == 1 << cluster_bits
+ * (TODO confirm in irow.c) this allows clusters from 512 bytes to 2 MiB.
+ */
+#define MIN_CLUSTER_BITS 9
+#define MAX_CLUSTER_BITS 21
+/* Size of the fixed file-name character arrays in IRowMeta and IRowSnapshotHeader. */
+#define MAX_FILE_NAME_LENGTH 256
+
+/* Operation codes; names suggest sync/AIO read and write paths -- usage is in irow.c, not visible here. */
+#define IROW_READ 1
+#define IROW_WRITE 2
+#define IROW_AIO_READ 3
+#define IROW_AIO_WRITE 4
+
+
+/*
+ * Meta header record. Declared packed, so the compiler inserts no padding
+ * between fields. Per the file header comment, the meta file holds this
+ * header followed by the snapshots information; IROW_SNAPSHOT_OFFSET is
+ * defined as sizeof(IRowMeta).
+ * NOTE(review): byte order of the multi-byte fields is not visible here -- confirm in irow.c.
+ */
+typedef struct __attribute__((packed)) IRowMeta {
+ uint32_t magic; /* presumably IROW_MAGIC -- confirm */
+ uint32_t version; /* presumably IROW_VERSION -- confirm */
+ uint32_t copy_on_demand; /* flag; same name as the "copy_on_demand" create option */
+ uint32_t nb_snapshots; /* number of snapshots (per the name) */
+ uint32_t cluster_size;
+ uint32_t cluster_bits; /* presumably log2(cluster_size), see MIN/MAX_CLUSTER_BITS -- confirm */
+ uint32_t sectors_per_cluster;
+ uint64_t total_clusters;
+ uint64_t disk_size;
+ char current_btmp[MAX_FILE_NAME_LENGTH]; /* name of the current btmp file (per the field name) */
+ char backing_file[MAX_FILE_NAME_LENGTH]; /* backing (base) image file name */
+} IRowMeta;
+
+/*
+ * Per-snapshot record with fixed-size character arrays. Declared packed, so
+ * there is no padding between fields. Per the file header comment, the meta
+ * file stores each snapshot's name, id and related information sequentially;
+ * presumably one of these records per snapshot -- confirm in irow.c.
+ */
+typedef struct __attribute__((packed)) IRowSnapshotHeader {
+ uint32_t snap_magic; /* presumably IROW_SNAPHEADER_MAGIC -- confirm */
+ char id_str[128]; /* snapshot id string */
+ char name[256]; /* snapshot name */
+ char btmp_file[MAX_FILE_NAME_LENGTH]; /* bitmap file of this snapshot */
+ char irvd_file[MAX_FILE_NAME_LENGTH]; /* data file of this snapshot */
+ char father_btmp_file[MAX_FILE_NAME_LENGTH]; /* presumably the parent snapshot's bitmap file -- confirm */
+ uint32_t vm_state_size;
+ uint32_t date_sec;
+ uint32_t date_nsec;
+ uint64_t vm_clock_nsec;
+ uint32_t nb_children; /* presumably the count of snapshots derived from this one -- confirm */
+ uint32_t is_deleted; /* deletion flag (per the name) */
+} IRowSnapshotHeader;
+
+/*
+ * Snapshot description with the same field names as IRowSnapshotHeader, but
+ * holding the strings through char pointers instead of fixed-size arrays and
+ * without the magic field.
+ * NOTE(review): who allocates and frees the strings is decided in irow.c -- confirm.
+ */
+typedef struct IRowSnapshot {
+ char *id_str;
+ char *name;
+ char *btmp_file;
+ char *irvd_file;
+ char *father_btmp_file;
+ uint32_t vm_state_size;
+ uint32_t date_sec;
+ uint32_t date_nsec;
+ uint64_t vm_clock_nsec;
+ uint32_t nb_children;
+ uint32_t is_deleted;
+} IRowSnapshot;
+
+/*
+ * Parameters and file names gathered for image creation (per the struct
+ * name; presumably filled in by irow_create -- confirm in irow.c).
+ */
+typedef struct IRowCreateState {
+ uint64_t disk_size;
+ uint32_t cluster_size;
+ uint32_t cluster_bits;
+ uint32_t copy_on_demand;
+ char *meta_file;
+ char *father_btmp_file;
+ char *btmp_file;
+ char *irvd_file;
+ char *time_value; /* NOTE(review): meaning not visible in this header -- confirm in irow.c */
+ char *backing_file;
+} IRowCreateState;
+
+/* A byte buffer paired with a cluster number; presumably caches the data of that one cluster -- confirm in irow.c. */
+typedef struct ClusterCache {
+ uint8_t *cache; /* buffer holding the cached bytes */
+ int64_t cluster_num; /* cluster number associated with the buffer */
+} ClusterCache;
+
+/*
+ * Per-image driver state; its size is used as .instance_size in the irow
+ * driver table and irow.c reaches it through bs->opaque.
+ */
+typedef struct BDRVIrowState {
+ /* Block driver states for the three kinds of file named in the file header comment (meta, btmp, irvd). */
+ BlockDriverState *irow_meta;
+ BlockDriverState *irow_btmp;
+ BlockDriverState *irow_irvd;
+ uint64_t disk_size; /* value returned by irow_get_length() */
+ uint64_t bitmap_size;
+ uint32_t cluster_size;
+ uint32_t cluster_bits;
+ uint64_t total_clusters;
+ uint32_t sectors_per_cluster;
+ uint32_t nb_snapshots;
+ uint32_t vm_state_size;
+ uint32_t copy_on_demand; /* flag; irow.c toggles it and then calls irow_update_meta() */
+ int open_flags;
+ IRowSnapshot *snapshots; /* presumably an array of nb_snapshots entries -- confirm */
+ uint32_t snapshots_is_dirty; /* presumably set when snapshots needs writing back -- confirm */
+ uint8_t *bitmap; /* presumably the in-memory cluster bitmap, bitmap_size bytes -- confirm */
+ uint32_t bitmap_is_dirty; /* presumably set when bitmap needs writing back -- confirm */
+ uint32_t vmstate_is_saved;
+ uint32_t complete_image;
+ /* File names used by this image. */
+ char *meta_file;
+ char *current_btmp_file;
+ char *father_btmp_file;
+ char *opened_btmp_file;
+ char *irvd_file;
+} BDRVIrowState;
+
+/*
+ * Pair of byte pointers used when handling cluster data.
+ * NOTE(review): read_from_father looks like a per-cluster marker for data
+ * taken from the father image -- confirm against its users in irow.c.
+ */
+typedef struct ClusterBuffer {
+ uint8_t *buf;
+ uint8_t *read_from_father;
+} ClusterBuffer;
+
+/* Size of the meta header; presumably the offset in the meta file where snapshot records begin -- confirm in irow.c. */
+#define IROW_SNAPSHOT_OFFSET (sizeof(IRowMeta))
+/*
+ * Merge buffer limit of 16 MiB. The replacement list is parenthesized so the
+ * macro expands as a single value inside larger expressions such as
+ * x / MAX_MERGE_BUFFER or x % MAX_MERGE_BUFFER.
+ */
+#define MAX_MERGE_BUFFER (16 * 1024 * 1024)
From: Jingsheng Zheng <zhengjs.act@gmail.com> iROW (improved Redirect-on-Write) is a disk format supporting high-efficiency VM disk snapshots. iROW uses a bitmap to reduce the amount of metadata, so that both the performance of key VM disk snapshot operations and the VM disk I/O performance are improved at the same time. Signed-off-by: Jingsheng Zheng <zhengjs.act@gmail.com> --- block/Makefile.objs | 1 + block/irow.c | 2257 +++++++++++++++++++++++++++++++++++++++++++++++++++ block/irow.h | 135 +++ 3 files changed, 2393 insertions(+), 0 deletions(-) create mode 100644 block/irow.c create mode 100644 block/irow.h