From patchwork Sat Jan 26 08:15:37 2013 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: Add a disk format named iROW, supporting high-efficiency VM snapshot From: zhengjs.act@gmail.com X-Patchwork-Id: 215912 Message-Id: <1359188137-29505-1-git-send-email-zhengjs.act@gmail.com> To: qemu-devel@nongnu.org Cc: stefanha@redhat.com, Jingsheng Zheng Date: Sat, 26 Jan 2013 16:15:37 +0800 From: Jingsheng Zheng iRow (improved Redirect-on-Write) is a disk format supporting high-efficiency VM disk snapshots. iROW uses a bitmap to reduce the amount of metadata, so that both the VM disk snapshot key operations performance and the VM disk I/O performance would be enhanced at the same time. Signed-off-by : JingshengZheng --- block/Makefile.objs | 1 + block/irow.c | 2257 +++++++++++++++++++++++++++++++++++++++++++++++++++ block/irow.h | 135 +++ 3 files changed, 2393 insertions(+), 0 deletions(-) create mode 100644 block/irow.c create mode 100644 block/irow.h diff --git a/block/Makefile.objs b/block/Makefile.objs index c067f38..e045440 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,5 +1,6 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o +block-obj-y += irow.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o block-obj-y += parallels.o blkdebug.o blkverify.o diff --git a/block/irow.c b/block/irow.c new file mode 100644 index 0000000..99b8579 --- /dev/null +++ b/block/irow.c @@ -0,0 +1,2257 @@ +/* IROW(Improved ROW)Disk Format + * */ + +/* + * iRow (improved Redirect-on-Write) is a disk format supporting high-efficiency VM disk snapshots. + * iROW uses a bitmap to reduce the amount of metadata, so that both the VM disk snapshot key operations + * performance and the VM disk I/O performance would be enhanced at the same time. 
+ * + * The iROW VM disk image consists of a meta file and several snapshots. + * + * A snapshot consists of 2 files: a bitmap file (btmp file) and a VM disk data file (irvd file). + * The current state of the iROW VM disk also occupies a snapshot. + * + * The meta file consists of the meta header and the snapshot information. The meta header is used to + * store basic information about the VM disk image. The snapshot information sequentially stores every snapshot's name, + * id and other related information. + * + * The btmp file consists of a bitmap and the VM state data. The bitmap is used to indicate whether the + * clusters exist in the corresponding irvd file. Each cluster in the VM disk image is mapped to a bit in the bitmap. + * + * The irvd file is used to store the actual data of the VM disk image. The smallest unit of storage is the cluster. + * iROW does not decide the address of the data clusters. It just writes the clusters to the same VM disk image + * addresses as the virtual addresses of the clusters. Because the host machine's file system supports sparse files, + * the iROW VM disk image also grows gradually with the actual disk usage. 
+ * + */ + +#include "qemu-common.h" +#include "include/block/block_int.h" +#include "include/qemu/module.h" +#include "block/irow.h" + +#include + +BDRVIrowState **birows_cache = NULL; +ClusterCache *cluster_cache = NULL; + +static int get_bits_from_size(size_t size) +{ + int ret = 0; + if (size == 0) { + return -1; + } + while (size != 1) { + if (size & 1) { + return -1; + } + size >>= 1; + ret++; + } + return ret; +} + +static int irow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const IRowMeta *irow_meta = (const void *)buf; + + if (buf_size >= sizeof(IRowMeta) && + be32_to_cpu(irow_meta->magic) == IROW_MAGIC && + be32_to_cpu(irow_meta->version) == IROW_VERSION){ + return 100; + } + else { + return 0; + } +} + +static void irow_close_btmp(BDRVIrowState *s) { + if(s->bitmap) { + g_free(s->bitmap); + s->bitmap = NULL; + } + + if(s->irow_btmp) { + bdrv_delete(s->irow_btmp); + s->irow_btmp = NULL; + } +} + +static void irow_close_irvd(BDRVIrowState *s) { + if(s->irow_irvd) { + bdrv_delete(s->irow_irvd); + s->irow_irvd = NULL; + } +} + +static void irow_close_snapshots2(IRowSnapshot *snapshots, int nb_snapshots) { + int i; + IRowSnapshot *snap_ptr; + + if(snapshots == NULL) + return; + + for(i = 0; i < nb_snapshots; i++) { + snap_ptr = snapshots + i; + if(snap_ptr->btmp_file) { + g_free(snap_ptr->btmp_file); + snap_ptr->btmp_file = NULL; + } + + if(snap_ptr->irvd_file) { + g_free(snap_ptr->irvd_file); + snap_ptr->irvd_file = NULL; + } + + if(snap_ptr->father_btmp_file) { + g_free(snap_ptr->father_btmp_file); + snap_ptr->father_btmp_file = NULL; + } + + if(snap_ptr->id_str) { + g_free(snap_ptr->id_str); + snap_ptr->id_str = NULL; + } + + if(snap_ptr->name) { + g_free(snap_ptr->name); + snap_ptr->name = NULL; + } + } + g_free(snapshots); +} + +static void irow_close_snapshots(BDRVIrowState *birows) { + irow_close_snapshots2(birows->snapshots, birows->nb_snapshots); + birows->snapshots = NULL; +} + +static void irow_close_meta(BDRVIrowState *s) { 
+ if(s->meta_file) { + g_free(s->meta_file); + s->meta_file = NULL; + } + + if(s->current_btmp_file) { + g_free(s->current_btmp_file); + s->current_btmp_file = NULL; + } + + if(s->father_btmp_file) { + g_free(s->father_btmp_file); + s->father_btmp_file = NULL; + } + + if(s->irvd_file) { + g_free(s->irvd_file); + s->irvd_file = NULL; + } + + if(s->opened_btmp_file) { + g_free(s->opened_btmp_file); + s->opened_btmp_file = NULL; + } + + if(s->irow_meta) { + bdrv_delete(s->irow_meta); + s->irow_meta = NULL; + } + if(s->snapshots) { + irow_close_snapshots(s); + } +} + +static void irow_close_state(BDRVIrowState *s) { + + irow_close_meta(s); + irow_close_btmp(s); + irow_close_irvd(s); + +} + +static int irow_check_bitmap(BDRVIrowState *birows) { + uint64_t i; + for(i = 0; i < birows->bitmap_size; i++) { + if(birows->bitmap[i] != 0xff) + return 0; + } + return 1; +} + +static int irow_update_btmp(BDRVIrowState *birows) { + + int ret = 0; + if(birows->bitmap_is_dirty) { + if(bdrv_pwrite(birows->irow_btmp, 0, birows->bitmap, birows->bitmap_size) != birows->bitmap_size) { + fprintf(stderr, "Failed to write the IROW bitmap data to %s\n", birows->opened_btmp_file); + ret = -1; + goto end; + } + birows->bitmap_is_dirty = 0; + ret = bdrv_truncate(birows->irow_btmp, birows->bitmap_size + birows->vm_state_size); + if(irow_check_bitmap(birows)) { + birows->complete_image = 1; + } + } + if(birows->vmstate_is_saved) { + birows->vmstate_is_saved = 0; + ret = bdrv_truncate(birows->irow_btmp, birows->bitmap_size + birows->vm_state_size); + } + +end: + return ret; +} + +static int irow_update_meta(BDRVIrowState *birows, const char *current_btmp, int change_copy_on_demand_state) { + int i, ret = 0; + uint32_t copy_on_demand; + IRowMeta meta; + IRowSnapshotHeader snap_header; + IRowSnapshot *snap_ptr; + + if(change_copy_on_demand_state == 0 && birows->snapshots_is_dirty == 0 && current_btmp == NULL) + goto end; + + if(bdrv_pread (birows->irow_meta, 0, &meta, sizeof(meta)) != sizeof(meta)) 
{ + fprintf (stderr, "Failed to read the meta data from %s\n", birows->meta_file); + ret = -1; + goto end; + } + if(change_copy_on_demand_state) { + copy_on_demand = meta.copy_on_demand; + be32_to_cpus(&copy_on_demand); + copy_on_demand = copy_on_demand ? 0 : 1; + meta.copy_on_demand = cpu_to_be32(copy_on_demand); + } + if(current_btmp != NULL) { + memset(meta.current_btmp, 0, MAX_FILE_NAME_LENGTH); + strncpy(meta.current_btmp, current_btmp, MAX_FILE_NAME_LENGTH); + } + + if(birows->snapshots_is_dirty) { + meta.nb_snapshots = cpu_to_be32(birows->nb_snapshots); + for(i = 0; i < birows->nb_snapshots; i++) { + memset(&snap_header, 0, sizeof(snap_header)); + snap_ptr = birows->snapshots + i; + snap_header.snap_magic = cpu_to_be32(IROW_SNAPHEADER_MAGIC); + snap_header.date_sec = snap_ptr->date_sec; + snap_header.date_nsec = snap_ptr->date_nsec; + snap_header.vm_clock_nsec = snap_ptr->vm_clock_nsec; + snap_header.vm_state_size = snap_ptr->vm_state_size; + snap_header.nb_children = snap_ptr->nb_children; + snap_header.is_deleted = snap_ptr->is_deleted; + if(snap_ptr->id_str != NULL) + strncpy(snap_header.id_str, snap_ptr->id_str, 128); + if(snap_ptr->name != NULL) + strncpy(snap_header.name, snap_ptr->name, 256); + if(snap_ptr->btmp_file == NULL) { + fprintf(stderr, "Void btmp filename\n"); + ret = -1; + goto end; + } + strncpy(snap_header.btmp_file, snap_ptr->btmp_file, MAX_FILE_NAME_LENGTH); + if(snap_ptr->irvd_file == NULL) { + fprintf(stderr, "Void irvd filename\n"); + ret = -1; + goto end; + } + strncpy(snap_header.irvd_file, snap_ptr->irvd_file, MAX_FILE_NAME_LENGTH); + if(snap_ptr->father_btmp_file != NULL) + strncpy(snap_header.father_btmp_file, snap_ptr->father_btmp_file, MAX_FILE_NAME_LENGTH); + + if(bdrv_pwrite(birows->irow_meta, sizeof(meta) + i * sizeof(IRowSnapshotHeader), &snap_header, sizeof(snap_header)) != sizeof(snap_header)) { + fprintf (stderr, "Failed to write the snapshot #%d info to %s\n", i, birows->meta_file); + ret = -1; + goto end; + } + } + 
birows->snapshots_is_dirty = 0; + } + + if(bdrv_pwrite(birows->irow_meta, 0, &meta, sizeof(meta)) != sizeof(meta)) { + fprintf (stderr, "Failed to write the meta data to %s\n", birows->meta_file); + ret = -1; + goto end; + } + + ret = bdrv_truncate(birows->irow_meta, sizeof(meta) + (birows->nb_snapshots) * sizeof(IRowSnapshotHeader)); + +end: + return ret; +} + +static void irow_close_previous_state(BDRVIrowState *birows) { + birows->irow_meta = NULL; + irow_close_state(birows); + g_free(birows); +} + +static void irow_free_birows_cache(BDRVIrowState *birows) { + int i; + if(birows_cache != NULL) { + for(i = 0; i < birows->nb_snapshots; i++) { + if(birows_cache[i] != NULL) { + irow_close_previous_state(birows_cache[i]); + } + } + g_free(birows_cache); + birows_cache = NULL; + } +} + +static void irow_close(BlockDriverState *bs) { + + BDRVIrowState *s = bs->opaque; + + irow_free_birows_cache(s); + irow_close_state(s); + +} + +static int irow_open_snapshots(BDRVIrowState *birows) { + int i, ret = 0; + IRowSnapshotHeader snap_header; + IRowSnapshot *snap_ptr; + int64_t offset; + + birows->snapshots = g_malloc0(sizeof(IRowSnapshot) * birows->nb_snapshots); + offset = IROW_SNAPSHOT_OFFSET; + for(i = 0; i < birows->nb_snapshots; i++) { + if(bdrv_pread(birows->irow_meta, offset, &snap_header, sizeof(snap_header)) != sizeof(snap_header)) { + fprintf(stderr, "Failed to read snapshot #%d info from %s\n", i, birows->meta_file); + ret = -1; + goto fail; + } + snap_ptr = birows->snapshots + i; + snap_ptr->date_sec = snap_header.date_sec; + snap_ptr->date_nsec = snap_header.date_nsec; + snap_ptr->vm_clock_nsec = snap_header.vm_clock_nsec; + snap_ptr->vm_state_size = snap_header.vm_state_size; + snap_ptr->nb_children = snap_header.nb_children; + snap_ptr->is_deleted = snap_header.is_deleted; + + if(snap_header.id_str[0] != '\0') { + snap_ptr->id_str = g_malloc0(128); + strncpy(snap_ptr->id_str, snap_header.id_str, 128); + } + if(snap_header.name[0] != '\0') { + snap_ptr->name = 
g_malloc0(256); + strncpy(snap_ptr->name, snap_header.name, 256); + } + if(snap_header.btmp_file == '\0') { + fprintf(stderr, "Invalid btmp file name. (snapshot #%d)\n", i); + ret = -1; + goto fail; + } + snap_ptr->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(snap_ptr->btmp_file, snap_header.btmp_file, MAX_FILE_NAME_LENGTH); + if(snap_header.irvd_file == '\0') { + fprintf(stderr, "Invalid irvd file name. (snapshot #%d)\n", i); + ret = -1; + goto fail; + } + snap_ptr->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(snap_ptr->irvd_file, snap_header.irvd_file, MAX_FILE_NAME_LENGTH); + if(snap_header.father_btmp_file[0] != '\0') { + snap_ptr->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(snap_ptr->father_btmp_file, snap_header.father_btmp_file, MAX_FILE_NAME_LENGTH); + } + offset += sizeof(snap_header); + } + birows->snapshots_is_dirty = 0; + + return ret; +fail: + irow_close_snapshots(birows); + return ret; + +} + +static int irow_open_meta(BlockDriverState *bs, BDRVIrowState *birows, const char *filename, int flags) { + int ret = 0; + IRowMeta meta; + + birows->irow_meta = bdrv_new (""); + ret = bdrv_file_open(&birows->irow_meta, filename, flags); + if (ret < 0) { + fprintf (stderr, "Failed to open %s\n", filename); + goto end; + } + if (bdrv_pread (birows->irow_meta, 0, &meta, sizeof(meta)) != sizeof(meta)) { + fprintf (stderr, "Failed to read the IROW meta data from %s\n", filename); + ret = -1; + goto end; + } + be32_to_cpus(&meta.magic); + be32_to_cpus(&meta.version); + be32_to_cpus(&meta.copy_on_demand); + be32_to_cpus(&meta.cluster_size); + be32_to_cpus(&meta.cluster_bits); + be64_to_cpus(&meta.total_clusters); + be32_to_cpus(&meta.sectors_per_cluster); + be64_to_cpus(&meta.disk_size); + be32_to_cpus(&meta.nb_snapshots); + + if(meta.magic != IROW_MAGIC || meta.version != IROW_VERSION) { + fprintf (stderr, "Invalid magic number or version number!\n"); + ret = -1; + goto end; + } + if((meta.cluster_bits < MIN_CLUSTER_BITS) || 
(meta.cluster_bits > MAX_CLUSTER_BITS)) { + fprintf (stderr, "Invalid cluster_bits!\n"); + ret = -1; + goto end; + } + if(meta.cluster_bits != get_bits_from_size(meta.cluster_size)) { + fprintf (stderr, "cluster_size and cluster_bits do not match!\n"); + ret = -1; + goto end; + } + if(meta.total_clusters != ((meta.disk_size + meta.cluster_size - 1) >> meta.cluster_bits)) { + fprintf (stderr, "total_clusters and disk_size do not match!\n"); + ret = -1; + goto end; + } + if(meta.sectors_per_cluster != (meta.cluster_size >> BDRV_SECTOR_BITS)) { + fprintf (stderr, "Invalid sectors_per_cluster!\n"); + ret = -1; + goto end; + } + birows->copy_on_demand = meta.copy_on_demand; + birows->cluster_size = meta.cluster_size; + birows->cluster_bits = meta.cluster_bits; + birows->total_clusters = meta.total_clusters; + birows->sectors_per_cluster = meta.sectors_per_cluster; + birows->disk_size = meta.disk_size; + bs->total_sectors = meta.disk_size / BDRV_SECTOR_SIZE; + birows->bitmap_size = (birows->total_clusters + 7) >> 3; + birows->nb_snapshots = meta.nb_snapshots; + birows->meta_file = g_malloc(MAX_FILE_NAME_LENGTH); + strncpy(birows->meta_file, filename, MAX_FILE_NAME_LENGTH); + birows->current_btmp_file = g_malloc(MAX_FILE_NAME_LENGTH); + strncpy(birows->current_btmp_file, meta.current_btmp, MAX_FILE_NAME_LENGTH); + strncpy(bs->backing_file, meta.backing_file, sizeof(bs->backing_file)); + + if(cluster_cache == NULL) { + cluster_cache = g_malloc0(sizeof(ClusterCache)); + if(cluster_cache != NULL) { + cluster_cache->cache = qemu_memalign(512, birows->cluster_size); + if(cluster_cache->cache != NULL) + memset(cluster_cache->cache, 0, birows->cluster_size); + else { + fprintf(stderr, "Failed to create father cache\n"); + ret = -1; + goto end; + } + cluster_cache->cluster_num = -1; + } else { + fprintf(stderr, "Failed to create father cache\n"); + ret = -1; + goto end; + } + } + + if(irow_open_snapshots(birows) < 0) { + fprintf(stderr, "Failed to read snapshots info from %s\n", 
birows->meta_file); + ret = -1; + goto end; + } + +end: + return ret; +} + +static int irow_open_btmp(BDRVIrowState *birows, const char *filename, int flags) { + int ret; + + birows->irow_btmp = bdrv_new (""); + ret = bdrv_file_open(&birows->irow_btmp, filename, flags); + if (ret < 0) { + return ret; + } + birows->bitmap = qemu_memalign(512, birows->bitmap_size); + if(bdrv_pread(birows->irow_btmp, 0, birows->bitmap, birows->bitmap_size) != birows->bitmap_size) { + fprintf(stderr, "Failed to read bitmap from %s\n", filename); + return -1; + } + birows->bitmap_is_dirty = 0; + birows->vmstate_is_saved = 0; + if(irow_check_bitmap(birows)) { + birows->complete_image = 1; + } else { + birows->complete_image = 0; + } + return ret; +} + +static int irow_open_vd(BDRVIrowState *birows, const char *filename, int flags) { + int ret; + birows->irow_irvd = bdrv_new (""); + ret = bdrv_file_open(&birows->irow_irvd, filename, flags); + return ret; +} + +static int irow_open_data(BDRVIrowState *birows, int flags) { + + int ret = 0; + + if(birows->opened_btmp_file == NULL || birows->opened_btmp_file[0] == '\0') { + fprintf (stderr, "Void btmp file name\n"); + ret = -1; + goto end; + } + if(irow_open_btmp(birows, birows->opened_btmp_file, flags) < 0) { + fprintf (stderr, "Failed to open %s\n", birows->opened_btmp_file); + ret = -1; + goto end; + } + + if(birows->irvd_file == NULL || birows->irvd_file[0] == '\0') { + fprintf (stderr, "Void irvd file name\n"); + ret = -1; + goto end; + } + if(irow_open_vd(birows, birows->irvd_file, flags) < 0) { + fprintf (stderr, "Failed to open %s\n", birows->irvd_file); + ret = -1; + goto end; + } + +end: + return ret; +} + +static int irow_find_snapshot_by_btmp(BDRVIrowState *birows, const char *btmp) { + int i; + + for(i = 0; i < birows->nb_snapshots; i++) { + if(birows->snapshots[i].btmp_file != NULL) { + if(strcmp(birows->snapshots[i].btmp_file, btmp) == 0) { + return i; + } + } + } + return -1; +} + +static int 
irow_load_info_from_snapshot(BDRVIrowState *birows, int snapshot_index) { + IRowSnapshot *snap; + int ret = 0; + + if(snapshot_index < 0) { + fprintf (stderr, "Invalid snapshot index.\n"); + ret = -1; + goto end; + } + snap = birows->snapshots + snapshot_index; + if(snap->btmp_file == NULL) { + fprintf (stderr, "Void btmp file name in snap info\n"); + ret = -1; + goto end; + } + if(snap->irvd_file == NULL) { + fprintf (stderr, "Void irvd file name in snap info\n"); + ret = -1; + goto end; + } + birows->opened_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + birows->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(birows->opened_btmp_file, snap->btmp_file, MAX_FILE_NAME_LENGTH); + strncpy(birows->irvd_file, snap->irvd_file, MAX_FILE_NAME_LENGTH); + if(snap->father_btmp_file) { + birows->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(birows->father_btmp_file, snap->father_btmp_file, MAX_FILE_NAME_LENGTH); + } + birows->vm_state_size = snap->vm_state_size; +end: + return ret; +} + +static BDRVIrowState *irow_open_previous_state(BDRVIrowState *birows, int snap_index) { + BDRVIrowState *new_birows = g_malloc0(sizeof(BDRVIrowState)); + + new_birows->cluster_size = birows->cluster_size; + new_birows->cluster_bits = birows->cluster_bits; + new_birows->total_clusters = birows->total_clusters; + new_birows->sectors_per_cluster = birows->sectors_per_cluster; + new_birows->disk_size = birows->disk_size; + new_birows->bitmap_size = birows->bitmap_size; + new_birows->current_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strcpy(new_birows->current_btmp_file, birows->current_btmp_file); + + new_birows->nb_snapshots = birows->nb_snapshots; + new_birows->irow_meta = birows->irow_meta; + irow_open_snapshots(new_birows); + + if(irow_load_info_from_snapshot(new_birows, snap_index) < 0) { + goto fail; + } + new_birows->open_flags = birows->open_flags; + if(irow_open_data(new_birows, new_birows->open_flags) < 0) { + goto fail; + } + + return new_birows; + +fail: + 
if(new_birows != NULL) { + irow_close_previous_state(new_birows); + new_birows = NULL; + } + + return NULL; +} + +static int irow_init_birows_cache(BDRVIrowState *birows) { + int ret = 0; + birows_cache = g_malloc0(sizeof(BDRVIrowState *) * birows->nb_snapshots); + if(birows_cache == NULL) { + ret = -1; + goto end; + } +end: + return ret; +} + +static int irow_open(BlockDriverState *bs, int flags) { + BDRVIrowState *s = bs->opaque; + + int snap_index; + + s->open_flags = flags; + if(irow_open_meta(bs, s, bs->filename, flags) < 0) { + fprintf (stderr, "Failed to open %s\n", bs->filename); + goto fail; + } + + snap_index = irow_find_snapshot_by_btmp(s, s->current_btmp_file); + if(irow_load_info_from_snapshot(s, snap_index) < 0) { + fprintf (stderr, "Failed to load filename from snapshot\n"); + goto fail; + } + + if(irow_open_data(s, flags) < 0) { + goto fail; + } + + if(irow_init_birows_cache(s) < 0) { + fprintf (stderr, "Failed to create birows_cache\n"); + goto fail; + } + return 0; + +fail: + irow_close (bs); + return -1; +} + +static int irow_get_bit(BDRVIrowState *birows, int64_t cluster_index) { + int64_t byte_index, bit_index; + + byte_index = cluster_index >> 3; + bit_index = cluster_index & 0x7; + return (birows->bitmap[byte_index] >> bit_index) & 1; +} + +static void irow_set_bit(BDRVIrowState *birows, int64_t cluster_index) { + int64_t byte_index, bit_index; + int old_bit; + + if(cluster_cache != NULL) { + if(cluster_index == cluster_cache->cluster_num) + cluster_cache->cluster_num = -1; + } + + byte_index = cluster_index >> 3; + bit_index = cluster_index & 0x7; + old_bit = (birows->bitmap[byte_index] >> bit_index) & 1; + if(old_bit == 0) { + birows->bitmap[byte_index] |= (1 << bit_index); + birows->bitmap_is_dirty = 1; + } +} + +static int irow_read_missing_clusters2(BlockDriverState *bs, BDRVIrowState *birows, int64_t start_cluster, int64_t nb_clusters, uint8_t *buf, uint8_t *buf_bitmap, uint64_t buf_start) { + int64_t continuous_missing_clusters, 
continuous_appearing_clusters, i, cluster_index, buf_index; + int64_t backing_len, backing_sector_num, backing_nb_sectors; + uint8_t *backing_buf; + int snap_index, ret = 0; + BlockDriver *drv; + + continuous_missing_clusters = 0; + continuous_appearing_clusters = 0; + for(i = 0; i < nb_clusters; i++) { + if(irow_get_bit(birows, start_cluster + i) == 0) { + buf_bitmap[buf_start + i] = 1; + continuous_missing_clusters += 1; + if(continuous_appearing_clusters != 0) { + if(strcmp(birows->current_btmp_file, birows->opened_btmp_file) != 0) { + cluster_index = start_cluster + i - continuous_appearing_clusters; + buf_index = buf_start + i - continuous_appearing_clusters; + if(cluster_cache != NULL) { + if(cluster_cache->cache != NULL) { + if(cluster_index == cluster_cache->cluster_num) { + memcpy(buf + buf_index * birows->cluster_size, cluster_cache->cache, birows->cluster_size); + cluster_index += 1; + buf_index += 1; + continuous_appearing_clusters -= 1; + if(continuous_appearing_clusters == 0) { + continue; + } + } + } + } + drv = birows->irow_irvd->drv; + if(bdrv_read(birows->irow_irvd, + cluster_index * birows->sectors_per_cluster, + buf + buf_index * birows->cluster_size, + continuous_appearing_clusters * birows->sectors_per_cluster) < 0) { + fprintf(stderr, "Failed to read clusters from %s\n", birows->irvd_file); + ret = -1; + goto end; + } + if(cluster_cache != NULL) { + if(cluster_cache->cache != NULL) { + memcpy(cluster_cache->cache, buf + (buf_start + i - 1) * birows->cluster_size, birows->cluster_size); + cluster_cache->cluster_num = start_cluster + i - 1; + } + } + } + continuous_appearing_clusters = 0; + } + } else { + continuous_appearing_clusters += 1; + if(continuous_missing_clusters != 0) { + if(birows->father_btmp_file != NULL) { + snap_index = irow_find_snapshot_by_btmp(birows, birows->father_btmp_file); + if(birows_cache[snap_index] == NULL) { + birows_cache[snap_index] = irow_open_previous_state(birows, snap_index); + if(birows_cache[snap_index] == 
NULL) { + ret = -1; + goto end; + } + } + ret = irow_read_missing_clusters2(bs, + birows_cache[snap_index], + start_cluster + i - continuous_missing_clusters, + continuous_missing_clusters, + buf, + buf_bitmap, + buf_start + i - continuous_missing_clusters); + + } else { + if(bs->backing_hd) { + backing_len = bdrv_getlength(bs->backing_hd) / 512; + backing_sector_num = (start_cluster + i - continuous_missing_clusters) * birows->sectors_per_cluster; + backing_nb_sectors = continuous_missing_clusters * birows->sectors_per_cluster; + backing_buf = buf + (buf_start + i - continuous_missing_clusters) * birows->cluster_size; + if(backing_sector_num < backing_len) { + if(backing_nb_sectors > backing_len - backing_sector_num) { + backing_nb_sectors = backing_len - backing_sector_num; + } + if(bdrv_read(bs->backing_hd, backing_sector_num, backing_buf, backing_nb_sectors)<0) { + fprintf(stderr, "failed to read base image: %s\n", bs->backing_file); + ret = -1; + goto end; + } + } + } + } + continuous_missing_clusters = 0; + } + } + } + if(continuous_missing_clusters != 0) { + if(birows->father_btmp_file != NULL) { + snap_index = irow_find_snapshot_by_btmp(birows, birows->father_btmp_file); + if(birows_cache[snap_index] == NULL) { + birows_cache[snap_index] = irow_open_previous_state(birows, snap_index); + if(birows_cache[snap_index] == NULL) { + ret = -1; + goto end; + } + } + ret = irow_read_missing_clusters2(bs, + birows_cache[snap_index], + start_cluster + i - continuous_missing_clusters, + continuous_missing_clusters, + buf, + buf_bitmap, + buf_start + i - continuous_missing_clusters); + + } else { + if(bs->backing_hd) { + backing_len = bdrv_getlength(bs->backing_hd) / 512; + backing_sector_num = (start_cluster + i - continuous_missing_clusters) * birows->sectors_per_cluster; + backing_nb_sectors = continuous_missing_clusters * birows->sectors_per_cluster; + backing_buf = buf + (buf_start + i - continuous_missing_clusters) * birows->cluster_size; + if(backing_sector_num < 
backing_len) { + if(backing_nb_sectors > backing_len - backing_sector_num) { + backing_nb_sectors = backing_len - backing_sector_num; + } + if(bdrv_read(bs->backing_hd, backing_sector_num, backing_buf, backing_nb_sectors)<0) { + fprintf(stderr, "failed to read base image: %s\n", bs->backing_file); + ret = -1; + goto end; + } + } + } + } + continuous_missing_clusters = 0; + } + + if(continuous_appearing_clusters != 0) { + if(strcmp(birows->current_btmp_file, birows->opened_btmp_file) != 0) { + cluster_index = start_cluster + i - continuous_appearing_clusters; + buf_index = buf_start + i - continuous_appearing_clusters; + if(cluster_cache != NULL) { + if(cluster_cache->cache != NULL) { + + if(cluster_index == cluster_cache->cluster_num) { + memcpy(buf + buf_index * birows->cluster_size, cluster_cache->cache, birows->cluster_size); + cluster_index += 1; + buf_index += 1; + continuous_appearing_clusters -= 1; + if(continuous_appearing_clusters == 0) { + goto end; + } + } + } + } + drv = birows->irow_irvd->drv; + if(bdrv_read(birows->irow_irvd, + cluster_index * birows->sectors_per_cluster, + buf + buf_index * birows->cluster_size, + continuous_appearing_clusters * birows->sectors_per_cluster) < 0) { + fprintf(stderr, "Failed to read clusters from %s\n", birows->irvd_file); + ret = -1; + } + if(cluster_cache != NULL) { + if(cluster_cache->cache != NULL) { + memcpy(cluster_cache->cache, buf + (buf_start + i - 1) * birows->cluster_size, birows->cluster_size); + cluster_cache->cluster_num = start_cluster + i - 1; + } + } + } + continuous_appearing_clusters = 0; + } + +end: + return ret; +} + +static int irow_read_missing_clusters(BlockDriverState *bs, int64_t first_cluster, int64_t last_cluster, uint8_t *buf, uint8_t *buf_bitmap, int is_read) { + BDRVIrowState *birows = bs->opaque; + int64_t nb_clusters; + int ret = 0; + + if(first_cluster >= birows->total_clusters) { + fprintf (stderr, "Invalid first_cluster!\n"); + ret = -1; + goto end; + } + if(last_cluster >= 
birows->total_clusters) { + fprintf (stderr, "Invalid last_cluster!\n"); + ret = -1; + goto end; + } + + if(is_read) { + nb_clusters = last_cluster - first_cluster + 1; + ret = irow_read_missing_clusters2(bs, birows, first_cluster, nb_clusters, buf, buf_bitmap, 0); + if(ret < 0) + goto end; + + } else { + ret = irow_read_missing_clusters2(bs, birows, first_cluster, 1, buf, buf_bitmap, 0); + if(ret < 0) + goto end; + if(first_cluster != last_cluster) { + ret = irow_read_missing_clusters2(bs, birows, last_cluster, 1, buf, buf_bitmap, 1); + } + } + + +end: + return ret; +} + +static int irow_write_clusters(BDRVIrowState *birows, int64_t cluster_index, const uint8_t *buf, int nb_clusters) { + int ret = 0; + BlockDriver *drv; + + if(cluster_index >= birows->total_clusters) { + fprintf (stderr, "Invalid cluster_index!\n"); + ret = -1; + goto end; + } + if((cluster_index + nb_clusters -1) >= birows->total_clusters) { + fprintf (stderr, "Invalid cluster_index or nb_clusters!\n"); + ret = -1; + goto end; + } + drv = birows->irow_irvd->drv; + ret = bdrv_write(birows->irow_irvd, birows->sectors_per_cluster * cluster_index, buf, birows->sectors_per_cluster * nb_clusters); + +end: + return ret; +} + +static int64_t first_sector_in_cluster(BDRVIrowState *birows, int64_t cluster_index) { + return cluster_index * birows->sectors_per_cluster; +} + +static int64_t last_sector_in_cluster(BDRVIrowState *birows, int64_t cluster_index) { + return (cluster_index + 1) * birows->sectors_per_cluster - 1; +} + +static int irow_assert_clusters(BlockDriverState *bs, ClusterBuffer *cbuf, int64_t sector_num, int nb_sectors, int op_type) { + BDRVIrowState *birows = bs->opaque; + int64_t nb_clusters, i, first_cluster, last_cluster, continuous_cluster, cluster_offset; + uint8_t *buffer_offset;// *zero_buf = NULL; + int ret = 0; + + first_cluster = sector_num / birows->sectors_per_cluster; + last_cluster = (sector_num + nb_sectors - 1) / birows->sectors_per_cluster; + nb_clusters = last_cluster - 
first_cluster + 1; + + switch(op_type) { + case IROW_READ: + case IROW_AIO_READ: + if(irow_read_missing_clusters(bs, first_cluster, last_cluster, cbuf->buf, cbuf->read_from_father, 1) < 0) { + ret = -1; + goto end; + } + + if(birows->copy_on_demand) { + continuous_cluster = 0; + for(i = 0; i < nb_clusters + 1; i++) { + if(cbuf->read_from_father[i] == 0) { + if(continuous_cluster == 0) + continue; + cluster_offset = first_cluster + i - continuous_cluster; + buffer_offset = cbuf->buf + (i - continuous_cluster) * birows->cluster_size; + if(irow_write_clusters(birows, cluster_offset, buffer_offset, continuous_cluster) < 0) { + ret = -1; + goto end; + } + continuous_cluster = 0; + } else { + continuous_cluster += 1; + irow_set_bit(birows, first_cluster + i); + } + } + } + break; + case IROW_WRITE: + case IROW_AIO_WRITE: + if(sector_num == first_sector_in_cluster(birows, first_cluster)) { + if((sector_num + nb_sectors - 1) == last_sector_in_cluster(birows, last_cluster)) { + break; + } else { + if(irow_read_missing_clusters(bs, last_cluster, last_cluster, cbuf->buf, cbuf->read_from_father, 0) < 0) { + ret = -1; + goto end; + } + if(cbuf->read_from_father[0] == 1) { + if(irow_write_clusters(birows, last_cluster , cbuf->buf, 1) < 0) { + ret = -1; + goto end; + } + irow_set_bit(birows, last_cluster); + } + break; + } + } else { + if((sector_num + nb_sectors - 1) == last_sector_in_cluster(birows, last_cluster)) { + if(irow_read_missing_clusters(bs, first_cluster, first_cluster, cbuf->buf, cbuf->read_from_father, 0) < 0) { + ret = -1; + goto end; + } + if(cbuf->read_from_father[0] == 1) { + if(irow_write_clusters(birows, first_cluster , cbuf->buf, 1) < 0) { + ret = -1; + goto end; + } + irow_set_bit(birows, first_cluster); + } + break; + } else { + if(irow_read_missing_clusters(bs, first_cluster, last_cluster, cbuf->buf, cbuf->read_from_father, 0) < 0) { + ret = -1; + goto end; + } + if(cbuf->read_from_father[0] == 1) { + if(irow_write_clusters(birows, first_cluster, 
cbuf->buf, 1) < 0) { + ret = -1; + goto end; + } + irow_set_bit(birows, first_cluster); + } + if(cbuf->read_from_father[1] == 1) { + if(irow_write_clusters(birows, last_cluster, cbuf->buf + birows->cluster_size, 1) < 0) { + ret = -1; + goto end; + } + irow_set_bit(birows, last_cluster); + } + break; + } + } + } + +end: + return ret; +} + +static int irow_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { + + BDRVIrowState *s = bs->opaque; + int64_t first_cluster, last_cluster, nb_clusters, sector_index, cluster_index, buf_offset, temp_buf_offset, temp_buf_index; + int first_cluster_copied = 0; + BlockDriver *drv; + ClusterBuffer cbuf; + int remain_sectors, cbuf_offset, len, ret = 0; + uint8_t *temp_buf = NULL; + + first_cluster = sector_num / s->sectors_per_cluster; + last_cluster = (sector_num + nb_sectors - 1) / s->sectors_per_cluster; + nb_clusters = last_cluster - first_cluster + 1; + temp_buf_offset = (sector_num & (s->sectors_per_cluster - 1)) * BDRV_SECTOR_SIZE; + temp_buf_index = 0; + cbuf.buf = NULL; + cbuf.read_from_father = NULL; + + if(first_cluster >= s->total_clusters) { + fprintf (stderr, "Invalid sector_num.\n"); + ret = -1; + goto end; + } + if(last_cluster >= s->total_clusters) { + fprintf (stderr, "Invalid nb_sectors.\n"); + ret = -1; + goto end; + } + + temp_buf = qemu_memalign(512, nb_clusters * s->cluster_size); + memset(temp_buf, 0, nb_clusters * s->cluster_size); + if(temp_buf == NULL) { + fprintf (stderr, "Failed to create temp_buf.\n"); + ret = -1; + goto end; + } + if(cluster_cache != NULL) { + if(cluster_cache->cache != NULL) { + if(first_cluster == cluster_cache->cluster_num) { + memcpy(temp_buf, cluster_cache->cache, s->cluster_size); + first_cluster_copied = 1; + first_cluster += 1; + nb_clusters -= 1; + temp_buf_index += 1; + } + } + } + + if(nb_clusters != 0) { + drv = s->irow_irvd->drv; + ret = bdrv_read(s->irow_irvd, first_cluster * s->sectors_per_cluster, temp_buf + temp_buf_index * s->cluster_size, 
nb_clusters * s->sectors_per_cluster); + if(ret < 0) { + goto end; + } + } + + memcpy(buf, temp_buf + temp_buf_offset, nb_sectors * BDRV_SECTOR_SIZE); + + if(nb_clusters != 0) { + if(first_cluster_copied) { + first_cluster -= 1; + nb_clusters += 1; + } + if(cluster_cache != NULL) { + if(cluster_cache->cache != NULL) { + if(irow_get_bit(s, last_cluster)) { + memcpy(cluster_cache->cache, temp_buf + (nb_clusters - 1) * s->cluster_size, s->cluster_size); + cluster_cache->cluster_num = last_cluster; + } + } + } + + if(s->complete_image != 1) { + cbuf.buf = qemu_memalign(512, nb_clusters * s->cluster_size); + memset(cbuf.buf, 0, nb_clusters * s->cluster_size); + cbuf.read_from_father = g_malloc0(nb_clusters + 1); + + if(irow_assert_clusters(bs, &cbuf, first_sector_in_cluster(s, first_cluster), nb_clusters * s->sectors_per_cluster, IROW_READ) < 0) { + fprintf (stderr, "irow_assert_clusters() failed.\n"); + ret = -1; + goto end; + } + + irow_update_btmp(s); + + sector_index = sector_num; + remain_sectors = nb_sectors; + buf_offset = 0; + + while(remain_sectors > 0) { + cluster_index = sector_index / s->sectors_per_cluster; + len = last_sector_in_cluster(s, cluster_index) - sector_index + 1; + if(len > remain_sectors) + len = remain_sectors; + + if(cbuf.read_from_father[cluster_index - first_cluster] == 1) { + cbuf_offset = (sector_index & (s->sectors_per_cluster - 1)) + (cluster_index - first_cluster) * s->sectors_per_cluster; + memcpy(buf + buf_offset, cbuf.buf + cbuf_offset * BDRV_SECTOR_SIZE, len * BDRV_SECTOR_SIZE); + } + sector_index = first_sector_in_cluster(s, cluster_index + 1); + remain_sectors -= len; + buf_offset += len * BDRV_SECTOR_SIZE; + } + } + + } + +end: + if(cbuf.buf != NULL) { + g_free(cbuf.buf); + cbuf.buf = NULL; + } + if(cbuf.read_from_father != NULL) { + g_free(cbuf.read_from_father); + cbuf.read_from_father = NULL; + } + if(temp_buf != NULL) { + g_free(temp_buf); + temp_buf = NULL; + } + return ret; +} + +static int irow_write(BlockDriverState *bs, 
int64_t sector_num, const uint8_t *buf, int nb_sectors) { + BDRVIrowState *s = bs->opaque; + int64_t first_cluster, last_cluster, current_cluster; + ClusterBuffer cbuf; + BlockDriver *drv; + int ret = 0; + + first_cluster = sector_num / s->sectors_per_cluster; + last_cluster = (sector_num + nb_sectors - 1) / s->sectors_per_cluster; + + + if(first_cluster >= s->total_clusters) { + fprintf (stderr, "Invalid sector_num!\n"); + ret = -1; + goto end; + } + if(last_cluster >= s->total_clusters) { + fprintf (stderr, "Invalid nb_sectors!\n"); + ret = -1; + goto end; + } + + cbuf.buf = NULL; + cbuf.read_from_father = NULL; + if(s->complete_image != 1) { + cbuf.buf = qemu_memalign(512, 2 * s->cluster_size); + memset(cbuf.buf, 0, 2 * s->cluster_size); + cbuf.read_from_father = g_malloc0(2); + if(irow_assert_clusters(bs, &cbuf, sector_num, nb_sectors, IROW_WRITE) < 0) { + ret = -1; + goto end; + } + } + + for(current_cluster = first_cluster; current_cluster <= last_cluster; current_cluster++) { + irow_set_bit(s, current_cluster); + } + + drv = s->irow_irvd->drv; + ret = bdrv_write(s->irow_irvd, sector_num, buf, nb_sectors); + if(ret < 0) { + goto end; + } + + if(irow_update_btmp(s) < 0) { + fprintf (stderr, "Failed to update btmp file. 
(%s)\n", s->opened_btmp_file); + ret = -1; + goto end; + } + +end: + if(cbuf.buf != NULL) { + g_free(cbuf.buf); + cbuf.buf = NULL; + } + if(cbuf.read_from_father != NULL) { + g_free(cbuf.read_from_father); + cbuf.read_from_father = NULL; + } + + return ret; +} + +static int irow_generate_filename(char *dest, const char *prefix, const char *body, const char *suffix) { + if(strlen(prefix) + strlen(body) + strlen(suffix) + 2 >= MAX_FILE_NAME_LENGTH) { + fprintf(stderr, "Invalid filename length, max is %d\n", MAX_FILE_NAME_LENGTH); + return -1; + } + strcpy(dest, prefix); + strcat(dest, "-"); + strcat(dest, body); + strcat(dest, "."); + strcat(dest, suffix); + return 0; +} + +static int irow_create_meta(IRowCreateState *cs) { + IRowMeta meta; + IRowSnapshotHeader snap_header; + uint32_t cluster_size, copy_on_demand; + uint64_t disk_size; + qemu_timeval tv; + int fd, cluster_bits, ret = 0; + + if(cs->disk_size == 0) { + fprintf(stderr, "Invalid disk_size\n"); + ret = -1; + goto end; + } + disk_size = cs->disk_size; + + if(cs->cluster_size == 0) { + fprintf(stderr, "Invalid cluster_size\n"); + ret = -1; + goto end; + } + cluster_size = cs->cluster_size; + + cluster_bits = get_bits_from_size(cluster_size); + cs->cluster_bits = cluster_bits; + if ((cluster_bits < MIN_CLUSTER_BITS) || (cluster_bits > MAX_CLUSTER_BITS)) { + fprintf(stderr, "Cluster size must be a power of two between %d and %dk\n", + 1 << MIN_CLUSTER_BITS, + 1 << (MAX_CLUSTER_BITS - 10)); + ret = -1; + goto end; + + } + copy_on_demand = cs->copy_on_demand; + if(cs->meta_file[0] == '\0') { + fprintf(stderr, "Void meta file name\n"); + ret = -1; + goto end; + } + fd = open(cs->meta_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); + if (fd < 0) { + fprintf(stderr, "Can not open %s\n", cs->meta_file); + ret = -1; + goto end; + } + memset(&meta, 0, sizeof(meta)); + meta.magic = cpu_to_be32(IROW_MAGIC); + meta.version = cpu_to_be32(IROW_VERSION); + meta.copy_on_demand = cpu_to_be32(copy_on_demand); + 
meta.cluster_size = cpu_to_be32(cluster_size); + meta.cluster_bits = cpu_to_be32(cluster_bits); + meta.total_clusters = cpu_to_be64((disk_size + cluster_size -1) >> cluster_bits); + meta.sectors_per_cluster = cpu_to_be32(cluster_size >> BDRV_SECTOR_BITS); + meta.disk_size = cpu_to_be64(disk_size); + meta.nb_snapshots = cpu_to_be32(1); + + if(irow_generate_filename(meta.current_btmp, cs->meta_file, cs->time_value, "btmp") < 0) { + ret = -1; + goto end; + } + + if(irow_generate_filename(cs->irvd_file, cs->meta_file, cs->time_value, "irvd") < 0) { + ret = -1; + goto end; + } + + if(cs->backing_file != NULL) { + strncpy(meta.backing_file, cs->backing_file, MAX_FILE_NAME_LENGTH); + } + + strncpy(cs->btmp_file, meta.current_btmp, MAX_FILE_NAME_LENGTH); + + memset(&snap_header, 0, sizeof(snap_header)); + + snap_header.snap_magic = cpu_to_be32(IROW_SNAPHEADER_MAGIC); + sprintf(snap_header.id_str, "0"); + sprintf(snap_header.name, "current state"); + strncpy(snap_header.btmp_file, cs->btmp_file, MAX_FILE_NAME_LENGTH); + strncpy(snap_header.irvd_file, cs->irvd_file, MAX_FILE_NAME_LENGTH); + qemu_gettimeofday(&tv); + snap_header.date_sec = tv.tv_sec; + snap_header.date_nsec = tv.tv_usec * 1000; + snap_header.nb_children = 0; + snap_header.is_deleted = 0; + + if(write(fd, &meta, sizeof(meta))==-1){ + ret = -1; + goto end; + } + if(write(fd, &snap_header, sizeof(snap_header))==-1){ + ret = -1; + goto end; + } + + if(close(fd) != 0) { + ret = -1; + } +end: + return ret; +} + +static int irow_create_btmp(IRowCreateState *cs) { + + char *bitmap = NULL; + int fd, bitmap_size, ret = 0; + + if(cs->btmp_file[0] == '\0') { + fprintf(stderr, "Void btmp file name\n"); + ret = -1; + goto end; + } + fd = open(cs->btmp_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); + if(fd < 0) { + fprintf(stderr, "Can not open %s\n", cs->btmp_file); + ret = -1; + goto end; + } + + bitmap_size = (((cs->disk_size + cs->cluster_size - 1) >> cs->cluster_bits) + 7) >> 3; + bitmap = g_malloc(bitmap_size); 
+ memset(bitmap, 0, bitmap_size); + + if(write(fd, bitmap, bitmap_size)==-1){ + ret = -1; + goto end; + } + + if(close(fd) != 0) { + ret = -1; + } + +end: + if(bitmap != NULL) + g_free(bitmap); + return ret; +} + +static int irow_create_vd(IRowCreateState *cs) { + int fd, ret = 0; + + if(cs->irvd_file[0] == '\0') { + fprintf(stderr, "Void irvd file name\n"); + ret = -1; + goto end; + } + + fd = open(cs->irvd_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); + if(fd < 0) { + fprintf(stderr, "Can not open %s\n", cs->irvd_file); + ret = -1; + goto end; + } + if(fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, cs->disk_size) < 0) { + ; + } + if (ftruncate(fd, cs->disk_size) != 0) { + fprintf(stderr, "Can not truncate %s to %" PRId64 " bytes\n", cs->irvd_file, cs->disk_size); + ret = -1; + } + if (close(fd) != 0) { + ret = -1; + } + + +end: + return ret; +} + +static IRowCreateState *irow_create_state_new(void) { + IRowCreateState *cs = g_malloc0(sizeof(IRowCreateState)); + qemu_timeval tv; + + cs->meta_file = g_malloc0(MAX_FILE_NAME_LENGTH); + cs->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + cs->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH); + cs->time_value = g_malloc0(MAX_FILE_NAME_LENGTH); + cs->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + + qemu_gettimeofday(&tv); + sprintf(cs->time_value, "%lx%lx", tv.tv_sec, tv.tv_usec); + return cs; +} + +static void irow_create_state_delete(IRowCreateState *cs) { + if(cs->meta_file != NULL) + g_free(cs->meta_file); + if(cs->btmp_file != NULL) + g_free(cs->btmp_file); + if(cs->irvd_file != NULL) + g_free(cs->irvd_file); + if(cs->time_value != NULL) + g_free(cs->time_value); + if(cs->father_btmp_file != NULL) + g_free(cs->father_btmp_file); + g_free(cs); +} + +static int irow_create(const char *filename, QEMUOptionParameter *options) { + IRowCreateState *cs = irow_create_state_new(); + int ret = 0; + + if(cs == NULL) { + ret = -1; + goto end; + } + cs->cluster_size = 65536; + cs->copy_on_demand = 0; + cs->backing_file = NULL; + 
strncpy(cs->meta_file, filename, MAX_FILE_NAME_LENGTH); + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + cs->disk_size= options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { + if (options->value.n) { + cs->cluster_size = options->value.n; + } + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + cs->backing_file = options->value.s; + } else if(!strcmp(options->name, "copy_on_demand")) { + cs->copy_on_demand = options->value.n; + } + options++; + } + + if(irow_create_meta(cs) < 0) { + fprintf(stderr, "Fail to create meta file of %s\n", filename); + ret = -1; + goto end; + } + + if(irow_create_btmp(cs) < 0) { + fprintf(stderr, "Fail to create bitmap file of %s\n", filename); + ret = -1; + goto end; + } + + if(irow_create_vd(cs) < 0) { + fprintf(stderr, "Fail to create virtual machine disk file of %s\n", filename); + ret = -1; + goto end; + } + +end: + if(cs != NULL) { + irow_create_state_delete(cs); + } + return ret; +} + +static int coroutine_fn irow_flush(BlockDriverState *bs) { + BDRVIrowState *s = bs->opaque; + + return bdrv_flush(s->irow_irvd); +} + +typedef struct IRowAIOCB { + BlockDriverAIOCB common; + int64_t sector_num; + QEMUIOVector *qiov; + int nb_sectors; + BlockDriverAIOCB *irvd_aiocb; + +} IRowAIOCB; + +static void irow_aio_cancel(BlockDriverAIOCB *blockacb) +{ + IRowAIOCB *acb = (IRowAIOCB *)blockacb; + if (acb->irvd_aiocb) + bdrv_aio_cancel(acb->irvd_aiocb); + qemu_aio_release(acb); +} + +static AIOCBInfo irow_aio_pool = { + .aiocb_size = sizeof(IRowAIOCB), + .cancel = irow_aio_cancel, +}; + + +static IRowAIOCB *irow_aio_setup(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + IRowAIOCB *acb; + + acb = qemu_aio_get(&irow_aio_pool, bs, cb, opaque); + if (!acb) + return NULL; + acb->irvd_aiocb = NULL; + acb->sector_num = sector_num; + acb->qiov = qiov; + acb->nb_sectors = nb_sectors; + return acb; 
+} + +static void irow_aio_readv_cb(void *opaque, int ret) { + IRowAIOCB *acb = opaque; + BlockDriverState *bs = acb->common.bs; + BDRVIrowState *birows = bs->opaque; + int64_t first_cluster, last_cluster, nb_clusters, sector_index, cluster_index, buf_offset; + ClusterBuffer cbuf; + void *buf = NULL; + int remain_sectors, cbuf_offset, len; + + if(ret < 0) { + fprintf(stderr, "aio_readv failed\n"); + goto end; + } + first_cluster = acb->sector_num / birows->sectors_per_cluster; + last_cluster = (acb->sector_num + acb->nb_sectors - 1) / birows->sectors_per_cluster; + + if(first_cluster >= birows->total_clusters) { + fprintf (stderr, "Invalid sector_num.\n"); + ret = -1; + goto end; + } + if(last_cluster >= birows->total_clusters) { + fprintf (stderr, "Invalid nb_sectors.\n"); + ret = -1; + goto end; + } + + cbuf.buf = NULL; + cbuf.read_from_father = NULL; + if(birows->complete_image != 1) { + nb_clusters = last_cluster - first_cluster + 1; + cbuf.buf = qemu_memalign(512, nb_clusters * birows->cluster_size); + memset(cbuf.buf, 0, nb_clusters * birows->cluster_size); + cbuf.read_from_father = g_malloc0(nb_clusters + 1); + if(irow_assert_clusters(bs, &cbuf, acb->sector_num, acb->nb_sectors, IROW_AIO_READ) < 0) { + fprintf (stderr, "irow_assert_clusters() failed.\n"); + ret = -1; + goto end; + } + irow_update_btmp(birows); + + buf = g_malloc(acb->qiov->size); + qemu_iovec_to_buf(acb->qiov, 0, buf, acb->qiov->size); + + sector_index = acb->sector_num; + remain_sectors = acb->nb_sectors; + buf_offset = 0; + while(remain_sectors > 0) { + cluster_index = sector_index / birows->sectors_per_cluster; + len = last_sector_in_cluster(birows, cluster_index) - sector_index + 1; + if(len > remain_sectors) + len = remain_sectors; + if(cbuf.read_from_father[cluster_index - first_cluster] == 1) { + cbuf_offset = (sector_index & (birows->sectors_per_cluster - 1)) + (cluster_index - first_cluster) * birows->sectors_per_cluster; + memcpy(buf + buf_offset, cbuf.buf + cbuf_offset * 
BDRV_SECTOR_SIZE, len * BDRV_SECTOR_SIZE); + } + sector_index = first_sector_in_cluster(birows, cluster_index + 1); + remain_sectors -= len; + buf_offset += len * BDRV_SECTOR_SIZE; + } + + qemu_iovec_from_buf(acb->qiov, 0, buf, acb->qiov->size); + } + + end: + if(buf != NULL) { + g_free(buf); + buf = NULL; + } + if(cbuf.buf != NULL) { + g_free(cbuf.buf); + cbuf.buf = NULL; + } + if(cbuf.read_from_father != NULL) { + g_free(cbuf.read_from_father); + cbuf.read_from_father = NULL; + } + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); +} + +static BlockDriverAIOCB *irow_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) { + IRowAIOCB *acb; + BDRVIrowState *birows = bs->opaque; + BlockDriver *drv; + + acb = irow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque); + if (!acb) + return NULL; + drv = birows->irow_irvd->drv; + acb->irvd_aiocb = drv->bdrv_aio_readv(birows->irow_irvd, sector_num, qiov, nb_sectors, irow_aio_readv_cb, acb); + if(acb->irvd_aiocb == NULL){ + qemu_aio_release(acb); + return NULL; + } + return &acb->common; +} + +static BlockDriverAIOCB *irow_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) { + BDRVIrowState *s = bs->opaque; + int64_t first_cluster, last_cluster, current_cluster; + ClusterBuffer cbuf; + BlockDriver *drv; + BlockDriverAIOCB *ret = NULL; + + first_cluster = sector_num / s->sectors_per_cluster; + last_cluster = (sector_num + nb_sectors - 1) / s->sectors_per_cluster; + + if(first_cluster >= s->total_clusters) { + fprintf (stderr, "Invalid sector_num!\n"); + goto end; + } + if(last_cluster >= s->total_clusters) { + fprintf (stderr, "Invalid nb_sectors!\n"); + goto end; + } + cbuf.buf = NULL; + cbuf.read_from_father = NULL; + if(s->complete_image != 1) { + cbuf.buf = qemu_memalign(512, 2 * s->cluster_size); + cbuf.read_from_father = 
g_malloc0(2); + if(irow_assert_clusters(bs, &cbuf, sector_num, nb_sectors, IROW_AIO_WRITE) < 0) { + fprintf (stderr, "irow_assert_clusters() failed.\n"); + goto end; + } + } + + for(current_cluster = first_cluster; current_cluster <= last_cluster; current_cluster++) { + irow_set_bit(s, current_cluster); + } + + drv = s->irow_irvd->drv; + ret = drv->bdrv_aio_writev(s->irow_irvd, sector_num, qiov, nb_sectors, cb, opaque ); + if(ret == NULL) { + goto end; + } + + if(irow_update_btmp(s) < 0) { + fprintf (stderr, "Failed to update btmp file. (%s)\n", s->opened_btmp_file); + ret = NULL; + goto end; + } + +end: + if(cbuf.buf != NULL) { + g_free(cbuf.buf); + cbuf.buf = NULL; + } + if(cbuf.read_from_father != NULL) { + g_free(cbuf.read_from_father); + cbuf.read_from_father = NULL; + } + return ret; +} + +static BlockDriverAIOCB *irow_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) { + BDRVIrowState *s = bs->opaque; + BlockDriverAIOCB *ret = NULL; + + ret = bdrv_aio_flush(s->irow_irvd, cb, opaque); + + return ret; +} + +static void irow_new_snapshot_id(BDRVIrowState *birows, char *id_str, int id_str_size) { + IRowSnapshot *snap_ptr; + uint i, id, found; + + for(id = 1; id < 0xffffffff; id++) { + found = 1; + for(i = 0; i < birows->nb_snapshots; i++) { + snap_ptr = birows->snapshots + i; + if(snap_ptr->id_str != NULL) { + if(id == strtoul(snap_ptr->id_str, NULL, 10)) { + found = 0; + break; + } + } + } + if(found) + break; + } + snprintf(id_str, id_str_size, "%d", id); +} + +static int irow_find_snapshot_by_id(BDRVIrowState *birows, const char *id_str) { + int i; + + for(i = 0; i < birows->nb_snapshots; i++) { + if(birows->snapshots[i].id_str != NULL) { + if(strcmp(birows->snapshots[i].id_str, id_str) == 0) { + return i; + } + } + } + return -1; +} + +static int irow_find_snapshot_by_name(BDRVIrowState *birows, const char *name) { + int i; + + for(i = 0; i < birows->nb_snapshots; i++) { + if(birows->snapshots[i].name != NULL) { + 
if(strcmp(birows->snapshots[i].name, name) == 0) { + return i; + } + } + } + return -1; +} + +static int irow_find_free_snapshot(BDRVIrowState *birows) { + int i; + + for(i = 0; i < birows->nb_snapshots; i++) { + if(birows->snapshots[i].nb_children == 0 && birows->snapshots[i].is_deleted == 1) { + return i; + } + } + return -1; +} + +static int irow_update_nb_children(BDRVIrowState *birows, IRowSnapshot *snap, int value) { + IRowSnapshot *father_snap; + int snap_index, ret = 0; + snap->nb_children += value; + if(snap->nb_children == 0 && snap->is_deleted == 1) { + if(snap->father_btmp_file) { + snap_index = irow_find_snapshot_by_btmp(birows, snap->father_btmp_file); + if(snap_index < 0) { + fprintf(stderr, "Failed to find father snapshot\n"); + ret = -1; + goto end; + } + father_snap = birows->snapshots + snap_index; + irow_update_nb_children(birows, father_snap, value); + } + } + +end: + return ret; +} + +static int irow_snapshot_add(BDRVIrowState *birows, IRowCreateState *cs, QEMUSnapshotInfo *sn_info) { + IRowSnapshot *new_snap, *snap; + qemu_timeval tv; + int snap_index; + + birows->snapshots = g_realloc(birows->snapshots, (birows->nb_snapshots + 1) * sizeof(IRowSnapshot)); + + snap_index = irow_find_snapshot_by_btmp(birows, birows->current_btmp_file); + if(snap_index < 0) { + return -1; + } + snap = birows->snapshots + snap_index; + + new_snap = birows->snapshots + birows->nb_snapshots; + memset(new_snap, 0, sizeof(IRowSnapshot)); + + snap->date_sec = sn_info->date_sec; + snap->date_nsec = sn_info->date_nsec; + snap->vm_clock_nsec = sn_info->vm_clock_nsec; + snap->vm_state_size = sn_info->vm_state_size; + irow_update_nb_children(birows, snap, 1); + + if(snap->id_str == NULL) { + snap->id_str = g_malloc0(128); + } else { + memset(snap->id_str, 0, 128); + } + strncpy(snap->id_str, sn_info->id_str, 128); + + if(snap->name == NULL) { + snap->name = g_malloc0(256); + } else { + memset(snap->name, 0, 256); + } + strncpy(snap->name, sn_info->name, 256); + + 
new_snap->id_str = g_malloc0(128); + sprintf(new_snap->id_str, "0"); + new_snap->name = g_malloc0(256); + sprintf(new_snap->name, "current state"); + new_snap->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(new_snap->btmp_file, cs->btmp_file, MAX_FILE_NAME_LENGTH); + new_snap->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(new_snap->irvd_file, cs->irvd_file, MAX_FILE_NAME_LENGTH); + if(cs->father_btmp_file != NULL) { + new_snap->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(new_snap->father_btmp_file, cs->father_btmp_file, MAX_FILE_NAME_LENGTH); + } + qemu_gettimeofday(&tv); + new_snap->date_sec = tv.tv_sec; + new_snap->date_nsec = tv.tv_usec * 1000; + + birows->nb_snapshots += 1; + birows_cache = g_realloc(birows_cache, sizeof(BDRVIrowState *) * birows->nb_snapshots); + memset(birows_cache, 0, sizeof(BDRVIrowState *) * birows->nb_snapshots); + birows->snapshots_is_dirty = 1; + + return 0; +} + +static void irow_snapshot_copy(IRowSnapshot *dst, IRowSnapshot *src) { + + if(src->id_str) { + dst->id_str = g_malloc0(128); + strncpy(dst->id_str, src->id_str, 128); + } + if(src->name) { + dst->name = g_malloc0(256); + strncpy(dst->name, src->name, 256); + } + if(src->btmp_file) { + dst->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(dst->btmp_file, src->btmp_file, MAX_FILE_NAME_LENGTH); + } + if(src->irvd_file) { + dst->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(dst->irvd_file, src->irvd_file, MAX_FILE_NAME_LENGTH); + } + if(src->father_btmp_file) { + dst->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH); + strncpy(dst->father_btmp_file, src->father_btmp_file, MAX_FILE_NAME_LENGTH); + } + dst->date_sec = src->date_sec; + dst->date_nsec = src->date_nsec; + dst->vm_clock_nsec = src->vm_clock_nsec; + dst->vm_state_size = src->vm_state_size; + dst->nb_children = src->nb_children; + dst->is_deleted = src->is_deleted; +} + +static int irow_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) { + BDRVIrowState *s 
= bs->opaque; + IRowCreateState *cs = NULL; + IRowSnapshot *free_snap, *old_snap, *snap; + int snap_index, offset, ret = 0; + + if(sn_info->id_str[0] == '\0') { + irow_new_snapshot_id(s, sn_info->id_str, sizeof(sn_info->id_str)); + } + + if(irow_find_snapshot_by_id(s, sn_info->id_str) >= 0) { + fprintf(stderr, "Duplicated snapshot id\n"); + ret = -1; + goto end; + } + + if(irow_find_snapshot_by_name(s, sn_info->name) >= 0) { + fprintf(stderr, "Duplicated snapshot name\n"); + ret = -1; + goto end; + } + + cs = irow_create_state_new(); + cs->cluster_bits = s->cluster_bits; + cs->cluster_size = s->cluster_size; + cs->disk_size = s->disk_size; + strncpy(cs->meta_file, s->meta_file, MAX_FILE_NAME_LENGTH); + strncpy(cs->father_btmp_file, s->current_btmp_file, MAX_FILE_NAME_LENGTH); // 其father文件为老的当前镜像 + + snap_index = irow_find_free_snapshot(s); + if(snap_index >= 0) { + free_snap = s->snapshots + snap_index; + strcpy(cs->btmp_file, free_snap->btmp_file); + strcpy(cs->irvd_file, free_snap->irvd_file); + old_snap = s->snapshots; + s->snapshots = g_malloc0((s->nb_snapshots - 1) * sizeof(IRowSnapshot)); + offset = 0; + for(snap_index = 0; snap_index < s->nb_snapshots; snap_index++) { + snap = old_snap + snap_index; + if(snap != free_snap) { + irow_snapshot_copy(s->snapshots + offset, snap); + offset += 1; + } + } + + irow_close_snapshots2(old_snap, s->nb_snapshots); + s->nb_snapshots -= 1; + } else { + irow_generate_filename(cs->btmp_file, cs->meta_file, cs->time_value, "btmp"); + irow_generate_filename(cs->irvd_file, cs->meta_file, cs->time_value, "irvd"); + + if(irow_create_btmp(cs) < 0) { + fprintf(stderr, "Failed to create new btmp file (%s)\n", cs->btmp_file); + ret = -1; + goto end; + } + + if(irow_create_vd(cs) < 0) { + fprintf(stderr, "Failed to create new irvd file (%s)\n", cs->irvd_file); + ret = -1; + goto end; + } + } + + if(irow_snapshot_add(s, cs, sn_info) < 0) { + fprintf(stderr, "Failed to add new snapshot in mem\n"); + ret = -1; + goto end; + } + + 
if(irow_update_meta(s, cs->btmp_file, 0) < 0) { + fprintf(stderr, "Failed to update meta file (%s)\n", s->meta_file); + ret = -1; + goto end; + } + + s->vm_state_size = sn_info->vm_state_size; + irow_update_btmp(s); + + irow_close_btmp(s); + irow_close_irvd(s); + + strncpy(s->current_btmp_file, cs->btmp_file, MAX_FILE_NAME_LENGTH); + snap_index = irow_find_snapshot_by_btmp(s, s->current_btmp_file); + if(irow_load_info_from_snapshot(s, snap_index) < 0) { + ret = -1; + goto end; + } + ret = irow_open_data(s, s->open_flags); + memset(s->bitmap, 0, s->bitmap_size); + s->bitmap_is_dirty = 1; + if(irow_update_btmp(s) < 0) { + fprintf(stderr, "Failed to update btmp file\n"); + ret = -1; + goto end; + } + +end: + if(cs != NULL) { + irow_create_state_delete(cs); + cs = NULL; + } + return ret; +} + +static int64_t irow_vm_state_offset(BDRVIrowState *birows) { + return birows->bitmap_size; +} + +static int irow_load_vmstate2(BDRVIrowState *birows, uint8_t *buf, int64_t pos, int size) { + + return bdrv_pread(birows->irow_btmp, irow_vm_state_offset(birows) + pos, buf, size); + +} + +static int irow_save_vmstate2(BDRVIrowState *birows, const uint8_t *buf, int64_t pos, int size) { + birows->vmstate_is_saved = 1; + return bdrv_pwrite(birows->irow_btmp, irow_vm_state_offset(birows) + pos, buf, size); + +} + +static int irow_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) { + + BDRVIrowState *s = bs->opaque; + IRowSnapshot *target_snap, *current_snap, *father_snap; + int snap_index, ret = 0; + + if(strcmp(snapshot_id, "0") == 0 || strcmp(snapshot_id, "current state") == 0) { + fprintf(stderr, "No need to goto current state.\n"); + goto end; + } + + snap_index = irow_find_snapshot_by_id(s, snapshot_id); + if(snap_index < 0) { + snap_index = irow_find_snapshot_by_name(s, snapshot_id); + if(snap_index < 0) { + fprintf(stderr, "Failed to find snapshot %s\n", snapshot_id); + ret = -1; + goto end; + } + } + target_snap = s->snapshots + snap_index; + + 
if(target_snap->is_deleted) { + fprintf(stderr, "Can not go to deleted snapshot %s\n", snapshot_id); + ret = -1; + goto end; + } + + snap_index = irow_find_snapshot_by_btmp(s, s->current_btmp_file); + if(snap_index < 0) { + fprintf(stderr, "Failed to find current state.\n"); + ret = -1; + goto end; + } + current_snap = s->snapshots + snap_index; + snap_index = irow_find_snapshot_by_btmp(s, s->father_btmp_file); + if(snap_index < 0) { + fprintf(stderr, "Failed to find father snapshot.\n"); + ret = -1; + goto end; + } + father_snap = s->snapshots + snap_index; + strncpy(s->father_btmp_file, target_snap->btmp_file, MAX_FILE_NAME_LENGTH); + strncpy(current_snap->father_btmp_file, target_snap->btmp_file, MAX_FILE_NAME_LENGTH); + + irow_update_nb_children(s, father_snap, -1); + irow_update_nb_children(s, target_snap, 1); + + current_snap->date_sec = target_snap->date_sec; + current_snap->date_nsec = target_snap->date_nsec; + current_snap->vm_clock_nsec = target_snap->vm_clock_nsec; + current_snap->vm_state_size = 0; + + memset(s->bitmap, 0, s->bitmap_size); + s->bitmap_is_dirty = 1; + if(irow_update_btmp(s) < 0) { + fprintf(stderr, "Failed to update btmp file\n"); + ret = -1; + goto end; + } + + s->snapshots_is_dirty = 1; + if(irow_update_meta(s, NULL, 0) < 0) { + fprintf(stderr, "Failed to update meta file\n"); + ret = -1; + } + + +end: + return ret; +} + +static int irow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) { + + BDRVIrowState *s = bs->opaque; + IRowSnapshot *target_snap, *father_snap; + int snap_index, ret = 0; + + if(strcmp(snapshot_id, "0") == 0 || strcmp(snapshot_id, "current state") == 0) { + fprintf(stderr, "Can not delete current state.\n"); + goto end; + } + + snap_index = irow_find_snapshot_by_id(s, snapshot_id); + if(snap_index < 0) { + snap_index = irow_find_snapshot_by_name(s, snapshot_id); + if(snap_index < 0) { + fprintf(stderr, "Failed to find snapshot %s\n", snapshot_id); + ret = -1; + goto end; + } + } + target_snap = 
s->snapshots + snap_index; + + if(target_snap->is_deleted) { + fprintf(stderr, "Can not delete deleted snapshot %s\n", snapshot_id); + ret = -1; + goto end; + } + + target_snap->is_deleted = 1; + strncat(target_snap->name, "_del", 255-strlen(target_snap->name)); + + if(target_snap->nb_children == 0) { + if(target_snap->father_btmp_file) { + snap_index = irow_find_snapshot_by_btmp(s, target_snap->father_btmp_file); + if(snap_index < 0) { + fprintf(stderr, "Failed to find father snapshot\n"); + ret = -1; + goto end; + } + father_snap = s->snapshots + snap_index; + irow_update_nb_children(s, father_snap, -1); + } + } + + s->snapshots_is_dirty = 1; + irow_update_meta(s, NULL, 0); +end: + return ret; +} + +static int irow_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) { + + BDRVIrowState *s = bs->opaque; + QEMUSnapshotInfo *snap_tab, *snap_info; + IRowSnapshot *snap; + int i, offset, nb_del_snapshots = 0; + + if (s->nb_snapshots == 0) { + *psn_tab = NULL; + return s->nb_snapshots; + } + + for(i = 0; i < s->nb_snapshots; i++) { + snap = s->snapshots + i; + if(snap->is_deleted) + nb_del_snapshots += 1; + } + snap_tab = g_malloc0((s->nb_snapshots - nb_del_snapshots) * sizeof(QEMUSnapshotInfo)); + offset = 0; + for(i = 0; i < s->nb_snapshots; i++) { + snap_info = snap_tab + offset; + snap = s->snapshots + i; + if(snap->is_deleted != 1) { + if(snap->id_str != NULL) { + pstrcpy(snap_info->id_str, sizeof(snap_info->id_str), snap->id_str); + } + if(snap->name != NULL) { + pstrcpy(snap_info->name, sizeof(snap_info->name), snap->name); + } + snap_info->vm_state_size = snap->vm_state_size; + snap_info->date_sec = snap->date_sec; + snap_info->date_nsec = snap->date_nsec; + snap_info->vm_clock_nsec = snap->vm_clock_nsec; + + offset += 1; + } + } + *psn_tab = snap_tab; + return s->nb_snapshots - nb_del_snapshots; +} + +static int irow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { + BDRVIrowState *s = bs->opaque; + bdi->cluster_size = s->cluster_size; + 
bdi->vm_state_offset = irow_vm_state_offset(s); + return 0; +} + +static int irow_save_vmstate(BlockDriverState *bs, const uint8_t *buf, int64_t pos, int size) { + + BDRVIrowState *birows = bs->opaque; + int ret = 0; + + ret = irow_save_vmstate2(birows, buf, pos, size); + return ret; +} + +static int irow_load_vmstate(BlockDriverState *bs, uint8_t *buf, int64_t pos, int size) { + + BDRVIrowState *target_birows = NULL, *birows = bs->opaque; + int target_index, ret = 0; + + target_index = irow_find_snapshot_by_btmp(birows, birows->father_btmp_file); + if(target_index < 0) { + ret = -1; + goto end; + } + + target_birows = irow_open_previous_state(birows, target_index); + if(target_birows == NULL) { + ret = -1; + goto end; + } + + ret = irow_load_vmstate2(target_birows, buf, pos, size); + +end: + if(target_birows != NULL) { + irow_close_previous_state(target_birows); + target_birows = NULL; + } + return ret; +} + +static int irow_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix){ + BDRVIrowState *birows = bs->opaque; + char user_input[100]; + printf("current copy_on_demand state is "); + if(birows->copy_on_demand) { + printf("ON\n"); + } else { + printf("OFF\n"); + } + while(1) { + printf("do you want to change copy_on_demand state? (y/n)"); + if(scanf("%s", user_input)== EOF){ + return 1; + } + user_input[0] = tolower(user_input[0]); + if(user_input[0] == 'y') { + birows->copy_on_demand = birows->copy_on_demand ? 
0 : 1; + irow_update_meta(birows, NULL, 1); + break; + } + if(user_input[0] == 'n') + break; + } + return 0; +} + +static int64_t irow_get_length(BlockDriverState *bs) { + BDRVIrowState *birows = bs->opaque; + int64_t ret; + ret = birows->disk_size; + return ret; +} + +static QEMUOptionParameter irow_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = OPT_SIZE, + .help = "irow cluster size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, + { + .name = "copy_on_demand", + .type = OPT_FLAG, + .help = "copy clusters to current irvd when needed" + }, + { NULL } +}; + +static BlockDriver bdrv_irow = { + .format_name = "irow", + .instance_size = sizeof(BDRVIrowState), + .bdrv_probe = irow_probe, + .bdrv_open = irow_open, + .bdrv_read = irow_read, + .bdrv_write = irow_write, + .bdrv_close = irow_close, + .bdrv_create = irow_create, + + .bdrv_co_flush_to_disk = irow_flush, + + .bdrv_aio_readv = irow_aio_readv, + .bdrv_aio_writev = irow_aio_writev, + .bdrv_aio_flush = irow_aio_flush, + + .bdrv_snapshot_create = irow_snapshot_create, + .bdrv_snapshot_goto = irow_snapshot_goto, + .bdrv_snapshot_delete = irow_snapshot_delete, + .bdrv_snapshot_list = irow_snapshot_list, + + .bdrv_get_info = irow_get_info, + .bdrv_getlength = irow_get_length, + + .bdrv_save_vmstate = irow_save_vmstate, + .bdrv_load_vmstate = irow_load_vmstate, + + .create_options = irow_create_options, + .bdrv_check = irow_check, +}; + +static void bdrv_irow_init(void) +{ + bdrv_register(&bdrv_irow); +} + +block_init(bdrv_irow_init); diff --git a/block/irow.h b/block/irow.h new file mode 100644 index 0000000..131b741 --- /dev/null +++ b/block/irow.h @@ -0,0 +1,135 @@ +/* IROW(Improved ROW)Disk Format + * */ +/* + * iRow (imporved Redirect-on-Write) is a disk format supporting high-efficiency VM disk snapshot. 
+ * iROW uses a bitmap to reduce the amount of metadata, so that both the performance of the key VM disk
+ * snapshot operations and the VM disk I/O performance are improved at the same time.
+ *
+ * The iROW VM disk image consists of a meta file and several snapshots.
+ *
+ * A snapshot consists of 2 files: a bitmap file (btmp file) and a VM disk data file (irvd file).
+ * The current state of the iROW VM disk also occupies a snapshot.
+ *
+ * The meta file consists of the meta header and the snapshot information. The meta header is used to
+ * store basic information about the VM disk image. The snapshot information sequentially stores every
+ * snapshot's name, id and other related information.
+ *
+ * The btmp file consists of a bitmap and the VM state data. The bitmap is used to indicate whether the
+ * clusters exist in the corresponding irvd file. Each cluster in the VM disk image is mapped to a bit in the bitmap.
+ *
+ * The irvd file is used to store the actual data of the VM disk image. The smallest unit of storage is a cluster.
+ * iROW does not decide the address of the data clusters. It just writes the clusters to the same VM disk image
+ * addresses as the virtual addresses of the clusters. Because the host machine's file system supports sparse files,
+ * the iROW VM disk image size also grows gradually with the actual disk usage.
+ *
+ */
+/* Magic numbers: the ASCII bytes "IROW" and "SNAP" packed into 32 bits, first letter in the top byte. */
+#define IROW_MAGIC (('I' << 24) | ('R' << 16) | ('O' << 8) | 'W')
+#define IROW_VERSION 1
+
+#define IROW_SNAPHEADER_MAGIC (('S' << 24) | ('N' << 16) | ('A' << 8) | 'P')
+
+/* Bounds for cluster_bits and the size of the fixed file-name fields below. */
+#define MIN_CLUSTER_BITS 9
+#define MAX_CLUSTER_BITS 21
+#define MAX_FILE_NAME_LENGTH 256
+
+/* Operation codes. NOTE(review): presumably select the I/O path in irow.c - confirm against the callers. */
+#define IROW_READ 1
+#define IROW_WRITE 2
+#define IROW_AIO_READ 3
+#define IROW_AIO_WRITE 4
+
+
+/*
+ * Packed record holding the basic image information (magic, version, cluster
+ * geometry, disk size, current btmp file and backing file names).
+ * NOTE(review): presumably the on-disk meta header of the meta file - confirm.
+ */
+typedef struct __attribute__((packed)) IRowMeta {
+    uint32_t magic;
+    uint32_t version;
+    uint32_t copy_on_demand;
+    uint32_t nb_snapshots;
+    uint32_t cluster_size;
+    uint32_t cluster_bits;
+    uint32_t sectors_per_cluster;
+    uint64_t total_clusters;
+    uint64_t disk_size;
+    char current_btmp[MAX_FILE_NAME_LENGTH];
+    char backing_file[MAX_FILE_NAME_LENGTH];
+} IRowMeta;
+
+/*
+ * Packed per-snapshot record with fixed-size string fields.
+ * NOTE(review): presumably the on-disk form of one snapshot entry - confirm.
+ */
+typedef struct __attribute__((packed)) IRowSnapshotHeader {
+    uint32_t snap_magic;
+    char id_str[128];
+    char name[256];
+    char btmp_file[MAX_FILE_NAME_LENGTH];
+    char irvd_file[MAX_FILE_NAME_LENGTH];
+    char father_btmp_file[MAX_FILE_NAME_LENGTH];
+    uint32_t vm_state_size;
+    uint32_t date_sec;
+    uint32_t date_nsec;
+    uint64_t vm_clock_nsec;
+    uint32_t nb_children;
+    uint32_t is_deleted;
+} IRowSnapshotHeader;
+
+/*
+ * Snapshot description carrying the same fields as IRowSnapshotHeader (minus
+ * the magic), with the strings held through pointers instead of fixed arrays.
+ */
+typedef struct IRowSnapshot {
+    char *id_str;
+    char *name;
+    char *btmp_file;
+    char *irvd_file;
+    char *father_btmp_file;
+    uint32_t vm_state_size;
+    uint32_t date_sec;
+    uint32_t date_nsec;
+    uint64_t vm_clock_nsec;
+    uint32_t nb_children;
+    uint32_t is_deleted;
+} IRowSnapshot;
+
+/* Parameters and file names gathered for creating an image or a snapshot state. */
+typedef struct IRowCreateState {
+    uint64_t disk_size;
+    uint32_t cluster_size;
+    uint32_t cluster_bits;
+    uint32_t copy_on_demand;
+    char *meta_file;
+    char *father_btmp_file;
+    char *btmp_file;
+    char *irvd_file;
+    char *time_value;
+    char *backing_file;
+} IRowCreateState;
+
+/* A byte buffer paired with a cluster number. */
+typedef struct ClusterCache {
+    uint8_t *cache;
+    int64_t cluster_num;
+} ClusterCache;
+
+/*
+ * Driver state of an opened irow image: the three underlying block devices
+ * (meta, btmp, irvd), the image geometry, the snapshot table, the bitmap with
+ * its dirty flags, and the names of the files involved.
+ */
+typedef struct BDRVIrowState {
+    BlockDriverState *irow_meta;
+    BlockDriverState *irow_btmp;
+    BlockDriverState *irow_irvd;
+    uint64_t disk_size;
+    uint64_t bitmap_size;
+    uint32_t cluster_size;
+    uint32_t cluster_bits;
+    uint64_t total_clusters;
+    uint32_t sectors_per_cluster;
+    uint32_t nb_snapshots;
+    uint32_t vm_state_size;
+    uint32_t copy_on_demand;
+    int open_flags;
+    IRowSnapshot *snapshots;
+    uint32_t snapshots_is_dirty;
+    uint8_t *bitmap;
+    uint32_t bitmap_is_dirty;
+    uint32_t vmstate_is_saved;
+    uint32_t complete_image;
+    char *meta_file;
+    char *current_btmp_file;
+    char *father_btmp_file;
+    char *opened_btmp_file;
+    char *irvd_file;
+} BDRVIrowState;
+
+/* Two byte buffers, buf and read_from_father. NOTE(review): their exact roles are set by irow.c - confirm there. */
+typedef struct ClusterBuffer {
+    uint8_t *buf;
+    uint8_t *read_from_father;
+} ClusterBuffer;
+
+/*
+ * Replacement lists are parenthesized so the macros expand as a single
+ * operand inside larger expressions (for example x / MAX_MERGE_BUFFER).
+ */
+#define IROW_SNAPSHOT_OFFSET (sizeof(IRowMeta))
+#define MAX_MERGE_BUFFER (16 * 1024 * 1024)