From patchwork Thu Nov 7 13:12:36 2013 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Stefan Hajnoczi X-Patchwork-Id: 289338 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [IPv6:2001:4830:134:3::11]) (using TLSv1 with cipher AES256-SHA (256/256 bits)) (Client did not present a certificate) by ozlabs.org (Postfix) with ESMTPS id C9CCC2C009A for ; Fri, 8 Nov 2013 00:45:21 +1100 (EST) Received: from localhost ([::1]:40383 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1VePtg-0000NM-1O for incoming@patchwork.ozlabs.org; Thu, 07 Nov 2013 08:45:20 -0500 Received: from eggs.gnu.org ([2001:4830:134:3::10]:58453) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1VePtC-0000H1-Jt for qemu-devel@nongnu.org; Thu, 07 Nov 2013 08:44:56 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1VePt4-0007qR-U9 for qemu-devel@nongnu.org; Thu, 07 Nov 2013 08:44:50 -0500 Received: from mx1.redhat.com ([209.132.183.28]:24078) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1VePt4-0007k8-HM for qemu-devel@nongnu.org; Thu, 07 Nov 2013 08:44:42 -0500 Received: from int-mx12.intmail.prod.int.phx2.redhat.com (int-mx12.intmail.prod.int.phx2.redhat.com [10.5.11.25]) by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id rA7DE0A3021525 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK); Thu, 7 Nov 2013 08:14:00 -0500 Received: from localhost (ovpn-112-18.ams2.redhat.com [10.36.112.18]) by int-mx12.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id rA7DDxpd029437; Thu, 7 Nov 2013 08:13:59 -0500 From: Stefan Hajnoczi To: Date: Thu, 7 Nov 2013 14:12:36 +0100 Message-Id: <1383829964-32364-29-git-send-email-stefanha@redhat.com> In-Reply-To: <1383829964-32364-1-git-send-email-stefanha@redhat.com> References: <1383829964-32364-1-git-send-email-stefanha@redhat.com> X-Scanned-By: MIMEDefang 2.68 on 10.5.11.25 X-detected-operating-system: by eggs.gnu.org: GNU/Linux 3.x X-Received-From: 209.132.183.28 Cc: Jeff Cody , Stefan Hajnoczi , Anthony Liguori Subject: [Qemu-devel] [PULL 28/36] block: vhdx write support X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org From: Jeff Cody This adds support for writing to VHDX image files, using coroutines. Writes into the BAT table goes through the VHDX log. Currently, BAT table writes occur when expanding a dynamic VHDX file, and allocating a new BAT entry. Signed-off-by: Jeff Cody Signed-off-by: Stefan Hajnoczi --- block/vhdx.c | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- block/vhdx.h | 2 +- 2 files changed, 209 insertions(+), 5 deletions(-) diff --git a/block/vhdx.c b/block/vhdx.c index 574ac4c..050f071 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -914,7 +914,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, if (payblocks--) { /* payload bat entries */ if ((s->bat[i] & VHDX_BAT_STATE_BIT_MASK) == - PAYLOAD_BLOCK_FULL_PRESENT) { + PAYLOAD_BLOCK_FULLY_PRESENT) { ret = vhdx_region_check(s, s->bat[i] & VHDX_BAT_FILE_OFF_MASK, s->block_size); if (ret < 0) { @@ -935,7 +935,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, } } - /* TODO: differencing files, write */ + /* TODO: differencing files */ /* Disable migration when VHDX images are used */ error_set(&s->migration_blocker, @@ -1040,7 +1040,7 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, /* return zero */ qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail); break; - case PAYLOAD_BLOCK_FULL_PRESENT: + case PAYLOAD_BLOCK_FULLY_PRESENT: qemu_co_mutex_unlock(&s->lock); ret = bdrv_co_readv(bs->file, sinfo.file_offset >> BDRV_SECTOR_BITS, @@ -1070,7 +1070,43 @@ exit: return ret; } +/* + * Allocate a new payload block at the end of the file. + * + * Allocation will happen at 1MB alignment inside the file + * + * Returns the file offset start of the new payload block + */ +static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, + uint64_t *new_offset) +{ + *new_offset = bdrv_getlength(bs->file); + + /* per the spec, the address for a block is in units of 1MB */ + *new_offset = ROUND_UP(*new_offset, 1024 * 1024); + + return bdrv_truncate(bs->file, *new_offset + s->block_size); +} + +/* + * Update the BAT table entry with the new file offset, and the new entry + * state */ +static void vhdx_update_bat_table_entry(BlockDriverState *bs, BDRVVHDXState *s, + VHDXSectorInfo *sinfo, + uint64_t *bat_entry_le, + uint64_t *bat_offset, int state) +{ + /* The BAT entry is a uint64, with 44 bits for the file offset in units of + * 1MB, and 3 bits for the block state. */ + s->bat[sinfo->bat_idx] = ((sinfo->file_offset>>20) << + VHDX_BAT_FILE_OFF_BITS); + s->bat[sinfo->bat_idx] |= state & VHDX_BAT_STATE_BIT_MASK; + + *bat_entry_le = cpu_to_le64(s->bat[sinfo->bat_idx]); + *bat_offset = s->bat_offset + sinfo->bat_idx * sizeof(VHDXBatEntry); + +} /* Per the spec, on the first write of guest-visible data to the file the * data write guid must be updated in the header */ @@ -1087,7 +1123,175 @@ int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s) static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return -ENOTSUP; + int ret = -ENOTSUP; + BDRVVHDXState *s = bs->opaque; + VHDXSectorInfo sinfo; + uint64_t bytes_done = 0; + uint64_t bat_entry = 0; + uint64_t bat_entry_offset = 0; + QEMUIOVector hd_qiov; + struct iovec iov1 = { 0 }; + struct iovec iov2 = { 0 }; + int sectors_to_write; + int bat_state; + uint64_t bat_prior_offset = 0; + bool bat_update = false; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + while (nb_sectors > 0) { + bool use_zero_buffers = false; + bat_update = false; + if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { + /* not supported yet */ + ret = -ENOTSUP; + goto exit; + } else { + vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); + sectors_to_write = sinfo.sectors_avail; + + qemu_iovec_reset(&hd_qiov); + /* check the payload block state */ + bat_state = s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK; + switch (bat_state) { + case PAYLOAD_BLOCK_ZERO: + /* in this case, we need to preserve zero writes for + * data that is not part of this write, so we must pad + * the rest of the buffer to zeroes */ + + /* if we are on a posix system with ftruncate() that extends + * a file, then it is zero-filled for us. On Win32, the raw + * layer uses SetFilePointer and SetFileEnd, which does not + * zero fill AFAIK */ + + /* Queue another write of zero buffers if the underlying file + * does not zero-fill on file extension */ + + if (bdrv_has_zero_init(bs->file) == 0) { + use_zero_buffers = true; + + /* zero fill the front, if any */ + if (sinfo.block_offset) { + iov1.iov_len = sinfo.block_offset; + iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); + memset(iov1.iov_base, 0, iov1.iov_len); + qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, + sinfo.block_offset); + sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; + } + + /* our actual data */ + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + sinfo.bytes_avail); + + /* zero fill the back, if any */ + if ((sinfo.bytes_avail - sinfo.block_offset) < + s->block_size) { + iov2.iov_len = s->block_size - + (sinfo.bytes_avail + sinfo.block_offset); + iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); + memset(iov2.iov_base, 0, iov2.iov_len); + qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, + sinfo.block_offset); + sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; + } + } + + /* fall through */ + case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ + case PAYLOAD_BLOCK_UNMAPPED: /* fall through */ + case PAYLOAD_BLOCK_UNDEFINED: /* fall through */ + bat_prior_offset = sinfo.file_offset; + ret = vhdx_allocate_block(bs, s, &sinfo.file_offset); + if (ret < 0) { + goto exit; + } + /* once we support differencing files, this may also be + * partially present */ + /* update block state to the newly specified state */ + vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, + &bat_entry_offset, + PAYLOAD_BLOCK_FULLY_PRESENT); + bat_update = true; + /* since we just allocated a block, file_offset is the + * beginning of the payload block. It needs to be the + * write address, which includes the offset into the block */ + if (!use_zero_buffers) { + sinfo.file_offset += sinfo.block_offset; + } + /* fall through */ + case PAYLOAD_BLOCK_FULLY_PRESENT: + /* if the file offset address is in the header zone, + * there is a problem */ + if (sinfo.file_offset < (1024 * 1024)) { + ret = -EFAULT; + goto error_bat_restore; + } + + if (!use_zero_buffers) { + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + sinfo.bytes_avail); + } + /* block exists, so we can just overwrite it */ + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file, + sinfo.file_offset >> BDRV_SECTOR_BITS, + sectors_to_write, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto error_bat_restore; + } + break; + case PAYLOAD_BLOCK_PARTIALLY_PRESENT: + /* we don't yet support difference files, fall through + * to error */ + default: + ret = -EIO; + goto exit; + break; + } + + if (bat_update) { + /* this will update the BAT entry into the log journal, and + * then flush the log journal out to disk */ + ret = vhdx_log_write_and_flush(bs, s, &bat_entry, + sizeof(VHDXBatEntry), + bat_entry_offset); + if (ret < 0) { + goto exit; + } + } + + nb_sectors -= sinfo.sectors_avail; + sector_num += sinfo.sectors_avail; + bytes_done += sinfo.bytes_avail; + + } + } + + goto exit; + +error_bat_restore: + if (bat_update) { + /* keep metadata in sync, and restore the bat entry state + * if error. */ + sinfo.file_offset = bat_prior_offset; + vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, + &bat_entry_offset, bat_state); + } +exit: + qemu_vfree(iov1.iov_base); + qemu_vfree(iov2.iov_base); + qemu_co_mutex_unlock(&s->lock); + qemu_iovec_destroy(&hd_qiov); + return ret; } diff --git a/block/vhdx.h b/block/vhdx.h index 6abbf50..f331548 100644 --- a/block/vhdx.h +++ b/block/vhdx.h @@ -217,7 +217,7 @@ typedef struct QEMU_PACKED VHDXLogDataSector { #define PAYLOAD_BLOCK_UNDEFINED 1 #define PAYLOAD_BLOCK_ZERO 2 #define PAYLOAD_BLOCK_UNMAPPED 5 -#define PAYLOAD_BLOCK_FULL_PRESENT 6 +#define PAYLOAD_BLOCK_FULLY_PRESENT 6 #define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7 #define SB_BLOCK_NOT_PRESENT 0