diff mbox

[2/4] block: add a helper to change writeback mode on the fly

Message ID 20110315141132.GB30710@lst.de
State New
Headers show

Commit Message

Christoph Hellwig March 15, 2011, 2:11 p.m. UTC
Add a new bdrv_change_cache that can set/clear the writeback flag
at runtime by stopping all I/O and closing/reopening the image file.

All code is based on a patch from Prerna Saxena <prerna@linux.vnet.ibm.com>
with minimal refactoring.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Comments

Stefan Hajnoczi March 16, 2011, 9:49 a.m. UTC | #1
On Tue, Mar 15, 2011 at 2:11 PM, Christoph Hellwig <hch@lst.de> wrote:
> Add a new bdrv_change_cache that can set/clear the writeback flag
> at runtime by stopping all I/O and closing/reopening the image file.
>
> All code is based on a patch from Prerna Saxena <prerna@linux.vnet.ibm.com>
> with minimal refactoring.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Index: qemu/block.c
> ===================================================================
> --- qemu.orig/block.c   2011-03-15 11:47:31.285634626 +0100
> +++ qemu/block.c        2011-03-15 14:57:03.680633093 +0100
> @@ -441,6 +441,8 @@ static int bdrv_open_common(BlockDriverS
>
>     if (flags & BDRV_O_CACHE_WB)
>         bs->enable_write_cache = 1;
> +    else
> +        bs->enable_write_cache = 0;
>
>     /*
>      * Clear flags that are internal to the block layer before opening the
> @@ -651,6 +653,44 @@ unlink_and_fail:
>     return ret;
>  }
>
> +static int bdrv_reopen(BlockDriverState *bs, int bdrv_flags)
> +{
> +    BlockDriver *drv = bs->drv;
> +    int ret;
> +
> +    if (bdrv_flags == bs->open_flags) {
> +        return 0;
> +    }
> +
> +    /* Quiesce IO for the given block device */
> +    qemu_aio_flush();
> +    bdrv_flush(bs);
> +
> +    bdrv_close(bs);
> +    ret = bdrv_open(bs, bs->filename, bdrv_flags, drv);

This will fail for -snapshot disks since the on-disk file is deleted.

In general it would be more robust to keep the original file
descriptor around in case the file cannot be opened with the new
flags.

Stefan
Kevin Wolf March 16, 2011, 9:59 a.m. UTC | #2
Am 16.03.2011 10:49, schrieb Stefan Hajnoczi:
> On Tue, Mar 15, 2011 at 2:11 PM, Christoph Hellwig <hch@lst.de> wrote:
>> Add a new bdrv_change_cache that can set/clear the writeback flag
>> at runtime by stopping all I/O and closing/reopening the image file.
>>
>> All code is based on a patch from Prerna Saxena <prerna@linux.vnet.ibm.com>
>> with minimal refactoring.
>>
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>>
>> Index: qemu/block.c
>> ===================================================================
>> --- qemu.orig/block.c   2011-03-15 11:47:31.285634626 +0100
>> +++ qemu/block.c        2011-03-15 14:57:03.680633093 +0100
>> @@ -441,6 +441,8 @@ static int bdrv_open_common(BlockDriverS
>>
>>     if (flags & BDRV_O_CACHE_WB)
>>         bs->enable_write_cache = 1;
>> +    else
>> +        bs->enable_write_cache = 0;
>>
>>     /*
>>      * Clear flags that are internal to the block layer before opening the
>> @@ -651,6 +653,44 @@ unlink_and_fail:
>>     return ret;
>>  }
>>
>> +static int bdrv_reopen(BlockDriverState *bs, int bdrv_flags)
>> +{
>> +    BlockDriver *drv = bs->drv;
>> +    int ret;
>> +
>> +    if (bdrv_flags == bs->open_flags) {
>> +        return 0;
>> +    }
>> +
>> +    /* Quiesce IO for the given block device */
>> +    qemu_aio_flush();
>> +    bdrv_flush(bs);
>> +
>> +    bdrv_close(bs);
>> +    ret = bdrv_open(bs, bs->filename, bdrv_flags, drv);
> 
> This will fail for -snapshot disks since the on-disk file is deleted.
> 
> In general it would be more robust to keep the original file
> descriptor around in case the file cannot be opened with the new
> flags.

Looks like we'll need a bdrv_reopen for protocols?

Kevin
Daniel P. Berrangé March 17, 2011, 2:44 p.m. UTC | #3
On Tue, Mar 15, 2011 at 03:11:32PM +0100, Christoph Hellwig wrote:
> Add a new bdrv_change_cache that can set/clear the writeback flag
> at runtime by stopping all I/O and closing/reopening the image file.
> 
> All code is based on a patch from Prerna Saxena <prerna@linux.vnet.ibm.com>
> with minimal refactoring.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
>  
> +static int bdrv_reopen(BlockDriverState *bs, int bdrv_flags)
> +{
> +    BlockDriver *drv = bs->drv;
> +    int ret;
> +
> +    if (bdrv_flags == bs->open_flags) {
> +        return 0;
> +    }
> +
> +    /* Quiesce IO for the given block device */
> +    qemu_aio_flush();
> +    bdrv_flush(bs);
> +
> +    bdrv_close(bs);
> +    ret = bdrv_open(bs, bs->filename, bdrv_flags, drv);
> +
> +    /*
> +     * A failed attempt to reopen the image file must lead to 'abort()'
> +     */
> +    if (ret != 0) {
> +        abort();
> +    }
> +
> +    return ret;
> +}



> +
> +int bdrv_change_cache(BlockDriverState *bs, bool enable)
> +{
> +    int bdrv_flags = 0;
> +
> +    bdrv_flags = bs->open_flags & ~BDRV_O_CACHE_WB;
> +    if (enable) {
> +        bdrv_flags |= BDRV_O_CACHE_WB;
> +    }
> +
> +    return bdrv_reopen(bs, bdrv_flags);
> +}

Is there any way we can manage todo this *without* closing and
re-opening the file descriptor ?

One of the things we're considering for the future is to enable
QEMU to be passed open FD(s) for its drives, from the management
system, instead of having QEMU open the files itself. This will
make it possible to have strong, per-VM selinux isolation for
disks stored on NFS. It will also make it easier to let us run
QEMU inside a private filesystem namespace (CLONE_NEWNS) and not
have to expose any files in that namespace except the QEMU binary
& a few device nodes (particularly useful for hotplug since we
won't know what files need to be visible inside the private namespace
ahead of time).

If QEMU expects to close+reopen any of its disks at any time the
guest requests, this will complicate life somewhat

Regards,
Daniel
Kevin Wolf March 17, 2011, 3:11 p.m. UTC | #4
Am 17.03.2011 15:44, schrieb Daniel P. Berrange:
> On Tue, Mar 15, 2011 at 03:11:32PM +0100, Christoph Hellwig wrote:
>> Add a new bdrv_change_cache that can set/clear the writeback flag
>> at runtime by stopping all I/O and closing/reopening the image file.
>>
>> All code is based on a patch from Prerna Saxena <prerna@linux.vnet.ibm.com>
>> with minimal refactoring.
>>
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>>
>>  
>> +static int bdrv_reopen(BlockDriverState *bs, int bdrv_flags)
>> +{
>> +    BlockDriver *drv = bs->drv;
>> +    int ret;
>> +
>> +    if (bdrv_flags == bs->open_flags) {
>> +        return 0;
>> +    }
>> +
>> +    /* Quiesce IO for the given block device */
>> +    qemu_aio_flush();
>> +    bdrv_flush(bs);
>> +
>> +    bdrv_close(bs);
>> +    ret = bdrv_open(bs, bs->filename, bdrv_flags, drv);
>> +
>> +    /*
>> +     * A failed attempt to reopen the image file must lead to 'abort()'
>> +     */
>> +    if (ret != 0) {
>> +        abort();
>> +    }
>> +
>> +    return ret;
>> +}
> 
> 
> 
>> +
>> +int bdrv_change_cache(BlockDriverState *bs, bool enable)
>> +{
>> +    int bdrv_flags = 0;
>> +
>> +    bdrv_flags = bs->open_flags & ~BDRV_O_CACHE_WB;
>> +    if (enable) {
>> +        bdrv_flags |= BDRV_O_CACHE_WB;
>> +    }
>> +
>> +    return bdrv_reopen(bs, bdrv_flags);
>> +}
> 
> Is there any way we can manage todo this *without* closing and
> re-opening the file descriptor ?
> 
> One of the things we're considering for the future is to enable
> QEMU to be passed open FD(s) for its drives, from the management
> system, instead of having QEMU open the files itself. 

What about cache mode, read-only flags etc. that are set with the right
flags during the open? Must qemu just assume that the management has
done the right thing?

And what about things like backing files or other information that
depends on the content of the image file?

> If QEMU expects to close+reopen any of its disks at any time the
> guest requests, this will complicate life somewhat

It does expect this. For example for making backing files temporarily
read-write during a 'commit' monitor command, we already reopen the
image. (Let's hope nobody uses -snapshot, live snapshots and commit, he
would lose his disk...)

Kevin
Stefan Hajnoczi March 17, 2011, 4:44 p.m. UTC | #5
On Thu, Mar 17, 2011 at 3:11 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 17.03.2011 15:44, schrieb Daniel P. Berrange:
>> On Tue, Mar 15, 2011 at 03:11:32PM +0100, Christoph Hellwig wrote:
>>> Add a new bdrv_change_cache that can set/clear the writeback flag
>>> at runtime by stopping all I/O and closing/reopening the image file.
>>>
>>> All code is based on a patch from Prerna Saxena <prerna@linux.vnet.ibm.com>
>>> with minimal refactoring.
>>>
>>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>>>
>>>
>>> +static int bdrv_reopen(BlockDriverState *bs, int bdrv_flags)
>>> +{
>>> +    BlockDriver *drv = bs->drv;
>>> +    int ret;
>>> +
>>> +    if (bdrv_flags == bs->open_flags) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    /* Quiesce IO for the given block device */
>>> +    qemu_aio_flush();
>>> +    bdrv_flush(bs);
>>> +
>>> +    bdrv_close(bs);
>>> +    ret = bdrv_open(bs, bs->filename, bdrv_flags, drv);
>>> +
>>> +    /*
>>> +     * A failed attempt to reopen the image file must lead to 'abort()'
>>> +     */
>>> +    if (ret != 0) {
>>> +        abort();
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>
>>
>>
>>> +
>>> +int bdrv_change_cache(BlockDriverState *bs, bool enable)
>>> +{
>>> +    int bdrv_flags = 0;
>>> +
>>> +    bdrv_flags = bs->open_flags & ~BDRV_O_CACHE_WB;
>>> +    if (enable) {
>>> +        bdrv_flags |= BDRV_O_CACHE_WB;
>>> +    }
>>> +
>>> +    return bdrv_reopen(bs, bdrv_flags);
>>> +}
>>
>> Is there any way we can manage todo this *without* closing and
>> re-opening the file descriptor ?
>>
>> One of the things we're considering for the future is to enable
>> QEMU to be passed open FD(s) for its drives, from the management
>> system, instead of having QEMU open the files itself.
>
> What about cache mode, read-only flags etc. that are set with the right
> flags during the open? Must qemu just assume that the management has
> done the right thing?
>
> And what about things like backing files or other information that
> depends on the content of the image file?
>
>> If QEMU expects to close+reopen any of its disks at any time the
>> guest requests, this will complicate life somewhat
>
> It does expect this. For example for making backing files temporarily
> read-write during a 'commit' monitor command, we already reopen the
> image. (Let's hope nobody uses -snapshot, live snapshots and commit, he
> would lose his disk...)

I think we need to use the most robust solution possible.  We don't
want to get into a situation that can be avoided.  Especially in cases
where human errors can (==will) be made.

I suggested using /proc/$pid/fd to reopen an existing file descriptor.
 That interface is only available on Linux and possibly some of
{Solaris, *BSD, OSX}.  So there needs to be a fallback, but in this
situation it feels right to take advantage of whatever means are
provided by the host OS to make safe transitions between image files.

Stefan
Blue Swirl March 19, 2011, 8:28 a.m. UTC | #6
On Thu, Mar 17, 2011 at 6:44 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Thu, Mar 17, 2011 at 3:11 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> Am 17.03.2011 15:44, schrieb Daniel P. Berrange:
>>> On Tue, Mar 15, 2011 at 03:11:32PM +0100, Christoph Hellwig wrote:
>>>> Add a new bdrv_change_cache that can set/clear the writeback flag
>>>> at runtime by stopping all I/O and closing/reopening the image file.
>>>>
>>>> All code is based on a patch from Prerna Saxena <prerna@linux.vnet.ibm.com>
>>>> with minimal refactoring.
>>>>
>>>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>>>>
>>>>
>>>> +static int bdrv_reopen(BlockDriverState *bs, int bdrv_flags)
>>>> +{
>>>> +    BlockDriver *drv = bs->drv;
>>>> +    int ret;
>>>> +
>>>> +    if (bdrv_flags == bs->open_flags) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    /* Quiesce IO for the given block device */
>>>> +    qemu_aio_flush();
>>>> +    bdrv_flush(bs);
>>>> +
>>>> +    bdrv_close(bs);
>>>> +    ret = bdrv_open(bs, bs->filename, bdrv_flags, drv);
>>>> +
>>>> +    /*
>>>> +     * A failed attempt to reopen the image file must lead to 'abort()'
>>>> +     */
>>>> +    if (ret != 0) {
>>>> +        abort();
>>>> +    }
>>>> +
>>>> +    return ret;
>>>> +}
>>>
>>>
>>>
>>>> +
>>>> +int bdrv_change_cache(BlockDriverState *bs, bool enable)
>>>> +{
>>>> +    int bdrv_flags = 0;
>>>> +
>>>> +    bdrv_flags = bs->open_flags & ~BDRV_O_CACHE_WB;
>>>> +    if (enable) {
>>>> +        bdrv_flags |= BDRV_O_CACHE_WB;
>>>> +    }
>>>> +
>>>> +    return bdrv_reopen(bs, bdrv_flags);
>>>> +}
>>>
>>> Is there any way we can manage todo this *without* closing and
>>> re-opening the file descriptor ?
>>>
>>> One of the things we're considering for the future is to enable
>>> QEMU to be passed open FD(s) for its drives, from the management
>>> system, instead of having QEMU open the files itself.
>>
>> What about cache mode, read-only flags etc. that are set with the right
>> flags during the open? Must qemu just assume that the management has
>> done the right thing?
>>
>> And what about things like backing files or other information that
>> depends on the content of the image file?
>>
>>> If QEMU expects to close+reopen any of its disks at any time the
>>> guest requests, this will complicate life somewhat
>>
>> It does expect this. For example for making backing files temporarily
>> read-write during a 'commit' monitor command, we already reopen the
>> image. (Let's hope nobody uses -snapshot, live snapshots and commit, he
>> would lose his disk...)
>
> I think we need to use the most robust solution possible.  We don't
> want to get into a situation that can be avoided.  Especially in cases
> where human errors can (==will) be made.
>
> I suggested using /proc/$pid/fd to reopen an existing file descriptor.
>  That interface is only available on Linux and possibly some of
> {Solaris, *BSD, OSX}.  So there needs to be a fallback, but in this
> situation it feels right to take advantage of whatever means are
> provided by the host OS to make safe transitions between image files.

No need for such hacks. There could be an explicit reopen method,
which could be implemented differently for each backing file type
(file descriptor, file, socket, CDROM devices etc).
diff mbox

Patch

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2011-03-15 11:47:31.285634626 +0100
+++ qemu/block.c	2011-03-15 14:57:03.680633093 +0100
@@ -441,6 +441,8 @@  static int bdrv_open_common(BlockDriverS
 
     if (flags & BDRV_O_CACHE_WB)
         bs->enable_write_cache = 1;
+    else
+        bs->enable_write_cache = 0;
 
     /*
      * Clear flags that are internal to the block layer before opening the
@@ -651,6 +653,44 @@  unlink_and_fail:
     return ret;
 }
 
+static int bdrv_reopen(BlockDriverState *bs, int bdrv_flags)
+{
+    BlockDriver *drv = bs->drv;
+    int ret;
+
+    if (bdrv_flags == bs->open_flags) {
+        return 0;
+    }
+
+    /* Quiesce IO for the given block device */
+    qemu_aio_flush();
+    bdrv_flush(bs);
+
+    bdrv_close(bs);
+    ret = bdrv_open(bs, bs->filename, bdrv_flags, drv);
+
+    /*
+     * A failed attempt to reopen the image file must lead to 'abort()'
+     */
+    if (ret != 0) {
+        abort();
+    }
+
+    return ret;
+}
+
+int bdrv_change_cache(BlockDriverState *bs, bool enable)
+{
+    int bdrv_flags = 0;
+
+    bdrv_flags = bs->open_flags & ~BDRV_O_CACHE_WB;
+    if (enable) {
+        bdrv_flags |= BDRV_O_CACHE_WB;
+    }
+
+    return bdrv_reopen(bs, bdrv_flags);
+}
+
 void bdrv_close(BlockDriverState *bs)
 {
     if (bs->drv) {
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h	2011-03-15 11:47:18.664136441 +0100
+++ qemu/block.h	2011-03-15 11:47:31.813634525 +0100
@@ -87,6 +87,7 @@  int bdrv_pwrite_sync(BlockDriverState *b
 int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
     const uint8_t *buf, int nb_sectors);
 int bdrv_truncate(BlockDriverState *bs, int64_t offset);
+int bdrv_change_cache(BlockDriverState *bs, bool enable);
 int64_t bdrv_getlength(BlockDriverState *bs);
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs);