
Add cache=volatile parameter to -drive

Message ID 1274091292-4812-1-git-send-email-agraf@suse.de
State New

Commit Message

Alexander Graf May 17, 2010, 10:14 a.m. UTC
Usually the guest can tell the host to flush data to disk. In some cases we
don't want to flush though, but try to keep everything in cache.

So let's add a new cache value to -drive that allows us to set the cache
policy to most aggressive, disabling flushes. We call this mode "volatile",
as guest data is not guaranteed to survive host crashes anymore.

This patch also adds a noop function for aio, so we can do nothing in AIO
fashion.

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v2 -> v3:

  - Add description of cache=volatile
  - Squash aio noop noop patch into this one
---
 block.c         |   28 ++++++++++++++++++++++++++++
 block.h         |    1 +
 qemu-config.c   |    2 +-
 qemu-options.hx |   13 ++++++++++---
 vl.c            |    3 +++
 5 files changed, 43 insertions(+), 4 deletions(-)

Comments

Kevin Wolf May 17, 2010, 10:42 a.m. UTC | #1
Am 17.05.2010 12:14, schrieb Alexander Graf:
> Usually the guest can tell the host to flush data to disk. In some cases we
> don't want to flush though, but try to keep everything in cache.
> 
> So let's add a new cache value to -drive that allows us to set the cache
> policy to most aggressive, disabling flushes. We call this mode "volatile",
> as guest data is not guaranteed to survive host crashes anymore.
> 
> This patch also adds a noop function for aio, so we can do nothing in AIO
> fashion.
> 
> Signed-off-by: Alexander Graf <agraf@suse.de>

Thanks, looks good to me now. Applied it to the block branch, but
depending on Anthony's opinion I might drop it again.

Kevin
Anthony Liguori May 17, 2010, 12:58 p.m. UTC | #2
On 05/17/2010 05:14 AM, Alexander Graf wrote:
> Usually the guest can tell the host to flush data to disk. In some cases we
> don't want to flush though, but try to keep everything in cache.
>
> So let's add a new cache value to -drive that allows us to set the cache
> policy to most aggressive, disabling flushes. We call this mode "volatile",
> as guest data is not guaranteed to survive host crashes anymore.
>
> This patch also adds a noop function for aio, so we can do nothing in AIO
> fashion.
>
> Signed-off-by: Alexander Graf <agraf@suse.de>
>    

I'd like to see some performance data with at least an ext3 host file 
system and an ext4 file system.

My concern is that ext3 exaggerates the cost of fsync() which will 
result in diminishing value over time for this feature as people move to 
ext4/btrfs.

Regards,

Anthony Liguori

> ---
>
> v2 ->  v3:
>
>    - Add description of cache=volatile
>    - Squash aio noop noop patch into this one
> ---
>   block.c         |   28 ++++++++++++++++++++++++++++
>   block.h         |    1 +
>   qemu-config.c   |    2 +-
>   qemu-options.hx |   13 ++++++++++---
>   vl.c            |    3 +++
>   5 files changed, 43 insertions(+), 4 deletions(-)
>
> diff --git a/block.c b/block.c
> index 48305b7..b742965 100644
> --- a/block.c
> +++ b/block.c
> @@ -50,6 +50,8 @@ static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
>           BlockDriverCompletionFunc *cb, void *opaque);
>   static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
>           BlockDriverCompletionFunc *cb, void *opaque);
> +static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
> +        BlockDriverCompletionFunc *cb, void *opaque);
>   static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>                           uint8_t *buf, int nb_sectors);
>   static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
> @@ -1306,6 +1308,10 @@ const char *bdrv_get_device_name(BlockDriverState *bs)
>
>   void bdrv_flush(BlockDriverState *bs)
>   {
> +    if (bs->open_flags & BDRV_O_NO_FLUSH) {
> +        return;
> +    }
> +
>       if (bs->drv && bs->drv->bdrv_flush)
>           bs->drv->bdrv_flush(bs);
>   }
> @@ -2082,6 +2088,10 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
>   {
>       BlockDriver *drv = bs->drv;
>
> +    if (bs->open_flags & BDRV_O_NO_FLUSH) {
> +        return bdrv_aio_noop_em(bs, cb, opaque);
> +    }
> +
>       if (!drv)
>           return NULL;
>       return drv->bdrv_aio_flush(bs, cb, opaque);
> @@ -2196,6 +2206,24 @@ static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
>       return &acb->common;
>   }
>
> +static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    BlockDriverAIOCBSync *acb;
> +
> +    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
> +    acb->is_write = 1; /* don't bounce in the completion handler */
> +    acb->qiov = NULL;
> +    acb->bounce = NULL;
> +    acb->ret = 0;
> +
> +    if (!acb->bh)
> +        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
> +
> +    qemu_bh_schedule(acb->bh);
> +    return &acb->common;
> +}
> +
>   /**************************************************************/
>   /* sync block device emulation */
>
> diff --git a/block.h b/block.h
> index f87d24e..8032b6b 100644
> --- a/block.h
> +++ b/block.h
> @@ -33,6 +33,7 @@ typedef struct QEMUSnapshotInfo {
>   #define BDRV_O_CACHE_WB    0x0040 /* use write-back caching */
>   #define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool */
>   #define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
> +#define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
>
>   #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB)
>
> diff --git a/qemu-config.c b/qemu-config.c
> index d500885..bf3d493 100644
> --- a/qemu-config.c
> +++ b/qemu-config.c
> @@ -54,7 +54,7 @@ QemuOptsList qemu_drive_opts = {
>           },{
>               .name = "cache",
>               .type = QEMU_OPT_STRING,
> -            .help = "host cache usage (none, writeback, writethrough)",
> +            .help = "host cache usage (none, writeback, writethrough, volatile)",
>           },{
>               .name = "aio",
>               .type = QEMU_OPT_STRING,
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 12f6b51..6dedb4a 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -118,8 +118,9 @@ ETEXI
>   DEF("drive", HAS_ARG, QEMU_OPTION_drive,
>       "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
>       "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
> -    "       [,cache=writethrough|writeback|none][,format=f][,serial=s]\n"
> -    "       [,addr=A][,id=name][,aio=threads|native][,readonly=on|off]\n"
> +    "       [,cache=writethrough|writeback|volatile|none][,format=f]\n"
> +    "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
> +    "       [,readonly=on|off]\n"
>       "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
>   STEXI
>   @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
> @@ -148,7 +149,7 @@ These options have the same definition as they have in @option{-hdachs}.
>   @item snapshot=@var{snapshot}
>   @var{snapshot} is "on" or "off" and allows to enable snapshot for given drive (see @option{-snapshot}).
>   @item cache=@var{cache}
> -@var{cache} is "none", "writeback", or "writethrough" and controls how the host cache is used to access block data.
> +@var{cache} is "none", "writeback", "volatile", or "writethrough" and controls how the host cache is used to access block data.
>   @item aio=@var{aio}
>   @var{aio} is "threads", or "native" and selects between pthread based disk I/O and native Linux AIO.
>   @item format=@var{format}
> @@ -180,6 +181,12 @@ Some block drivers perform badly with @option{cache=writethrough}, most notably,
>   qcow2.  If performance is more important than correctness,
>   @option{cache=writeback} should be used with qcow2.
>
> +In case you don't care about data integrity over host failures, use
> +cache=volatile. This option tells qemu that it never needs to write any data
> +to the disk but can instead keep things in cache. If anything goes wrong,
> +like your host losing power or the disk storage getting disconnected
> +accidentally, your image will most probably be rendered unusable.
> +
>   Instead of @option{-cdrom} you can use:
>   @example
>   qemu -drive file=file,index=2,media=cdrom
> diff --git a/vl.c b/vl.c
> index 85bcc84..c8abce6 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -913,6 +913,9 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
>               bdrv_flags |= BDRV_O_NOCACHE;
>           } else if (!strcmp(buf, "writeback")) {
>               bdrv_flags |= BDRV_O_CACHE_WB;
> +        } else if (!strcmp(buf, "volatile")) {
> +            bdrv_flags |= BDRV_O_CACHE_WB;
> +            bdrv_flags |= BDRV_O_NO_FLUSH;
>           } else if (!strcmp(buf, "writethrough")) {
>               /* this is the default */
>           } else {
>
Alexander Graf May 17, 2010, 1:02 p.m. UTC | #3
On 17.05.2010, at 14:58, Anthony Liguori wrote:

> On 05/17/2010 05:14 AM, Alexander Graf wrote:
>> Usually the guest can tell the host to flush data to disk. In some cases we
>> don't want to flush though, but try to keep everything in cache.
>> 
>> So let's add a new cache value to -drive that allows us to set the cache
>> policy to most aggressive, disabling flushes. We call this mode "volatile",
>> as guest data is not guaranteed to survive host crashes anymore.
>> 
>> This patch also adds a noop function for aio, so we can do nothing in AIO
>> fashion.
>> 
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>   
> 
> I'd like to see some performance data with at least an ext3 host file system and an ext4 file system.

For ext3 data, please see my cover-letter from v2:

http://www.mail-archive.com/qemu-devel@nongnu.org/msg31714.html

> My concern is that ext3 exaggerates the cost of fsync() which will result in diminishing value over time for this feature as people move to ext4/btrfs.

There will be ext3 file systems for years out. Just because people can use better and faster file systems doesn't mean they do. And I'm sure they can't always choose. If anything, I can try and see what the numbers look like for xfs.


Alex
Anthony Liguori May 17, 2010, 1:09 p.m. UTC | #4
On 05/17/2010 08:02 AM, Alexander Graf wrote:
>> My concern is that ext3 exaggerates the cost of fsync() which will result in diminishing value over time for this feature as people move to ext4/btrfs.
>>      
> There will be ext3 file systems for years out. Just because people can use better and faster file systems doesn't mean they do. And I'm sure they can't always choose. If anything, I can try and see what the numbers look like for xfs.
>    

But ext3 with barrier=1 is pretty uncommon in practice.  Another data 
point would be an ext3 host file system with barrier=0.

Regards,

Anthony Liguori

>
> Alex
>
>
Alexander Graf May 17, 2010, 1:17 p.m. UTC | #5
On 17.05.2010, at 15:09, Anthony Liguori wrote:

> On 05/17/2010 08:02 AM, Alexander Graf wrote:
>>> My concern is that ext3 exaggerates the cost of fsync() which will result in diminishing value over time for this feature as people move to ext4/btrfs.
>>>     
>> There will be ext3 file systems for years out. Just because people can use better and faster file systems doesn't mean they do. And I'm sure they can't always choose. If anything, I can try and see what the numbers look like for xfs.
>>   
> 
> But ext3 with barrier=1 is pretty uncommon in practice.  Another data point would be an ext3 host file system with barrier=0.

Who defines what is common and what not? To me, the SLES11 default is common. In fact, the numbers in the referred mail were done on an 11.1 system.


Alex
Anthony Liguori May 17, 2010, 1:26 p.m. UTC | #6
On 05/17/2010 08:17 AM, Alexander Graf wrote:
> On 17.05.2010, at 15:09, Anthony Liguori wrote:
>
>    
>> On 05/17/2010 08:02 AM, Alexander Graf wrote:
>>      
>>>> My concern is that ext3 exaggerates the cost of fsync() which will result in diminishing value over time for this feature as people move to ext4/btrfs.
>>>>
>>>>          
>>> There will be ext3 file systems for years out. Just because people can use better and faster file systems doesn't mean they do. And I'm sure they can't always choose. If anything, I can try and see what the numbers look like for xfs.
>>>
>>>        
>> But ext3 with barrier=1 is pretty uncommon in practice.  Another data point would be an ext3 host file system with barrier=0.
>>      
> Who defines what is common and what not? To me, the SLES11 default is common. In fact, the numbers in the referred mail were done on an 11.1 system.
>    

But it wasn't the SLES10 default so there's a smaller window of systems 
that are going to be configured this way.  But this is orthogonal to the 
main point.  Let's quantify how important this detail is before we 
discuss the affected user base.

Regards,

Anthony Liguori

> Alex
>
>
Alexander Graf May 17, 2010, 2:04 p.m. UTC | #7
Anthony Liguori wrote:
> On 05/17/2010 08:17 AM, Alexander Graf wrote:
>> On 17.05.2010, at 15:09, Anthony Liguori wrote:
>>
>>   
>>> On 05/17/2010 08:02 AM, Alexander Graf wrote:
>>>     
>>>>> My concern is that ext3 exaggerates the cost of fsync() which will
>>>>> result in diminishing value over time for this feature as people
>>>>> move to ext4/btrfs.
>>>>>
>>>>>          
>>>> There will be ext3 file systems for years out. Just because people
>>>> can use better and faster file systems doesn't mean they do. And
>>>> I'm sure they can't always choose. If anything, I can try and see
>>>> what the numbers look like for xfs.
>>>>
>>>>        
>>> But ext3 with barrier=1 is pretty uncommon in practice.  Another
>>> data point would be an ext3 host file system with barrier=0.
>>>      
>> Who defines what is common and what not? To me, the SLES11 default is
>> common. In fact, the numbers in the referred mail were done on an
>> 11.1 system.
>>    
>
> But it wasn't the SLES10 default so there's a smaller window of
> systems that are going to be configured this way.  But this is
> orthogonal to the main point.  Let's quantify how important this
> detail is before we discuss the affected user base.

Alright. I took my Netbook (2GB of RAM) and a USB hard disk, so I can
easily remount the data fs the vmdk image is on. Here are the results:

# mkfs.ext3 /dev/sdc1
# mount /dev/sdc1 /mnt -obarrier=1

cache=writeback

real    0m52.801s
user    0m16.065s
sys     0m6.688s

cache=volatile

real    0m47.876s
user    0m15.921s
sys     0m6.548s

# mount /dev/sdc1 /mnt -obarrier=0

cache=writeback

real    0m53.588s
user    0m15.901s
sys     0m6.576s

cache=volatile

real    0m48.715s
user    0m16.581s
sys     0m5.856s

I don't see a difference between the results. Apparently the barrier
option doesn't change a thing.


Alex
Alexander Graf May 17, 2010, 2:22 p.m. UTC | #8
Alexander Graf wrote:
> Anthony Liguori wrote:
>   
>> On 05/17/2010 08:17 AM, Alexander Graf wrote:
>>     
>>> On 17.05.2010, at 15:09, Anthony Liguori wrote:
>>>
>>>   
>>>       
>>>> On 05/17/2010 08:02 AM, Alexander Graf wrote:
>>>>     
>>>>         
>>>>>> My concern is that ext3 exaggerates the cost of fsync() which will
>>>>>> result in diminishing value over time for this feature as people
>>>>>> move to ext4/btrfs.
>>>>>>
>>>>>>          
>>>>>>             
>>>>> There will be ext3 file systems for years out. Just because people
>>>>> can use better and faster file systems doesn't mean they do. And
>>>>> I'm sure they can't always choose. If anything, I can try and see
>>>>> what the numbers look like for xfs.
>>>>>
>>>>>        
>>>>>           
>>>> But ext3 with barrier=1 is pretty uncommon in practice.  Another
>>>> data point would be an ext3 host file system with barrier=0.
>>>>      
>>>>         
>>> Who defines what is common and what not? To me, the SLES11 default is
>>> common. In fact, the numbers in the referred mail were done on an
>>> 11.1 system.
>>>    
>>>       
>> But it wasn't the SLES10 default so there's a smaller window of
>> systems that are going to be configured this way.  But this is
>> orthogonal to the main point.  Let's quantify how important this
>> detail is before we discuss the affected user base.
>>     
>
> Alright. I took my Netbook (2GB of RAM) and a USB hard disk, so I can
> easily remount the data fs the vmdk image is on. Here are the results:
>
> # mkfs.ext3 /dev/sdc1
> # mount /dev/sdc1 /mnt -obarrier=1
>
> cache=writeback
>
> real    0m52.801s
> user    0m16.065s
> sys     0m6.688s
>
> cache=volatile
>
> real    0m47.876s
> user    0m15.921s
> sys     0m6.548s
>
> # mount /dev/sdc1 /mnt -obarrier=0
>
> cache=writeback
>
> real    0m53.588s
> user    0m15.901s
> sys     0m6.576s
>
> cache=volatile
>
> real    0m48.715s
> user    0m16.581s
> sys     0m5.856s
>
> I don't see a difference between the results. Apparently the barrier
> option doesn't change a thing.
>   

The same test case for XFS:

cache=writeback

real    0m50.868s
user    0m11.133s
sys    0m12.733s

cache=volatile

real    0m43.680s
user    0m16.089s
sys    0m7.812s

Though I did have numbers here going as far down as 25 seconds for a run!


Alex
Anthony Liguori May 17, 2010, 3:11 p.m. UTC | #9
On 05/17/2010 05:14 AM, Alexander Graf wrote:
> Usually the guest can tell the host to flush data to disk. In some cases we
> don't want to flush though, but try to keep everything in cache.
>
> So let's add a new cache value to -drive that allows us to set the cache
> policy to most aggressive, disabling flushes. We call this mode "volatile",
> as guest data is not guaranteed to survive host crashes anymore.
>
> This patch also adds a noop function for aio, so we can do nothing in AIO
> fashion.
>
> Signed-off-by: Alexander Graf <agraf@suse.de>
>
> ---
>
> v2 ->  v3:
>
>    - Add description of cache=volatile
>    - Squash aio noop noop patch into this one
> ---
>   block.c         |   28 ++++++++++++++++++++++++++++
>   block.h         |    1 +
>   qemu-config.c   |    2 +-
>   qemu-options.hx |   13 ++++++++++---
>   vl.c            |    3 +++
>   5 files changed, 43 insertions(+), 4 deletions(-)
>
> diff --git a/block.c b/block.c
> index 48305b7..b742965 100644
> --- a/block.c
> +++ b/block.c
> @@ -50,6 +50,8 @@ static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
>           BlockDriverCompletionFunc *cb, void *opaque);
>   static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
>           BlockDriverCompletionFunc *cb, void *opaque);
> +static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
> +        BlockDriverCompletionFunc *cb, void *opaque);
>   static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>                           uint8_t *buf, int nb_sectors);
>   static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
> @@ -1306,6 +1308,10 @@ const char *bdrv_get_device_name(BlockDriverState *bs)
>
>   void bdrv_flush(BlockDriverState *bs)
>   {
> +    if (bs->open_flags & BDRV_O_NO_FLUSH) {
> +        return;
> +    }
> +
>       if (bs->drv && bs->drv->bdrv_flush)
>           bs->drv->bdrv_flush(bs);
>   }
> @@ -2082,6 +2088,10 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
>   {
>       BlockDriver *drv = bs->drv;
>
> +    if (bs->open_flags & BDRV_O_NO_FLUSH) {
> +        return bdrv_aio_noop_em(bs, cb, opaque);
> +    }
> +
>       if (!drv)
>           return NULL;
>       return drv->bdrv_aio_flush(bs, cb, opaque);
> @@ -2196,6 +2206,24 @@ static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
>       return &acb->common;
>   }
>
> +static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    BlockDriverAIOCBSync *acb;
> +
> +    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
> +    acb->is_write = 1; /* don't bounce in the completion handler */
> +    acb->qiov = NULL;
> +    acb->bounce = NULL;
> +    acb->ret = 0;
> +
> +    if (!acb->bh)
> +        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
> +
> +    qemu_bh_schedule(acb->bh);
> +    return &acb->common;
> +}
> +
>   /**************************************************************/
>   /* sync block device emulation */
>
> diff --git a/block.h b/block.h
> index f87d24e..8032b6b 100644
> --- a/block.h
> +++ b/block.h
> @@ -33,6 +33,7 @@ typedef struct QEMUSnapshotInfo {
>   #define BDRV_O_CACHE_WB    0x0040 /* use write-back caching */
>   #define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool */
>   #define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
> +#define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
>
>   #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB)
>
> diff --git a/qemu-config.c b/qemu-config.c
> index d500885..bf3d493 100644
> --- a/qemu-config.c
> +++ b/qemu-config.c
> @@ -54,7 +54,7 @@ QemuOptsList qemu_drive_opts = {
>           },{
>               .name = "cache",
>               .type = QEMU_OPT_STRING,
> -            .help = "host cache usage (none, writeback, writethrough)",
> +            .help = "host cache usage (none, writeback, writethrough, volatile)",
>           },{
>               .name = "aio",
>               .type = QEMU_OPT_STRING,
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 12f6b51..6dedb4a 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -118,8 +118,9 @@ ETEXI
>   DEF("drive", HAS_ARG, QEMU_OPTION_drive,
>       "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
>       "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
> -    "       [,cache=writethrough|writeback|none][,format=f][,serial=s]\n"
> -    "       [,addr=A][,id=name][,aio=threads|native][,readonly=on|off]\n"
> +    "       [,cache=writethrough|writeback|volatile|none][,format=f]\n"
> +    "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
> +    "       [,readonly=on|off]\n"
>       "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
>   STEXI
>   @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
> @@ -148,7 +149,7 @@ These options have the same definition as they have in @option{-hdachs}.
>   @item snapshot=@var{snapshot}
>   @var{snapshot} is "on" or "off" and allows to enable snapshot for given drive (see @option{-snapshot}).
>   @item cache=@var{cache}
> -@var{cache} is "none", "writeback", or "writethrough" and controls how the host cache is used to access block data.
> +@var{cache} is "none", "writeback", "volatile", or "writethrough" and controls how the host cache is used to access block data.
>   @item aio=@var{aio}
>   @var{aio} is "threads", or "native" and selects between pthread based disk I/O and native Linux AIO.
>   @item format=@var{format}
> @@ -180,6 +181,12 @@ Some block drivers perform badly with @option{cache=writethrough}, most notably,
>   qcow2.  If performance is more important than correctness,
>   @option{cache=writeback} should be used with qcow2.
>
> +In case you don't care about data integrity over host failures, use
> +cache=volatile. This option tells qemu that it never needs to write any data
> +to the disk but can instead keep things in cache. If anything goes wrong,
> +like your host losing power or the disk storage getting disconnected
> +accidentally, your image will most probably be rendered unusable.
>    

the cache=writeback help needs to be reworked too because it's no longer 
valid.

Regards,

Anthony Liguori

>   Instead of @option{-cdrom} you can use:
>   @example
>   qemu -drive file=file,index=2,media=cdrom
> diff --git a/vl.c b/vl.c
> index 85bcc84..c8abce6 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -913,6 +913,9 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
>               bdrv_flags |= BDRV_O_NOCACHE;
>           } else if (!strcmp(buf, "writeback")) {
>               bdrv_flags |= BDRV_O_CACHE_WB;
> +        } else if (!strcmp(buf, "volatile")) {
> +            bdrv_flags |= BDRV_O_CACHE_WB;
> +            bdrv_flags |= BDRV_O_NO_FLUSH;
>           } else if (!strcmp(buf, "writethrough")) {
>               /* this is the default */
>           } else {
>
Anthony Liguori May 17, 2010, 3:16 p.m. UTC | #10
On 05/17/2010 09:04 AM, Alexander Graf wrote:
> Anthony Liguori wrote:
>    
>> On 05/17/2010 08:17 AM, Alexander Graf wrote:
>>      
>>> On 17.05.2010, at 15:09, Anthony Liguori wrote:
>>>
>>>
>>>        
>>>> On 05/17/2010 08:02 AM, Alexander Graf wrote:
>>>>
>>>>          
>>>>>> My concern is that ext3 exaggerates the cost of fsync() which will
>>>>>> result in diminishing value over time for this feature as people
>>>>>> move to ext4/btrfs.
>>>>>>
>>>>>>
>>>>>>              
>>>>> There will be ext3 file systems for years out. Just because people
>>>>> can use better and faster file systems doesn't mean they do. And
>>>>> I'm sure they can't always choose. If anything, I can try and see
>>>>> what the numbers look like for xfs.
>>>>>
>>>>>
>>>>>            
>>>> But ext3 with barrier=1 is pretty uncommon in practice.  Another
>>>> data point would be an ext3 host file system with barrier=0.
>>>>
>>>>          
>>> Who defines what is common and what not? To me, the SLES11 default is
>>> common. In fact, the numbers in the referred mail were done on an
>>> 11.1 system.
>>>
>>>        
>> But it wasn't the SLES10 default so there's a smaller window of
>> systems that are going to be configured this way.  But this is
>> orthogonal to the main point.  Let's quantify how important this
>> detail is before we discuss the affected user base.
>>      
> Alright. I took my Netbook (2GB of RAM) and a USB hard disk, so I can
> easily remount the data fs the vmdk image is on. Here are the results:
>
> # mkfs.ext3 /dev/sdc1
> # mount /dev/sdc1 /mnt -obarrier=1
>
> cache=writeback
>
> real    0m52.801s
> user    0m16.065s
> sys     0m6.688s
>
> cache=volatile
>
> real    0m47.876s
> user    0m15.921s
> sys     0m6.548s
>
> # mount /dev/sdc1 /mnt -obarrier=0
>
> cache=writeback
>
> real    0m53.588s
> user    0m15.901s
> sys     0m6.576s
>
> cache=volatile
>
> real    0m48.715s
> user    0m16.581s
> sys     0m5.856s
>
> I don't see a difference between the results. Apparently the barrier
> option doesn't change a thing.
>    

Ok.  I don't like it, but I can see how it's compelling.  I'd like to 
see the documentation improved though.  I also think a warning printed 
on stdio about the safety of the option would be appropriate.

Regards,

Anthony Liguori

>
> Alex
>
>
Paul Brook May 17, 2010, 4:23 p.m. UTC | #11
> > I don't see a difference between the results. Apparently the barrier
> > option doesn't change a thing.
> 
> Ok.  I don't like it, but I can see how it's compelling.  I'd like to
> see the documentation improved though.  I also think a warning printed
> on stdio about the safety of the option would be appropriate.

I disagree with this last bit.

Errors should be issued if the user did something wrong.
Warnings should be issued if qemu did (or will soon do) something other than 
what the user requested, or otherwise made questionable decisions on the 
user's behalf.

In this case we're doing exactly what the user requested. The only plausible 
failure case is where a user is blindly trying options that they clearly don't 
understand or read the documentation for. I have zero sympathy for complaints 
like "Someone on the Internet told me to use --breakme, and broke thinks".

Paul
Anthony Liguori May 17, 2010, 4:26 p.m. UTC | #12
On 05/17/2010 11:23 AM, Paul Brook wrote:
>>> I don't see a difference between the results. Apparently the barrier
>>> option doesn't change a thing.
>>>        
>> Ok.  I don't like it, but I can see how it's compelling.  I'd like to
>> see the documentation improved though.  I also think a warning printed
>> on stdio about the safety of the option would be appropriate.
>>      
> I disagree with this last bit.
>
> Errors should be issued if the user did something wrong.
> Warnings should be issued if qemu did (or will soon do) something other than
> what the user requested, or otherwise made questionable decisions on the
> user's behalf.
>
> In this case we're doing exactly what the user requested. The only plausible
> failure case is where a user is blindly trying options that they clearly don't
> understand or read the documentation for. I have zero sympathy for complaints
> like "Someone on the Internet told me to use --breakme, and broke thinks".
>    

I see it as the equivalent to the Taint bit in Linux.  I want to make it 
clear to users up front that if you use this option, and you have data 
loss issues, don't complain.

Just putting something in qemu-doc.texi is not enough IMHO.  Few people 
actually read it.

Regards,

Anthony Liguori

> Paul
>
>
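
For illustration, the stdio warning being discussed here could be as small as an
extra fprintf() in the vl.c branch the patch adds; the wording and placement
below are assumptions, not code from the thread:

        } else if (!strcmp(buf, "volatile")) {
            /* hypothetical warning: flushes are dropped for this drive */
            fprintf(stderr, "qemu: warning: cache=volatile disables flushes, "
                    "guest data will not survive a host crash\n");
            bdrv_flags |= BDRV_O_CACHE_WB;
            bdrv_flags |= BDRV_O_NO_FLUSH;

A user starting a guest with something like -drive file=test.img,cache=volatile
(the file name is a placeholder) would then see the message once at startup.
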
Alexander Graf May 17, 2010, 4:28 p.m. UTC | #13
On 17.05.2010, at 18:26, Anthony Liguori wrote:

> On 05/17/2010 11:23 AM, Paul Brook wrote:
>>>> I don't see a difference between the results. Apparently the barrier
>>>> option doesn't change a thing.
>>>>       
>>> Ok.  I don't like it, but I can see how it's compelling.  I'd like to
>>> see the documentation improved though.  I also think a warning printed
>>> on stdio about the safety of the option would be appropriate.
>>>     
>> I disagree with this last bit.
>> 
>> Errors should be issued if the user did something wrong.
>> Warnings should be issued if qemu did (or will soon do) something other than
>> what the user requested, or otherwise made questionable decisions on the
>> user's behalf.
>> 
>> In this case we're doing exactly what the user requested. The only plausible
>> failure case is where a user is blindly trying options that they clearly don't
>> understand or read the documentation for. I have zero sympathy for complaints
>> like "Someone on the Internet told me to use --breakme, and broke thinks".
>>   
> 
> I see it as the equivalent to the Taint bit in Linux.  I want to make it clear to users up front that if you use this option, and you have data loss issues, don't complain.
> 
> Just putting something in qemu-doc.texi is not enough IMHO.  Few people actually read it.

But that's why it's not the default and is also called "volatile". If you prefer, we can call it cache=destroys_your_image.


Alex
Jamie Lokier May 17, 2010, 8:07 p.m. UTC | #14
Alexander Graf wrote:
> 
> On 17.05.2010, at 18:26, Anthony Liguori wrote:
> 
> > On 05/17/2010 11:23 AM, Paul Brook wrote:
> >>>> I don't see a difference between the results. Apparently the barrier
> >>>> option doesn't change a thing.
> >>>>       
> >>> Ok.  I don't like it, but I can see how it's compelling.  I'd like to
> >>> see the documentation improved though.  I also think a warning printed
> >>> on stdio about the safety of the option would be appropriate.
> >>>     
> >> I disagree with this last bit.
> >> 
> >> Errors should be issued if the user did something wrong.
> >> Warnings should be issued if qemu did (or will soon do) something other than
> >> what the user requested, or otherwise made questionable decisions on the
> >> user's behalf.
> >> 
> >> In this case we're doing exactly what the user requested. The only plausible
> >> failure case is where a user is blindly trying options that they clearly don't
> >> understand or read the documentation for. I have zero sympathy for complaints
> >> like "Someone on the Internet told me to use --breakme, and broke thinks".
> >>   
> > 
> > I see it as the equivalent to the Taint bit in Linux.  I want to make it clear to users up front that if you use this option, and you have data loss issues, don't complain.
> > 
> > Just putting something in qemu-doc.texi is not enough IMHO.  Few people actually read it.
> 
> But that's why it's no default and also called "volatile". If you prefer, we can call it cache=destroys_your_image.

With that semantic, a future iteration of cache=volatile could even
avoid writing to the backing file at all, if that's yet faster.  I
wonder if that would be faster.  Anyone fancy doing a hack with the
whole guest image as a big malloc inside qemu?  I don't have enough RAM :-)

-- Jamie
Kevin Wolf May 18, 2010, 7:42 a.m. UTC | #15
Am 17.05.2010 22:07, schrieb Jamie Lokier:
> Alexander Graf wrote:
>>
>> On 17.05.2010, at 18:26, Anthony Liguori wrote:
>>
>>> On 05/17/2010 11:23 AM, Paul Brook wrote:
>>>>>> I don't see a difference between the results. Apparently the barrier
>>>>>> option doesn't change a thing.
>>>>>>       
>>>>> Ok.  I don't like it, but I can see how it's compelling.  I'd like to
>>>>> see the documentation improved though.  I also think a warning printed
>>>>> on stdio about the safety of the option would be appropriate.
>>>>>     
>>>> I disagree with this last bit.
>>>>
>>>> Errors should be issued if the user did something wrong.
>>>> Warnings should be issued if qemu did (or will soon do) something other than
>>>> what the user requested, or otherwise made questionable decisions on the
>>>> user's behalf.
>>>>
>>>> In this case we're doing exactly what the user requested. The only plausible
>>>> failure case is where a user is blindly trying options that they clearly don't
>>>> understand or read the documentation for. I have zero sympathy for complaints
>>>> like "Someone on the Internet told me to use --breakme, and broke thinks".
>>>>   
>>>
>>> I see it as the equivalent to the Taint bit in Linux.  I want to make it clear to users up front that if you use this option, and you have data loss issues, don't complain.
>>>
>>> Just putting something in qemu-doc.texi is not enough IMHO.  Few people actually read it.
>>
>> But that's why it's no default and also called "volatile". If you prefer, we can call it cache=destroys_your_image.
> 
> With that semantic, a future iteration of cache=volatile could even
> avoid writing to the backing file at all, if that's yet faster.  I
> wonder if that would be faster.  Anyone fancy doing a hack with the
> whole guest image as a big malloc inside qemu?  I don't have enough RAM :-)

But then you'd probably want a separate RAM block driver instead of
messing with cache options.

Kevin
Alexander Graf May 25, 2010, 5:59 p.m. UTC | #16
Anthony Liguori wrote:
> On 05/17/2010 11:23 AM, Paul Brook wrote:
>>>> I don't see a difference between the results. Apparently the barrier
>>>> option doesn't change a thing.
>>>>        
>>> Ok.  I don't like it, but I can see how it's compelling.  I'd like to
>>> see the documentation improved though.  I also think a warning printed
>>> on stdio about the safety of the option would be appropriate.
>>>      
>> I disagree with this last bit.
>>
>> Errors should be issued if the user did something wrong.
>> Warnings should be issued if qemu did (or will soon do) something
>> other than
>> what the user requested, or otherwise made questionable decisions on the
>> user's behalf.
>>
>> In this case we're doing exactly what the user requested. The only
>> plausible
>> failure case is where a user is blindly trying options that they
>> clearly don't
>> understand or read the documentation for. I have zero sympathy for
>> complaints
>> like "Someone on the Internet told me to use --breakme, and broke
>> thinks".
>>    
>
> I see it as the equivalent to the Taint bit in Linux.  I want to make
> it clear to users up front that if you use this option, and you have
> data loss issues, don't complain.
>
> Just putting something in qemu-doc.texi is not enough IMHO.  Few
> people actually read it.

So what exactly is the conclusion here? I really want to see this
getting merged.


Alex
Anthony Liguori May 25, 2010, 6:48 p.m. UTC | #17
On 05/25/2010 12:59 PM, Alexander Graf wrote:
>> I see it as the equivalent to the Taint bit in Linux.  I want to make
>> it clear to users up front that if you use this option, and you have
>> data loss issues, don't complain.
>>
>> Just putting something in qemu-doc.texi is not enough IMHO.  Few
>> people actually read it.
>>      
> So what exactly is the conclusion here? I really want to see this
> getting merged.
>    

Make it more scary and try again.

Regards,

Anthony Liguori

>
> Alex
>
>
Alexander Graf May 25, 2010, 7:01 p.m. UTC | #18
Anthony Liguori wrote:
> On 05/25/2010 12:59 PM, Alexander Graf wrote:
>>> I see it as the equivalent to the Taint bit in Linux.  I want to make
>>> it clear to users up front that if you use this option, and you have
>>> data loss issues, don't complain.
>>>
>>> Just putting something in qemu-doc.texi is not enough IMHO.  Few
>>> people actually read it.
>>>      
>> So what exactly is the conclusion here? I really want to see this
>> getting merged.
>>    
>
> Make it more scary and try again.

I don't see how to make it more scary while still considering users sane
human beings. You don't print out a big fat warning on -drive if=scsi
either, right?

Alex
Aurelien Jarno May 25, 2010, 9:01 p.m. UTC | #19
On Tue, May 25, 2010 at 07:59:18PM +0200, Alexander Graf wrote:
> Anthony Liguori wrote:
> > On 05/17/2010 11:23 AM, Paul Brook wrote:
> >>>> I don't see a difference between the results. Apparently the barrier
> >>>> option doesn't change a thing.
> >>>>        
> >>> Ok.  I don't like it, but I can see how it's compelling.  I'd like to
> >>> see the documentation improved though.  I also think a warning printed
> >>> on stdio about the safety of the option would be appropriate.
> >>>      
> >> I disagree with this last bit.
> >>
> >> Errors should be issued if the user did something wrong.
> >> Warnings should be issued if qemu did (or will soon do) something
> >> other than
> >> what the user requested, or otherwise made questionable decisions on the
> >> user's behalf.
> >>
> >> In this case we're doing exactly what the user requested. The only
> >> plausible
> >> failure case is where a user is blindly trying options that they
> >> clearly don't
> >> understand or read the documentation for. I have zero sympathy for
> >> complaints
> >> like "Someone on the Internet told me to use --breakme, and broke
> >> thinks".
> >>    
> >
> > I see it as the equivalent to the Taint bit in Linux.  I want to make
> > it clear to users up front that if you use this option, and you have
> > data loss issues, don't complain.
> >
> > Just putting something in qemu-doc.texi is not enough IMHO.  Few
> > people actually read it.
> 
> So what exactly is the conclusion here? I really want to see this
> getting merged
> 

I really think this patch can be useful, in my own case when testing
debian-installer (I already use cache=writeback). In short, everything that
is about developing and testing, as opposed to running a VM in production,
can benefit from it. This was one of the original use cases of QEMU before
KVM arrived.

Unless someone can convince me not to do it, I am seriously considering
applying this patch.
Anthony Liguori May 26, 2010, 1:31 a.m. UTC | #20
On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
>
> I really think this patch can be useful, in my own case when testing
> debian-installer (I already cache=writeback). In short all that is about
> developing and testing, as opposed to run a VM in production, can
> benefit about that. This was one of the original use case of QEMU before
> KVM arrived.
>
> Unless someone can convince me not to do it, I seriously considering
> applying this patch.
>    

There really needs to be an indication in the --help output of what the 
ramifications of this option are, at the very least.  It should also be 
removable via a ./configure option because no sane distribution should 
enable this for end users.

Regards,

Anthony Liguori
Kevin Wolf May 26, 2010, 8:43 a.m. UTC | #21
Am 26.05.2010 03:31, schrieb Anthony Liguori:
> On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
>>
>> I really think this patch can be useful, in my own case when testing
>> debian-installer (I already cache=writeback). In short all that is about
>> developing and testing, as opposed to run a VM in production, can
>> benefit about that. This was one of the original use case of QEMU before
>> KVM arrived.
>>
>> Unless someone can convince me not to do it, I seriously considering
>> applying this patch.
>>    
> 
> There really needs to be an indication in the --help output of what the 
> ramifications of this option are, in the very least.  It should also be 
> removable via a ./configure option because no sane distribution should 
> enable this for end users.

We know better what you stupid users want? There are valid use cases for
this cache option, most notably installation.

I could agree with requesting that the option should be called
cache=unsafe (or even Alex' cache=destroys_your_image), but that should
really be enough to tell everyone that his data is not safe with this
option.

Kevin
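
To make the renaming concrete, here is a minimal sketch of what cache=unsafe
would mean for the vl.c hunk of this patch; keeping "volatile" as an accepted
alias is purely an assumption for illustration:

        } else if (!strcmp(buf, "unsafe") || !strcmp(buf, "volatile")) {
            /* write-back caching, and guest flush requests are ignored */
            bdrv_flags |= BDRV_O_CACHE_WB;
            bdrv_flags |= BDRV_O_NO_FLUSH;

Only the option string changes; the BDRV_O_NO_FLUSH semantics stay exactly as
posted.
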
Aurelien Jarno May 26, 2010, 8:52 a.m. UTC | #22
On Tue, May 25, 2010 at 08:31:20PM -0500, Anthony Liguori wrote:
> On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
> >
> >I really think this patch can be useful, in my own case when testing
> >debian-installer (I already cache=writeback). In short all that is about
> >developing and testing, as opposed to run a VM in production, can
> >benefit about that. This was one of the original use case of QEMU before
> >KVM arrived.
> >
> >Unless someone can convince me not to do it, I seriously considering
> >applying this patch.
> 
> There really needs to be an indication in the --help output of what
> the ramifications of this option are, in the very least.  It should

That's indeed something that can be done, but to avoid double standards,
it should also be done for other features that can lead to data
corruption. I am talking, for example, about the qcow format, which is not
really supported anymore.

> also be removable via a ./configure option because no sane
> distribution should enable this for end users.
> 

I totally disagree. All the examples I have given apply to qemu *users*,
not qemu developers. They surely don't want to recompile qemu for their
usage. Note also that what is proposed in this patch was the default not
too long ago, and that a lot of users complained about the new default
for their usage; they see it as a regression. We even had to put a note
explaining that in the Debian package to avoid too many bug reports.
cache=writeback only partially answers this use case.
Kevin Wolf May 26, 2010, 9:16 a.m. UTC | #23
Am 26.05.2010 10:52, schrieb Aurelien Jarno:
> On Tue, May 25, 2010 at 08:31:20PM -0500, Anthony Liguori wrote:
>> On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
>>>
>>> I really think this patch can be useful, in my own case when testing
>>> debian-installer (I already cache=writeback). In short all that is about
>>> developing and testing, as opposed to run a VM in production, can
>>> benefit about that. This was one of the original use case of QEMU before
>>> KVM arrived.
>>>
>>> Unless someone can convince me not to do it, I seriously considering
>>> applying this patch.
>>
>> There really needs to be an indication in the --help output of what
>> the ramifications of this option are, in the very least.  It should
> 
> That's indeed something than can be done, but to avoid double standards,
> it should also be done for other features that can lead to data
> corruption. I am talking for example on the qcow format, which is not
> really supported anymore.

That said, qcow1 is probably in a better state than half of the other
drivers. Basically only raw and qcow2 are really maintained, so you'd need
to warn about any other format then.

Kevin
Avi Kivity May 26, 2010, 1:06 p.m. UTC | #24
On 05/17/2010 03:58 PM, Anthony Liguori wrote:
> On 05/17/2010 05:14 AM, Alexander Graf wrote:
>> Usually the guest can tell the host to flush data to disk. In some 
>> cases we
>> don't want to flush though, but try to keep everything in cache.
>>
>> So let's add a new cache value to -drive that allows us to set the cache
>> policy to most aggressive, disabling flushes. We call this mode 
>> "volatile",
>> as guest data is not guaranteed to survive host crashes anymore.
>>
>> This patch also adds a noop function for aio, so we can do nothing in 
>> AIO
>> fashion.
>>
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>
> I'd like to see some performance data with at least an ext3 host file 
> system and an ext4 file system.
>
> My concern is that ext3 exaggerates the cost of fsync() which will 
> result in diminishing value over time for this feature as people move 
> to ext4/btrfs.
>

In fact, btrfs is currently unusable for virt because O_SYNC writes 
inflate a guest write into host writes by a huge factor (50x-100x).  
cache=writethrough is 100% unusable, cache=writeback is barely 
tolerable.  As of 2.6.32, cache=volatile is probably required to get 
something resembling reasonable performance on btrfs.

Of course, we expect that btrfs will improve in time, but still it 
doesn't seem to be fsync friendly.
Avi Kivity May 26, 2010, 1:09 p.m. UTC | #25
On 05/25/2010 09:48 PM, Anthony Liguori wrote:
> On 05/25/2010 12:59 PM, Alexander Graf wrote:
>>> I see it as the equivalent to the Taint bit in Linux.  I want to make
>>> it clear to users up front that if you use this option, and you have
>>> data loss issues, don't complain.
>>>
>>> Just putting something in qemu-doc.texi is not enough IMHO.  Few
>>> people actually read it.
>> So what exactly is the conclusion here? I really want to see this
>> getting merged.
>
> Make it more scary and try again.

How about ./configure --with-cache-volatile to enable it at all?

We wants it.
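
A rough sketch of how such a configure switch could gate the option, assuming
--with-cache-volatile simply defines a CONFIG_CACHE_VOLATILE macro (the macro
name is illustrative, not an existing build option):

        } else if (!strcmp(buf, "writeback")) {
            bdrv_flags |= BDRV_O_CACHE_WB;
#ifdef CONFIG_CACHE_VOLATILE
        } else if (!strcmp(buf, "volatile")) {
            bdrv_flags |= BDRV_O_CACHE_WB;
            bdrv_flags |= BDRV_O_NO_FLUSH;
#endif
        } else if (!strcmp(buf, "writethrough")) {
            /* this is the default */

With the macro undefined, "volatile" falls through to the final else branch,
which presumably rejects it as an invalid cache value, so a distribution could
disable the feature at build time without further code changes.
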
Anthony Liguori May 26, 2010, 1:42 p.m. UTC | #26
On 05/26/2010 03:43 AM, Kevin Wolf wrote:
> Am 26.05.2010 03:31, schrieb Anthony Liguori:
>    
>> On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
>>      
>>> I really think this patch can be useful, in my own case when testing
>>> debian-installer (I already cache=writeback). In short all that is about
>>> developing and testing, as opposed to run a VM in production, can
>>> benefit about that. This was one of the original use case of QEMU before
>>> KVM arrived.
>>>
>>> Unless someone can convince me not to do it, I seriously considering
>>> applying this patch.
>>>
>>>        
>> There really needs to be an indication in the --help output of what the
>> ramifications of this option are, in the very least.  It should also be
>> removable via a ./configure option because no sane distribution should
>> enable this for end users.
>>      
> We know better what you stupid user want?

What percentage of qemu users do you think have actually read qemu-doc.texi?

It's not a stretch for someone to have heard that cache options can 
improve performance, and then see cache=volatile in the help output, try 
it, and then start using it because they observe a performance improvement.

That's not being stupid.  I think it's a reasonable expectation for a 
user to have that their data is safe.

Regards,

Anthony Liguori

>   There are valid use cases for
> this cache option, most notably installation.
>
> I could agree with requesting that the option should be called
> cache=unsafe (or even Alex' cache=destroys_your_image), but that should
> really be enough to tell everyone that his data is not safe with this
> option.
>
> Kevin
>
Anthony Liguori May 26, 2010, 1:48 p.m. UTC | #27
On 05/26/2010 03:52 AM, Aurelien Jarno wrote:
> On Tue, May 25, 2010 at 08:31:20PM -0500, Anthony Liguori wrote:
>    
>> On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
>>      
>>> I really think this patch can be useful, in my own case when testing
>>> debian-installer (I already cache=writeback). In short all that is about
>>> developing and testing, as opposed to run a VM in production, can
>>> benefit about that. This was one of the original use case of QEMU before
>>> KVM arrived.
>>>
>>> Unless someone can convince me not to do it, I seriously considering
>>> applying this patch.
>>>        
>> There really needs to be an indication in the --help output of what
>> the ramifications of this option are, in the very least.  It should
>>      
> That's indeed something than can be done, but to avoid double standards,
> it should also be done for other features that can lead to data
> corruption. I am talking for example on the qcow format, which is not
> really supported anymore.
>    

I agree.

>> also be removable via a ./configure option because no sane
>> distribution should enable this for end users.
>>
>>      
> I totally disagree. All the examples I have given apply to qemu *users*,
> not qemu developers. They surely don't want to recompile qemu for their
> usage. Note also that what is proposed in this patch was the default not
> too long ago, and that a lot of users complained about the new default
> for their usage, they see it as a regression. We even had to put a note
> explaining that in the Debian package to avoid to many bug reports.
> cache=writeback only answer partially to this use case.
>    

It's hard for me to consider this a performance regression because 
ultimately, you're getting greater than bare metal performance (because 
of extremely aggressive caching).  It might be a regression from the 
previous performance, but that was at the cost of safety.

We might get 100 bug reports about this "regression", but they concern me
much less than one bug report of image corruption because of a power
failure or host crash.  A reputation of being unsafe is very difficult to 
get rid of and is something that I hear concerns about frequently.

I'm not suggesting that the compile option should be disabled by default 
upstream.  But the option should be there for distributions because I 
hope that any enterprise distro disables it.

Regards,

Anthony Liguori
Anthony Liguori May 26, 2010, 1:50 p.m. UTC | #28
On 05/26/2010 08:06 AM, Avi Kivity wrote:
> On 05/17/2010 03:58 PM, Anthony Liguori wrote:
>> On 05/17/2010 05:14 AM, Alexander Graf wrote:
>>> Usually the guest can tell the host to flush data to disk. In some 
>>> cases we
>>> don't want to flush though, but try to keep everything in cache.
>>>
>>> So let's add a new cache value to -drive that allows us to set the 
>>> cache
>>> policy to most aggressive, disabling flushes. We call this mode 
>>> "volatile",
>>> as guest data is not guaranteed to survive host crashes anymore.
>>>
>>> This patch also adds a noop function for aio, so we can do nothing 
>>> in AIO
>>> fashion.
>>>
>>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>
>> I'd like to see some performance data with at least an ext3 host file 
>> system and an ext4 file system.
>>
>> My concern is that ext3 exaggerates the cost of fsync() which will 
>> result in diminishing value over time for this feature as people move 
>> to ext4/btrfs.
>>
>
> In fact, btrfs is currently unusable for virt because O_SYNC writes 
> inflate a guest write into host writes by a huge factor (50x-100x).  
> cache=writethrough is 100% unusable, cache=writeback is barely 
> tolerable.  As of 2.6.32, cache=volatile is probably required to get 
> something resembling reasonable performance on btrfs.
>
> Of course, we expect that btrfs will improve in time, but still it 
> doesn't seem to be fsync friendly.

So you're suggesting that anyone who uses virt on btrfs should be 
prepared to deal with data corruption on host failure?  That sounds to 
me like btrfs isn't ready for real workloads.

Regards,

Anthony Liguori
Kevin Wolf May 26, 2010, 2:03 p.m. UTC | #29
Am 26.05.2010 15:42, schrieb Anthony Liguori:
> On 05/26/2010 03:43 AM, Kevin Wolf wrote:
>> Am 26.05.2010 03:31, schrieb Anthony Liguori:
>>    
>>> On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
>>>      
>>>> I really think this patch can be useful, in my own case when testing
>>>> debian-installer (I already cache=writeback). In short all that is about
>>>> developing and testing, as opposed to run a VM in production, can
>>>> benefit about that. This was one of the original use case of QEMU before
>>>> KVM arrived.
>>>>
>>>> Unless someone can convince me not to do it, I seriously considering
>>>> applying this patch.
>>>>
>>>>        
>>> There really needs to be an indication in the --help output of what the
>>> ramifications of this option are, in the very least.  It should also be
>>> removable via a ./configure option because no sane distribution should
>>> enable this for end users.
>>>      
>> We know better what you stupid user want?
> 
> What percentage of qemu users do you think have actually read qemu-doc.texi?

As I said, put the warning in the option name like cache=unsafe or
something even more scary and I'm all for it.

> It's not a stretch for someone to have heard that cache options can 
> improve performance, and then see cache=volatile in the help output, try 
> it, and then start using it because they observe a performance improvement.
> 
> That's not being stupid.  I think it's a reasonable expectation for a 
> user to have that their data is safe.

You seem to think that the user is too stupid to be allowed to use this
option even if he's perfectly aware of what it's doing. It's a useful
option if it's used right.

We need to make clear that it's dangerous when it's used in the wrong
cases (for example by naming), but just disabling it is not a solution for
that. You don't suggest that "no sane distribution" should ship rm,
because it's dangerous if you use it wrong, do you?

Kevin
Anthony Liguori May 26, 2010, 2:08 p.m. UTC | #30
On 05/26/2010 09:03 AM, Kevin Wolf wrote:
> Am 26.05.2010 15:42, schrieb Anthony Liguori:
>    
>> On 05/26/2010 03:43 AM, Kevin Wolf wrote:
>>      
>>> Am 26.05.2010 03:31, schrieb Anthony Liguori:
>>>
>>>        
>>>> On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
>>>>
>>>>          
>>>>> I really think this patch can be useful, in my own case when testing
>>>>> debian-installer (I already cache=writeback). In short all that is about
>>>>> developing and testing, as opposed to run a VM in production, can
>>>>> benefit about that. This was one of the original use case of QEMU before
>>>>> KVM arrived.
>>>>>
>>>>> Unless someone can convince me not to do it, I seriously considering
>>>>> applying this patch.
>>>>>
>>>>>
>>>>>            
>>>> There really needs to be an indication in the --help output of what the
>>>> ramifications of this option are, in the very least.  It should also be
>>>> removable via a ./configure option because no sane distribution should
>>>> enable this for end users.
>>>>
>>>>          
>>> We know better what you stupid user want?
>>>        
>> What percentage of qemu users do you think have actually read qemu-doc.texi?
>>      
> As I said, put the warning in the option name like cache=unsafe or
> something even more scary and I'm all for it.
>
>    
>> It's not a stretch for someone to have heard that cache options can
>> improve performance, and then see cache=volatile in the help output, try
>> it, and then start using it because they observe a performance improvement.
>>
>> That's not being stupid.  I think it's a reasonable expectation for a
>> user to have that their data is safe.
>>      
> You seem to think that the user is too stupid to allow him to use this
> option even if he's perfectly aware what it's doing. It's a useful
> option if it's used right.
>    

No, that's not what I said.  I'm saying we need to try hard to make a 
user aware of what they're doing.

If it spit out a warning on stdio, I wouldn't think a compile option is 
needed.  Even with help output text, I'm concerned that someone is going 
to find a bad example on the internet.

cache=unsafe addresses the problem although I think it's a bit hokey.

> We need to make clear that it's dangerous when it's used in the wrong
> cases (for example by naming), but just disabling is not a solution for
> that. You don't suggest that "no sane distribution" should ship rm,
> because it's dangerous if you use it wrong, do you?
>    

You realize that quite a lot of distributions carry a patch to rm that 
prevents a user from doing rm -rf /?

Regards,

Anthony Liguori

> Kevin
>
Aurelien Jarno May 26, 2010, 2:12 p.m. UTC | #31
Anthony Liguori a écrit :
> On 05/26/2010 03:52 AM, Aurelien Jarno wrote:
>> On Tue, May 25, 2010 at 08:31:20PM -0500, Anthony Liguori wrote:
>>    
>>> On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
>>>      
>>>> I really think this patch can be useful, in my own case when testing
>>>> debian-installer (I already cache=writeback). In short all that is about
>>>> developing and testing, as opposed to run a VM in production, can
>>>> benefit about that. This was one of the original use case of QEMU before
>>>> KVM arrived.
>>>>
>>>> Unless someone can convince me not to do it, I seriously considering
>>>> applying this patch.
>>>>        
>>> There really needs to be an indication in the --help output of what
>>> the ramifications of this option are, in the very least.  It should
>>>      
>> That's indeed something than can be done, but to avoid double standards,
>> it should also be done for other features that can lead to data
>> corruption. I am talking for example on the qcow format, which is not
>> really supported anymore.
>>    
> 
> I agree.
> 
>>> also be removable via a ./configure option because no sane
>>> distribution should enable this for end users.
>>>
>>>      
>> I totally disagree. All the examples I have given apply to qemu *users*,
>> not qemu developers. They surely don't want to recompile qemu for their
>> usage. Note also that what is proposed in this patch was the default not
>> too long ago, and that a lot of users complained about the new default
>> for their usage, they see it as a regression. We even had to put a note
>> explaining that in the Debian package to avoid to many bug reports.
>> cache=writeback only answer partially to this use case.
>>    
> 
> It's hard for me to consider this a performance regression because 
> ultimately, you're getting greater than bare metal performance (because 
> of extremely aggressive caching).  It might be a regression from the 
> previous performance, but that was at the cost of safety.

For people who don't care about safety it's still a regression. And it
is a common usage of QEMU.

> We might get 100 bug reports about this "regression" but they concern 
> much less than 1 bug report of image corruption because of power 
> failure/host crash.  A reputation of being unsafe is very difficult to 
> get rid of and is something that I hear concerns about frequently.
>
> I'm not suggesting that the compile option should be disabled by default 
> upstream.  But the option should be there for distributions because I 
> hope that any enterprise distro disables it.
> 

Which basically means those distros don't care about some use cases of
QEMU, which for most of them were the original use cases. It's sad.

Sometimes I really wish that KVM had never tried to reintegrate code into
QEMU; it doesn't bring only good things.
Aurelien Jarno May 26, 2010, 2:13 p.m. UTC | #32
Kevin Wolf a écrit :
> Am 26.05.2010 15:42, schrieb Anthony Liguori:
>> On 05/26/2010 03:43 AM, Kevin Wolf wrote:
>>> Am 26.05.2010 03:31, schrieb Anthony Liguori:
>>>    
>>>> On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
>>>>      
>>>>> I really think this patch can be useful, in my own case when testing
>>>>> debian-installer (I already cache=writeback). In short all that is about
>>>>> developing and testing, as opposed to run a VM in production, can
>>>>> benefit about that. This was one of the original use case of QEMU before
>>>>> KVM arrived.
>>>>>
>>>>> Unless someone can convince me not to do it, I seriously considering
>>>>> applying this patch.
>>>>>
>>>>>        
>>>> There really needs to be an indication in the --help output of what the
>>>> ramifications of this option are, in the very least.  It should also be
>>>> removable via a ./configure option because no sane distribution should
>>>> enable this for end users.
>>>>      
>>> We know better what you stupid user want?
>> What percentage of qemu users do you think have actually read qemu-doc.texi?
> 
> As I said, put the warning in the option name like cache=unsafe or
> something even more scary and I'm all for it.
> 

I am also fine with an option like this; it sounds like the way to go.
Anthony Liguori May 26, 2010, 2:19 p.m. UTC | #33
On 05/26/2010 09:12 AM, Aurelien Jarno wrote:
>> It's hard for me to consider this a performance regression because
>> ultimately, you're getting greater than bare metal performance (because
>> of extremely aggressive caching).  It might be a regression from the
>> previous performance, but that was at the cost of safety.
>>      
> For people who don't care about safety it's still a regression. And it
> is a common usage of QEMU.
>    

It's not a functional change.  It's a change in performance.  There are 
tons of changes in performance characteristics of qemu from version to 
version.  It's not even a massive one.

>> We might get 100 bug reports about this "regression" but they concern
>> much less than 1 bug report of image corruption because of power
>> failure/host crash.  A reputation of being unsafe is very difficult to
>> get rid of and is something that I hear concerns about frequently.
>>
>> I'm not suggesting that the compile option should be disabled by default
>> upstream.  But the option should be there for distributions because I
>> hope that any enterprise distro disables it.
>>
>>      
> Which basically means those distro don't care about some use cases of
> QEMU, that were for most of them the original uses cases. It's sad.
>    

This isn't a feature.  This is a change in performance.  No one is prevented 
from satisfying their use case by this behavior.

> Sometimes I really whishes that KVM never tried to reintegrate code into
> QEMU, it doesn't bring only good things.
>    

I highly doubt that this is even visible on benchmarks without using 
KVM.  The improvement on a microbenchmark was relatively small and the 
cost from TCG would almost certainly dwarf it.

Also, remember that before KVM, we had single-threaded IO and posix-aio 
(which is still single-threaded).  If KVM had never happened, block 
performance would be far, far worse than it is today with cache=writeback.

Regards,

Anthony Liguori
Kevin Wolf May 26, 2010, 2:26 p.m. UTC | #34
Am 26.05.2010 16:08, schrieb Anthony Liguori:
> On 05/26/2010 09:03 AM, Kevin Wolf wrote:
>> Am 26.05.2010 15:42, schrieb Anthony Liguori:
>>    
>>> On 05/26/2010 03:43 AM, Kevin Wolf wrote:
>>>      
>>>> Am 26.05.2010 03:31, schrieb Anthony Liguori:
>>>>
>>>>        
>>>>> On 05/25/2010 04:01 PM, Aurelien Jarno wrote:
>>>>>
>>>>>          
>>>>>> I really think this patch can be useful, in my own case when testing
>>>>>> debian-installer (I already cache=writeback). In short all that is about
>>>>>> developing and testing, as opposed to run a VM in production, can
>>>>>> benefit about that. This was one of the original use case of QEMU before
>>>>>> KVM arrived.
>>>>>>
>>>>>> Unless someone can convince me not to do it, I seriously considering
>>>>>> applying this patch.
>>>>>>
>>>>>>
>>>>>>            
>>>>> There really needs to be an indication in the --help output of what the
>>>>> ramifications of this option are, in the very least.  It should also be
>>>>> removable via a ./configure option because no sane distribution should
>>>>> enable this for end users.
>>>>>
>>>>>          
>>>> We know better what you stupid user want?
>>>>        
>>> What percentage of qemu users do you think have actually read qemu-doc.texi?
>>>      
>> As I said, put the warning in the option name like cache=unsafe or
>> something even more scary and I'm all for it.
>>
>>    
>>> It's not a stretch for someone to have heard that cache options can
>>> improve performance, and then see cache=volatile in the help output, try
>>> it, and then start using it because they observe a performance improvement.
>>>
>>> That's not being stupid.  I think it's a reasonable expectation for a
>>> user to have that their data is safe.
>>>      
>> You seem to think that the user is too stupid to allow him to use this
>> option even if he's perfectly aware what it's doing. It's a useful
>> option if it's used right.
>>    
> 
> No, that's not what I said.  I'm saying we need to try hard to make a 
> user aware of what they're doing.
> 
> If it spit out a warning on stdio, I wouldn't think a compile option is 
> needed.  Even with help output text, I'm concerned that someone is going 
> to find a bad example on the internet.
> 
> cache=unsafe addresses the problem although I think it's a bit hokey.

Then let's do it this way. I'm not opposed to a stdio message either,
even though I don't think it's really necessary with a name like
cache=unsafe. I'm just saying that disabling the option is not a solution
because it prevents valid use.

>> We need to make clear that it's dangerous when it's used in the wrong
>> cases (for example by naming), but just disabling is not a solution for
>> that. You don't suggest that "no sane distribution" should ship rm,
>> because it's dangerous if you use it wrong, do you?
>>    
> 
> You realize that quite a lot of distributions carry a patch to rm that 
> prevents a user from doing rm -rf /?

Most rm invocations that I regretted later were not rm -rf /. Actually,
I think rm -rf / is not among them at all. ;-)

And I seem to remember that even these rm patches still allow the
protection to be overridden by some force flag. But I've never tried it out.

Kevin
Paolo Bonzini May 26, 2010, 2:27 p.m. UTC | #35
On 05/26/2010 03:48 PM, Anthony Liguori wrote:
> We might get 100 bug reports about this "regression" but they concern
> much less than 1 bug report of image corruption because of power
> failure/host crash. A reputation of being unsafe is very difficult to
> get rid of and is something that I hear concerns about frequently.

True, but how many people will use cache=volatile?  Nobody is going to 
make it the default.  If a blog post saying "hey look, cache=volatile 
will speed up your virtual machine" appears and gets so much momentum that 
people start using it and lose data because of it (which is highly 
hypothetical and unlikely), QEMU developers are not the ones to be blamed.

> I'm not suggesting that the compile option should be disabled by default
> upstream. But the option should be there for distributions because I
> hope that any enterprise distro disables it.

Actually it's perfectly possible that they will _enable_ it if a 
configure option is required to enable cache=volatile.  RHEL, for example, 
doesn't support running qemu directly at all, only via libvirt.  If 
libvirt doesn't pass cache=volatile to qemu, they're safe.

(Well, virt-install uses libvirt, so it couldn't use cache=volatile 
either, so I admit it's not a great example).

Paolo
Avi Kivity May 26, 2010, 3:13 p.m. UTC | #36
On 05/26/2010 04:50 PM, Anthony Liguori wrote:
>> In fact, btrfs is currently unusable for virt because O_SYNC writes 
>> inflate a guest write into host writes by a huge factor (50x-100x).  
>> cache=writethrough is 100% unusable, cache=writeback is barely 
>> tolerable.  As of 2.6.32, cache=volatile is probably required to get 
>> something resembling reasonable performance on btrfs.
>>
>> Of course, we expect that btrfs will improve in time, but still it 
>> doesn't seem to be fsync friendly.
>
>
> So you're suggesting that anyone who uses virt on btrfs should be 
> prepared to deal with data corruption on host failure? 

No.

> That sounds to me like btrfs isn't ready for real workloads.

The btrfs developers aren't saying anything different.  But people still 
want to try it out (to wire up snapshotting to management, for example).
Aurelien Jarno May 26, 2010, 3:40 p.m. UTC | #37
Anthony Liguori a écrit :
> On 05/26/2010 09:12 AM, Aurelien Jarno wrote:
>>> It's hard for me to consider this a performance regression because
>>> ultimately, you're getting greater than bare metal performance (because
>>> of extremely aggressive caching).  It might be a regression from the
>>> previous performance, but that was at the cost of safety.
>>>      
>> For people who don't care about safety it's still a regression. And it
>> is a common usage of QEMU.
>>    
> 
> It's not a functional change.  It's a change in performance.  There are 
> tons of changes in performance characteristics of qemu from version to 
> version.  It's not even a massive one.
> 
>>> We might get 100 bug reports about this "regression" but they concern
>>> much less than 1 bug report of image corruption because of power
>>> failure/host crash.  A reputation of being unsafe is very difficult to
>>> get rid of and is something that I hear concerns about frequently.
>>>
>>> I'm not suggesting that the compile option should be disabled by default
>>> upstream.  But the option should be there for distributions because I
>>> hope that any enterprise distro disables it.
>>>
>>>      
>> Which basically means those distro don't care about some use cases of
>> QEMU, that were for most of them the original uses cases. It's sad.
>>    
> 
> This isn't a feature.  This is a change in performance.  No one is not 
> able to satisfy their use case from this behavior.
> 
>> Sometimes I really whishes that KVM never tried to reintegrate code into
>> QEMU, it doesn't bring only good things.
>>    
> 
> I highly doubt that this is even visible on benchmarks without using 
> KVM.  The improvement on a microbenchmark was relatively small and the 
> cost from TCG would almost certainly dwarf it.

It is clearly visible. Before, fsync() was not used, and it happens that
this syscall can be very expensive on ext3 with not-so-old kernels (i.e. a
few seconds, especially with some other I/O load on the system). A Google
search for "firefox fsync" will give you a few pointers.
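
As an illustration (not from this thread; the file name and write size are
arbitrary), a small standalone test like the following makes the cost of a
single fsync() on such a setup directly measurable:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/time.h>

int main(void)
{
    char buf[4096];
    struct timeval t0, t1;
    int fd = open("fsync-test.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    memset(buf, 0, sizeof(buf));
    if (write(fd, buf, sizeof(buf)) < 0) {   /* dirty one page */
        perror("write");
        return 1;
    }

    gettimeofday(&t0, NULL);
    fsync(fd);          /* on ext3 in ordered mode this may drag out the whole journal */
    gettimeofday(&t1, NULL);

    printf("fsync() took %ld us\n",
           (t1.tv_sec - t0.tv_sec) * 1000000L + (t1.tv_usec - t0.tv_usec));
    close(fd);
    unlink("fsync-test.tmp");
    return 0;
}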

> Also, remember before KVM, we had single threaded IO and posix-aio 
> (which is still single threaded).  If KVM never happened, block 
> performance would be far, far worse than it is today with cache=writeback.
> 

The io thread is not enabled by default in QEMU.
Anthony Liguori May 26, 2010, 4:11 p.m. UTC | #38
On 05/26/2010 10:40 AM, Aurelien Jarno wrote:
>> I highly doubt that this is even visible on benchmarks without using
>> KVM.  The improvement on a microbenchmark was relatively small and the
>> cost from TCG would almost certainly dwarf it.
>>      
> It is something clearly visible. Before fsync() was not used, and it
> happens this syscall can be very expensive (ie a few seconds, especially
> with some other i/o load on the system) on ext3 with not so old kernels.
> A google search for "firefox fsync" will give you a few pointers.
>    

I'm well aware.  I've asked multiple times for benchmark data with ext3 
because of this issue.  So far, it hasn't been noticable.

>> Also, remember before KVM, we had single threaded IO and posix-aio
>> (which is still single threaded).  If KVM never happened, block
>> performance would be far, far worse than it is today with cache=writeback.
>>
>>      
> io thread is not enable by default in QEMU.
>    

But the thread pool based aio implementation is always enabled 
regardless of io thread.  I was going to build and run 0.9.1 to do a 
benchmark of TCG comparing that to the latest git, but I don't have an 
easy way to get a version of gcc-3.  I've forgotten how annoying that 
used to be :-)

I would be very surprised if TCG + IDE had any observable performance 
difference pre/post the ide flush implementation.  I'm almost inclined 
to suggest that flush controls should be a qdev property versus a 
drive property, since that's really where the change happened.
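
To make the distinction concrete, here is a very rough sketch of the
per-device variant (a fragment; all names below are invented for
illustration and this is not what the patch does):

/* hypothetical device-level knob instead of the per-drive BDRV_O_NO_FLUSH */
typedef struct FlushPolicy {
    int ignore_flush;    /* imagined as a qdev property on the disk device */
} FlushPolicy;

static void device_flush(FlushPolicy *policy, BlockDriverState *bs)
{
    if (policy->ignore_flush) {
        return;          /* complete the guest's flush command without flushing */
    }
    bdrv_flush(bs);
}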

Regards,

Anthony Liguori
diff mbox

Patch

diff --git a/block.c b/block.c
index 48305b7..b742965 100644
--- a/block.c
+++ b/block.c
@@ -50,6 +50,8 @@  static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
         BlockDriverCompletionFunc *cb, void *opaque);
 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
         BlockDriverCompletionFunc *cb, void *opaque);
+static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque);
 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
                         uint8_t *buf, int nb_sectors);
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
@@ -1306,6 +1308,10 @@  const char *bdrv_get_device_name(BlockDriverState *bs)
 
 void bdrv_flush(BlockDriverState *bs)
 {
+    if (bs->open_flags & BDRV_O_NO_FLUSH) {
+        return;
+    }
+
     if (bs->drv && bs->drv->bdrv_flush)
         bs->drv->bdrv_flush(bs);
 }
@@ -2082,6 +2088,10 @@  BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
 {
     BlockDriver *drv = bs->drv;
 
+    if (bs->open_flags & BDRV_O_NO_FLUSH) {
+        return bdrv_aio_noop_em(bs, cb, opaque);
+    }
+
     if (!drv)
         return NULL;
     return drv->bdrv_aio_flush(bs, cb, opaque);
@@ -2196,6 +2206,24 @@  static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
     return &acb->common;
 }
 
+static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriverAIOCBSync *acb;
+
+    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
+    acb->is_write = 1; /* don't bounce in the completion handler */
+    acb->qiov = NULL;
+    acb->bounce = NULL;
+    acb->ret = 0;
+
+    if (!acb->bh)
+        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
+
+    qemu_bh_schedule(acb->bh);
+    return &acb->common;
+}
+
 /**************************************************************/
 /* sync block device emulation */
 
diff --git a/block.h b/block.h
index f87d24e..8032b6b 100644
--- a/block.h
+++ b/block.h
@@ -33,6 +33,7 @@  typedef struct QEMUSnapshotInfo {
 #define BDRV_O_CACHE_WB    0x0040 /* use write-back caching */
 #define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool */
 #define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
+#define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB)
 
diff --git a/qemu-config.c b/qemu-config.c
index d500885..bf3d493 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -54,7 +54,7 @@  QemuOptsList qemu_drive_opts = {
         },{
             .name = "cache",
             .type = QEMU_OPT_STRING,
-            .help = "host cache usage (none, writeback, writethrough)",
+            .help = "host cache usage (none, writeback, writethrough, volatile)",
         },{
             .name = "aio",
             .type = QEMU_OPT_STRING,
diff --git a/qemu-options.hx b/qemu-options.hx
index 12f6b51..6dedb4a 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -118,8 +118,9 @@  ETEXI
 DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
     "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
-    "       [,cache=writethrough|writeback|none][,format=f][,serial=s]\n"
-    "       [,addr=A][,id=name][,aio=threads|native][,readonly=on|off]\n"
+    "       [,cache=writethrough|writeback|volatile|none][,format=f]\n"
+    "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
+    "       [,readonly=on|off]\n"
     "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
 STEXI
 @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
@@ -148,7 +149,7 @@  These options have the same definition as they have in @option{-hdachs}.
 @item snapshot=@var{snapshot}
 @var{snapshot} is "on" or "off" and allows to enable snapshot for given drive (see @option{-snapshot}).
 @item cache=@var{cache}
-@var{cache} is "none", "writeback", or "writethrough" and controls how the host cache is used to access block data.
+@var{cache} is "none", "writeback", "volatile", or "writethrough" and controls how the host cache is used to access block data.
 @item aio=@var{aio}
 @var{aio} is "threads", or "native" and selects between pthread based disk I/O and native Linux AIO.
 @item format=@var{format}
@@ -180,6 +181,12 @@  Some block drivers perform badly with @option{cache=writethrough}, most notably,
 qcow2.  If performance is more important than correctness,
 @option{cache=writeback} should be used with qcow2.
 
+In case you don't care about data integrity over host failures, use
+cache=volatile. This option tells qemu that it never needs to write any data
+to the disk but can instead keep things in cache. If anything goes wrong,
+like your host losing power or the disk storage getting disconnected
+accidentally, your image will most probably be rendered unusable.
+
 Instead of @option{-cdrom} you can use:
 @example
 qemu -drive file=file,index=2,media=cdrom
diff --git a/vl.c b/vl.c
index 85bcc84..c8abce6 100644
--- a/vl.c
+++ b/vl.c
@@ -913,6 +913,9 @@  DriveInfo *drive_init(QemuOpts *opts, void *opaque,
             bdrv_flags |= BDRV_O_NOCACHE;
         } else if (!strcmp(buf, "writeback")) {
             bdrv_flags |= BDRV_O_CACHE_WB;
+        } else if (!strcmp(buf, "volatile")) {
+            bdrv_flags |= BDRV_O_CACHE_WB;
+            bdrv_flags |= BDRV_O_NO_FLUSH;
         } else if (!strcmp(buf, "writethrough")) {
             /* this is the default */
         } else {