diff mbox

[1/5] block: add discard support

Message ID 20101201153514.GA6310@lst.de
State New
Headers show

Commit Message

Christoph Hellwig Dec. 1, 2010, 3:35 p.m. UTC
Add a new bdrv_discard method to free blocks in a mapping image, and a new
drive property to set the granularity for these discard.  If no discard
granularity support is set discard support is disabled.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Comments

Kevin Wolf Dec. 2, 2010, 12:12 p.m. UTC | #1
Am 01.12.2010 16:35, schrieb Christoph Hellwig:
> Add a new bdrv_discard method to free blocks in a mapping image, and a new
> drive property to set the granularity for these discard.  If no discard
> granularity support is set discard support is disabled.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Index: qemu/block.c
> ===================================================================
> --- qemu.orig/block.c	2010-11-25 16:17:32.922003704 +0100
> +++ qemu/block.c	2010-11-29 14:10:21.793255565 +0100
> @@ -1499,6 +1499,15 @@ int bdrv_has_zero_init(BlockDriverState
>      return 1;
>  }
>  
> +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
> +{
> +    if (!bs->drv)
> +        return -ENOMEDIUM;
> +    if (!bs->drv->bdrv_discard)
> +        return 0;

Missing braces.

> +    return bs->drv->bdrv_discard(bs, sector_num, nb_sectors);
> +}
> +
>  /*
>   * Returns true iff the specified sector is present in the disk image. Drivers
>   * not implementing the functionality are assumed to not support backing files,
> Index: qemu/block.h
> ===================================================================
> --- qemu.orig/block.h	2010-11-25 16:17:32.929004193 +0100
> +++ qemu/block.h	2010-11-29 14:07:00.267003145 +0100
> @@ -146,6 +146,7 @@ int bdrv_flush(BlockDriverState *bs);
>  void bdrv_flush_all(void);
>  void bdrv_close_all(void);
>  
> +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
>  int bdrv_has_zero_init(BlockDriverState *bs);
>  int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
>  	int *pnum);
> Index: qemu/block_int.h
> ===================================================================
> --- qemu.orig/block_int.h	2010-11-25 16:17:32.935003774 +0100
> +++ qemu/block_int.h	2010-11-29 14:09:31.926264704 +0100
> @@ -73,6 +73,8 @@ struct BlockDriver {
>          BlockDriverCompletionFunc *cb, void *opaque);
>      BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
>          BlockDriverCompletionFunc *cb, void *opaque);
> +    int (*bdrv_discard)(BlockDriverState *bs, int64_t sector_num,
> +                        int nb_sectors);
>  
>      int (*bdrv_aio_multiwrite)(BlockDriverState *bs, BlockRequest *reqs,
>          int num_reqs);
> @@ -227,6 +229,7 @@ typedef struct BlockConf {
>      uint16_t logical_block_size;
>      uint16_t min_io_size;
>      uint32_t opt_io_size;
> +    uint32_t discard_granularity;
>  } BlockConf;
>  
>  static inline unsigned int get_physical_block_exp(BlockConf *conf)
> @@ -249,6 +252,8 @@ static inline unsigned int get_physical_
>      DEFINE_PROP_UINT16("physical_block_size", _state,                   \
>                         _conf.physical_block_size, 512),                 \
>      DEFINE_PROP_UINT16("min_io_size", _state, _conf.min_io_size, 0),  \
> -    DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0)
> +    DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0), \
> +    DEFINE_PROP_UINT32("discard_granularity", _state, \
> +                       _conf.discard_granularity, 0)

Is there no way to get this value automatically?

At least for non-raw images (and it should be trivial to implement this
in qcow2) I guess we'll want to set it to the cluster size of the image
instead of requiring the user to set this value.

Kevin
Christoph Hellwig Dec. 10, 2010, 1:38 p.m. UTC | #2
On Thu, Dec 02, 2010 at 01:12:13PM +0100, Kevin Wolf wrote:
> >      DEFINE_PROP_UINT16("physical_block_size", _state,                   \
> >                         _conf.physical_block_size, 512),                 \
> >      DEFINE_PROP_UINT16("min_io_size", _state, _conf.min_io_size, 0),  \
> > -    DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0)
> > +    DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0), \
> > +    DEFINE_PROP_UINT32("discard_granularity", _state, \
> > +                       _conf.discard_granularity, 0)
> 
> Is there no way to get this value automatically?
> 
> At least for non-raw images (and it should be trivial to implement this
> in qcow2) I guess we'll want to set it to the cluster size of the image
> instead of requiring the user to set this value.

It's guest visible state, so it must not change due to migrations.  For
the current implementation all values for it work anyway - if it's
smaller than the block size we'll zero out the remainder of the block.
Paul Brook Dec. 11, 2010, 12:50 p.m. UTC | #3
> On Thu, Dec 02, 2010 at 01:12:13PM +0100, Kevin Wolf wrote:
> > >      DEFINE_PROP_UINT16("physical_block_size", _state,                 
> > >       \
> > >      
> > >                         _conf.physical_block_size, 512),               
> > >                          \
> > >      
> > >      DEFINE_PROP_UINT16("min_io_size", _state, _conf.min_io_size, 0), 
> > >      \
> > > 
> > > -    DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0)
> > > +    DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0), \
> > > +    DEFINE_PROP_UINT32("discard_granularity", _state, \
> > > +                       _conf.discard_granularity, 0)
> > 
> > Is there no way to get this value automatically?
> > 
> > At least for non-raw images (and it should be trivial to implement this
> > in qcow2) I guess we'll want to set it to the cluster size of the image
> > instead of requiring the user to set this value.
> 
> It's guest visible state, so it must not change due to migrations.  For
> the current implementation all values for it work anyway - if it's
> smaller than the block size we'll zero out the remainder of the block.

That sounds wrong. Surely we should leave partial blocks untouched.

Paul
Christoph Hellwig Dec. 13, 2010, 3:43 p.m. UTC | #4
On Sat, Dec 11, 2010 at 12:50:20PM +0000, Paul Brook wrote:
> > It's guest visible state, so it must not change due to migrations.  For
> > the current implementation all values for it work anyway - if it's
> > smaller than the block size we'll zero out the remainder of the block.
> 
> That sounds wrong. Surely we should leave partial blocks untouched.

While zeroing them is not required for qemu, the general semantics of
the XFS ioctl require it.  It punches a hole, which means it's makes the
new area equivalent to a hole create by truncating a file to a larger
size and then only writing at the larger offset.  The semantics for a
hole in all Unix filesystems is that we read back zeroes from them.
If we write into a sparse file at a not block aligned offset the
zeroing of the partial block also happens.
Paolo Bonzini Dec. 13, 2010, 4:07 p.m. UTC | #5
On 12/10/2010 02:38 PM, Christoph Hellwig wrote:
> if it's smaller than the block size we'll zero out the remainder of
> the block.

I think it should fail at VM startup time, or even better do nothing at all.

When you write in the middle of an absent block, and a partially-zero 
block is created, this is not visible: a read cannot see the difference 
between "all zeros because it's sparse" and "all zeros because it's zero".

If I ask you to (optionally) punch a 1kb hole but all you can do is 
punch a 2kb hole, I do care about the second kilobyte of data.  Since 
the hole punching of bdrv_discard is completely optional, it should not 
be done in this case.

Paolo
Christoph Hellwig Dec. 13, 2010, 4:15 p.m. UTC | #6
On Mon, Dec 13, 2010 at 05:07:27PM +0100, Paolo Bonzini wrote:
> On 12/10/2010 02:38 PM, Christoph Hellwig wrote:
> >if it's smaller than the block size we'll zero out the remainder of
> >the block.
> 
> I think it should fail at VM startup time, or even better do nothing at all.

What should fail?

> When you write in the middle of an absent block, and a partially-zero 
> block is created, this is not visible: a read cannot see the difference 
> between "all zeros because it's sparse" and "all zeros because it's zero".

You can not see from a VM if a block is not allocated or zeroed.  Then
again we'll never create a fully zeroed block anyway unless we get
really stupid discard patterns from the guest OS.

> If I ask you to (optionally) punch a 1kb hole but all you can do is 
> punch a 2kb hole, I do care about the second kilobyte of data.  Since 
> the hole punching of bdrv_discard is completely optional, it should not 
> be done in this case.

Of course we do not discard the second KB in that case.  If you issue
a 1k UNRSVSP ioctl on a 2k block size XFS filesystem it will zero
exactly the 1k you specified, which is required for the semantics of the
ioctl.  Yes, it's not optimal, but qemu can't easily know what block
size the underlying filesystem has.
Paul Brook Dec. 13, 2010, 4:17 p.m. UTC | #7
> On Sat, Dec 11, 2010 at 12:50:20PM +0000, Paul Brook wrote:
> > > It's guest visible state, so it must not change due to migrations.  For
> > > the current implementation all values for it work anyway - if it's
> > > smaller than the block size we'll zero out the remainder of the block.
> > 
> > That sounds wrong. Surely we should leave partial blocks untouched.
> 
> While zeroing them is not required for qemu, the general semantics of
> the XFS ioctl require it.  It punches a hole, which means it's makes the
> new area equivalent to a hole create by truncating a file to a larger
> size and then only writing at the larger offset.  The semantics for a
> hole in all Unix filesystems is that we read back zeroes from them.
> If we write into a sparse file at a not block aligned offset the
> zeroing of the partial block also happens.

Ah, so it was just inconsistent use of the term "block".  When the erase 
region includes part of a block, we zero that part of the block and leave the 
rest of the block untouched.

Paul
Paolo Bonzini Dec. 13, 2010, 4:24 p.m. UTC | #8
On 12/13/2010 05:15 PM, Christoph Hellwig wrote:
> On Mon, Dec 13, 2010 at 05:07:27PM +0100, Paolo Bonzini wrote:
>> On 12/10/2010 02:38 PM, Christoph Hellwig wrote:
>>> if it's smaller than the block size we'll zero out the remainder of
>>> the block.
>>
>> I think it should fail at VM startup time, or even better do nothing at all.
>
> What should fail?

Nothing -- you wrote "if it's smaller than the block size we'll zero out 
the remainder of the block" which I interpreted the wrong way, i.e. as 
"XFS will round up the size to the remainder of the block and zero that 
part out as well".

Thanks for the clarification.

Paolo
diff mbox

Patch

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2010-11-25 16:17:32.922003704 +0100
+++ qemu/block.c	2010-11-29 14:10:21.793255565 +0100
@@ -1499,6 +1499,15 @@  int bdrv_has_zero_init(BlockDriverState
     return 1;
 }
 
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+    if (!bs->drv)
+        return -ENOMEDIUM;
+    if (!bs->drv->bdrv_discard)
+        return 0;
+    return bs->drv->bdrv_discard(bs, sector_num, nb_sectors);
+}
+
 /*
  * Returns true iff the specified sector is present in the disk image. Drivers
  * not implementing the functionality are assumed to not support backing files,
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h	2010-11-25 16:17:32.929004193 +0100
+++ qemu/block.h	2010-11-29 14:07:00.267003145 +0100
@@ -146,6 +146,7 @@  int bdrv_flush(BlockDriverState *bs);
 void bdrv_flush_all(void);
 void bdrv_close_all(void);
 
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
 int bdrv_has_zero_init(BlockDriverState *bs);
 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 	int *pnum);
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2010-11-25 16:17:32.935003774 +0100
+++ qemu/block_int.h	2010-11-29 14:09:31.926264704 +0100
@@ -73,6 +73,8 @@  struct BlockDriver {
         BlockDriverCompletionFunc *cb, void *opaque);
     BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
         BlockDriverCompletionFunc *cb, void *opaque);
+    int (*bdrv_discard)(BlockDriverState *bs, int64_t sector_num,
+                        int nb_sectors);
 
     int (*bdrv_aio_multiwrite)(BlockDriverState *bs, BlockRequest *reqs,
         int num_reqs);
@@ -227,6 +229,7 @@  typedef struct BlockConf {
     uint16_t logical_block_size;
     uint16_t min_io_size;
     uint32_t opt_io_size;
+    uint32_t discard_granularity;
 } BlockConf;
 
 static inline unsigned int get_physical_block_exp(BlockConf *conf)
@@ -249,6 +252,8 @@  static inline unsigned int get_physical_
     DEFINE_PROP_UINT16("physical_block_size", _state,                   \
                        _conf.physical_block_size, 512),                 \
     DEFINE_PROP_UINT16("min_io_size", _state, _conf.min_io_size, 0),  \
-    DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0)
+    DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0), \
+    DEFINE_PROP_UINT32("discard_granularity", _state, \
+                       _conf.discard_granularity, 0)
 
 #endif /* BLOCK_INT_H */
Index: qemu/block/raw.c
===================================================================
--- qemu.orig/block/raw.c	2010-11-25 16:17:32.943003495 +0100
+++ qemu/block/raw.c	2010-11-29 14:07:00.278003495 +0100
@@ -65,6 +65,11 @@  static int raw_probe(const uint8_t *buf,
    return 1; /* everything can be opened as raw image */
 }
 
+static int raw_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+    return bdrv_discard(bs->file, sector_num, nb_sectors);
+}
+
 static int raw_is_inserted(BlockDriverState *bs)
 {
     return bdrv_is_inserted(bs->file);
@@ -130,6 +135,7 @@  static BlockDriver bdrv_raw = {
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
     .bdrv_aio_flush     = raw_aio_flush,
+    .bdrv_discard       = raw_discard,
 
     .bdrv_is_inserted   = raw_is_inserted,
     .bdrv_eject         = raw_eject,