diff mbox

[1.1,19/22] block: implement is_allocated for raw

Message ID 1336488722-13120-20-git-send-email-pbonzini@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini May 8, 2012, 2:51 p.m. UTC
Either FIEMAP, or SEEK_DATA+SEEK_HOLE can be used to implement the
is_allocated callback for raw files.  Ext4, btrfs and XFS all support
it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/raw-posix.c |  102 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/raw.c       |    8 +++++
 2 files changed, 110 insertions(+)

Comments

Kevin Wolf May 9, 2012, 1:40 p.m. UTC | #1
Am 08.05.2012 16:51, schrieb Paolo Bonzini:
> Either FIEMAP, or SEEK_DATA+SEEK_HOLE can be used to implement the
> is_allocated callback for raw files.  Ext4, btrfs and XFS all support
> it.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

This is for 1.2, I'll queue it in block-next.

> ---
>  block/raw-posix.c |  102 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/raw.c       |    8 +++++
>  2 files changed, 110 insertions(+)
> 
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 03fcfcc..6753c73 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -52,6 +52,10 @@
>  #include <sys/param.h>
>  #include <linux/cdrom.h>
>  #include <linux/fd.h>
> +#include <linux/fs.h>
> +#endif
> +#ifdef CONFIG_FIEMAP
> +#include <linux/fiemap.h>
>  #endif
>  #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
>  #include <sys/disk.h>
> @@ -104,6 +108,13 @@
>  #define O_DIRECT O_DSYNC
>  #endif
>  
> +#ifndef SEEK_DATA
> +#define SEEK_DATA 3
> +#endif
> +#ifndef SEEK_HOLE
> +#define SEEK_HOLE 4
> +#endif

How is that going to be portable? You assume that on non-Linux you'll
get -EINVAL, but what guarantees that 3 or 4 aren't already used for
the standard SEEK_* constants or for a different non-standard extension?

Kevin
Paolo Bonzini May 9, 2012, 2:05 p.m. UTC | #2
Il 09/05/2012 15:40, Kevin Wolf ha scritto:
>> > +#ifndef SEEK_DATA
>> > +#define SEEK_DATA 3
>> > +#endif
>> > +#ifndef SEEK_HOLE
>> > +#define SEEK_HOLE 4
>> > +#endif
> How is that going to be portable? You assume that on non-Linux you'll
> get -EINVAL, but what does guarantee that 3 or 4 aren't already used for
> the standard SEEK_* constants or for a different non-standard extension?

While SEEK_* is not guaranteed by POSIX to be 0/1/2, the values are so
old that there may still exist programs that hard-code them
(similar to O_RDONLY/O_WRONLY/O_RDWR, though probably not any other O_*
constant).  It would be quite unwise to define them to something else.
Even MS-DOS reused the values!

AFAIK this is the only extension of lseek that's ever been added.  It
was done on Solaris first and then in Linux and the BSDs.  It used 3/4
there too, see for example http://bugs.python.org/msg119551 (Solaris)
and http://mail-index.netbsd.org/tech-kern/2011/08/17/msg011231.html
(NetBSD).

Paolo
Kevin Wolf May 9, 2012, 2:10 p.m. UTC | #3
Am 09.05.2012 16:05, schrieb Paolo Bonzini:
> Il 09/05/2012 15:40, Kevin Wolf ha scritto:
>>>> +#ifndef SEEK_DATA
>>>> +#define SEEK_DATA 3
>>>> +#endif
>>>> +#ifndef SEEK_HOLE
>>>> +#define SEEK_HOLE 4
>>>> +#endif
>> How is that going to be portable? You assume that on non-Linux you'll
>> get -EINVAL, but what does guarantee that 3 or 4 aren't already used for
>> the standard SEEK_* constants or for a different non-standard extension?
> 
> While SEEK_* is not guaranteed by POSIX to be 0/1/2, the values is so
> old that there may still exist programs that hard-code the values
> (similar to O_RDONLY/O_WRONLY/O_RDWR, though probably not any other O_*
> constant).  It would be quite unwise to define them to something else.
> Even MS-DOS reused the values!
> 
> AFAIK this is the only extension of lseek that's ever been added.  It
> was done on Solaris first and then in Linux and the BSDs.  It used 3/4
> there too, see for example http://bugs.python.org/msg119551 (Solaris)
> and http://mail-index.netbsd.org/tech-kern/2011/08/17/msg011231.html
> (NetBSD).

Why not simply #ifdef the whole code out and fall back to the current
"everything is allocated" behaviour when SEEK_DATA/HOLE aren't defined?

Kevin
Paolo Bonzini May 9, 2012, 2:24 p.m. UTC | #4
Il 09/05/2012 16:10, Kevin Wolf ha scritto:
> > While SEEK_* is not guaranteed by POSIX to be 0/1/2, the values is so
> > old that there may still exist programs that hard-code the values
> > (similar to O_RDONLY/O_WRONLY/O_RDWR, though probably not any other O_*
> > constant).  It would be quite unwise to define them to something else.
> > Even MS-DOS reused the values!
> > 
> > AFAIK this is the only extension of lseek that's ever been added.  It
> > was done on Solaris first and then in Linux and the BSDs.  It used 3/4
> > there too, see for example http://bugs.python.org/msg119551 (Solaris)
> > and http://mail-index.netbsd.org/tech-kern/2011/08/17/msg011231.html
> > (NetBSD).
>
> Why not simply #ifdef the whole code out and fall back to the current
> "everything is allocated" behaviour when SEEK_DATA/HOLE aren't defined?

That would be okay too of course.

When I wrote it, Google Code still existed, so I must have taken the
idiom from somewhere.  Now it doesn't anymore, so I cannot check, and
it's okay to play it safe.  The code is a little bit uglier though.

Paolo
diff mbox

Patch

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 03fcfcc..6753c73 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -52,6 +52,10 @@ 
 #include <sys/param.h>
 #include <linux/cdrom.h>
 #include <linux/fd.h>
+#include <linux/fs.h>
+#endif
+#ifdef CONFIG_FIEMAP
+#include <linux/fiemap.h>
 #endif
 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
 #include <sys/disk.h>
@@ -104,6 +108,13 @@ 
 #define O_DIRECT O_DSYNC
 #endif
 
+#ifndef SEEK_DATA
+#define SEEK_DATA 3
+#endif
+#ifndef SEEK_HOLE
+#define SEEK_HOLE 4
+#endif
+
 #define FTYPE_FILE   0
 #define FTYPE_CD     1
 #define FTYPE_FD     2
@@ -583,6 +594,96 @@  static int raw_create(const char *filename, QEMUOptionParameter *options)
     return result;
 }
 
+/*
+ * Returns 1 if the sector 'sector_num' is allocated in the image file, 0 if
+ * it lies in a hole, or the negative value from fd_open() if that fails.  If
+ * the extent query itself fails (FIEMAP ioctl, or lseek with an error other
+ * than ENXIO), the whole range is conservatively reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image 'pnum' is set to 0; the
+ * SEEK_HOLE path then also returns 0.
+ *
+ * 'pnum' is set to the number of sectors, starting at 'sector_num', that are
+ * known to be in the same allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ */
+static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
+                                            int64_t sector_num,
+                                            int nb_sectors, int *pnum)
+{
+    BDRVRawState *s = bs->opaque;
+    off_t start, data, hole;
+    int ret;
+
+    ret = fd_open(bs);
+    if (ret < 0) {
+        return ret;
+    }
+
+    start = sector_num * BDRV_SECTOR_SIZE;
+#ifdef CONFIG_FIEMAP
+    struct {
+        struct fiemap fm;
+        struct fiemap_extent fe;
+    } f;
+    f.fm.fm_start = start;
+    f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
+    f.fm.fm_flags = 0;
+    f.fm.fm_extent_count = 1;
+    f.fm.fm_reserved = 0;
+    if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
+        /* Assume everything is allocated.  */
+        *pnum = nb_sectors;
+        return 1;
+    }
+
+    if (f.fm.fm_mapped_extents == 0) {
+        /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length,
+         * which must be clamped to the file size!  */
+        off_t length = lseek(s->fd, 0, SEEK_END);
+        hole = f.fm.fm_start;
+        data = MIN(f.fm.fm_start + f.fm.fm_length, length);
+    } else {
+        data = f.fe.fe_logical;
+        hole = f.fe.fe_logical + f.fe.fe_length;
+    }
+#else
+    hole = lseek(s->fd, start, SEEK_HOLE);
+    if (hole == -1) {
+        if (errno == ENXIO) {
+            /* sector_num is past the end of the file: a virtual hole.  */
+            *pnum = 0;
+            return 0;
+        }
+        /* Most likely EINVAL.  Assume everything is allocated.  */
+        *pnum = nb_sectors;
+        return 1;
+    }
+
+    if (hole > start) {
+        data = start;
+    } else {
+        /* On a hole.  We need another syscall to find its end.  */
+        data = lseek(s->fd, start, SEEK_DATA);
+        if (data == -1) {
+            data = lseek(s->fd, 0, SEEK_END);
+        }
+    }
+#endif
+
+    if (data <= start) {
+        /* On a data extent, compute sectors to the end of the extent.  */
+        *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
+        return 1;
+    } else {
+        /* On a hole, compute sectors to the beginning of the next extent.  */
+        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
+        return 0;
+    }
+}
+
 #ifdef CONFIG_XFS
 static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors)
 {
@@ -634,6 +735,7 @@  static BlockDriver bdrv_file = {
     .bdrv_close = raw_close,
     .bdrv_create = raw_create,
     .bdrv_co_discard = raw_co_discard,
+    .bdrv_co_is_allocated = raw_co_is_allocated,
 
     .bdrv_aio_readv = raw_aio_readv,
     .bdrv_aio_writev = raw_aio_writev,
diff --git a/block/raw.c b/block/raw.c
index 7086e31..09d9b48 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -25,6 +25,13 @@  static void raw_close(BlockDriverState *bs)
 {
 }
 
+/* Forward the allocation query unchanged to the underlying bs->file.  */
+static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, int *pnum)
+{
+    return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum);
+}
+
 static int64_t raw_getlength(BlockDriverState *bs)
 {
     return bdrv_getlength(bs->file);
@@ -108,6 +115,7 @@  static BlockDriver bdrv_raw = {
 
     .bdrv_co_readv          = raw_co_readv,
     .bdrv_co_writev         = raw_co_writev,
+    .bdrv_co_is_allocated   = raw_co_is_allocated,
     .bdrv_co_discard        = raw_co_discard,
 
     .bdrv_probe         = raw_probe,