diff mbox series

block/file-posix: Optimize for macOS

Message ID 20210219085148.90545-1-akihiko.odaki@gmail.com
State New
Headers show
Series block/file-posix: Optimize for macOS | expand

Commit Message

Akihiko Odaki Feb. 19, 2021, 8:51 a.m. UTC
This commit introduces "punch hole" operation and optimizes transfer
block size for macOS.

This commit introduces two additional members,
discard_granularity and opt_io to BlockSizes type in
include/block/block.h. Also, the members of the type are now
optional. Set -1 to discard_granularity and 0 to other members
for the default values.

Signed-off-by: Akihiko Odaki <akihiko.odaki@gmail.com>
---
 block/file-posix.c    | 40 ++++++++++++++++++++++++++++++++++++++--
 block/nvme.c          |  2 ++
 hw/block/block.c      | 12 ++++++++++--
 include/block/block.h |  2 ++
 4 files changed, 52 insertions(+), 4 deletions(-)

Comments

no-reply@patchew.org Feb. 19, 2021, 8:57 a.m. UTC | #1
Patchew URL: https://patchew.org/QEMU/20210219085148.90545-1-akihiko.odaki@gmail.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20210219085148.90545-1-akihiko.odaki@gmail.com
Subject: [PATCH] block/file-posix: Optimize for macOS

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag]         patchew/20210219085148.90545-1-akihiko.odaki@gmail.com -> patchew/20210219085148.90545-1-akihiko.odaki@gmail.com
Switched to a new branch 'test'
455b4a3 block/file-posix: Optimize for macOS

=== OUTPUT BEGIN ===
WARNING: architecture specific defines should be avoided
#90: FILE: block/file-posix.c:2133:
+#if defined(__APPLE__) && (__MACH__)

ERROR: suspect code indent for conditional statements (8, 11)
#141: FILE: hw/block/block.c:73:
+        if (!backend_ret && blocksizes.phys) {
            conf->physical_block_size = blocksizes.phys;

total: 1 errors, 1 warnings, 129 lines checked

Commit 455b4a328821 (block/file-posix: Optimize for macOS) has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20210219085148.90545-1-akihiko.odaki@gmail.com/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-devel@redhat.com
Stefan Hajnoczi Feb. 24, 2021, 4:27 p.m. UTC | #2
On Fri, Feb 19, 2021 at 05:51:48PM +0900, Akihiko Odaki wrote:
> This commit introduces "punch hole" operation and optimizes transfer
> block size for macOS.
> 
> This commit introduces two additional members,
> discard_granularity and opt_io to BlockSizes type in
> include/block/block.h. Also, the members of the type are now
> optional. Set -1 to discard_granularity and 0 to other members
> for the default values.

I remember BlockSizes was added specifically for s390 DASD devices.
Normally QEMU does not automatically expose details of the underlying
hardware to the guest because it breaks live migration compatibility.

If a VM is running on host A where the value happens to be 512 bytes and
is migrated to host B where the value happens to be 4KB then:

1. The value reported to the guest by the storage device will change
   unexpectedly and the guest software is probably not prepared for this
   to happen.

2. I/O requests that violate the constraint imposed by host B's value
   will suddenly start failing and the VM may no longer be operational.

I think there was an argument that DASDs are passthrough devices and the
user always knows what they are doing, so we can ignore this problem for
DASDs.

This reasoning does not apply to POSIX files on macOS hosts, so I think
we need to figure out what to do here. The easiest option is not to
merge this patch series, but if this feature is important to you then we
need to think about how to extend the block size probing to be live
migration friendly or to change the QEMU command-line to support this
use case without unexpected live migration breakage.

CCing Kevin Wolf and Markus Armbruster.

> Signed-off-by: Akihiko Odaki <akihiko.odaki@gmail.com>
> ---
>  block/file-posix.c    | 40 ++++++++++++++++++++++++++++++++++++++--
>  block/nvme.c          |  2 ++
>  hw/block/block.c      | 12 ++++++++++--
>  include/block/block.h |  2 ++
>  4 files changed, 52 insertions(+), 4 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 00cdaaa2d41..defbc04c8e7 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -44,6 +44,7 @@
>  #if defined(__APPLE__) && (__MACH__)
>  #include <paths.h>
>  #include <sys/param.h>
> +#include <sys/mount.h>
>  #include <IOKit/IOKitLib.h>
>  #include <IOKit/IOBSD.h>
>  #include <IOKit/storage/IOMediaBSDClient.h>
> @@ -1274,6 +1275,8 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
>      if (check_for_dasd(s->fd) < 0) {
>          return -ENOTSUP;
>      }
> +    bsz->opt_io = 0;
> +    bsz->discard_granularity = -1;
>      ret = probe_logical_blocksize(s->fd, &bsz->log);
>      if (ret < 0) {
>          return ret;
> @@ -1568,6 +1571,7 @@ out:
>      }
>  }
>  
> +G_GNUC_UNUSED
>  static int translate_err(int err)
>  {
>      if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
> @@ -1777,16 +1781,27 @@ static int handle_aiocb_discard(void *opaque)
>              }
>          } while (errno == EINTR);
>  
> -        ret = -errno;
> +        ret = translate_err(-errno);
>  #endif
>      } else {
>  #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
>          ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
>                             aiocb->aio_offset, aiocb->aio_nbytes);
> +        ret = translate_err(-errno);
> +#elif defined(__APPLE__) && (__MACH__)
> +        fpunchhole_t fpunchhole;
> +        fpunchhole.fp_flags = 0;
> +        fpunchhole.reserved = 0;
> +        fpunchhole.fp_offset = aiocb->aio_offset;
> +        fpunchhole.fp_length = aiocb->aio_nbytes;
> +        if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
> +            ret = errno == ENODEV ? -ENOTSUP : -errno;
> +        } else {
> +            ret = 0;
> +        }
>  #endif
>      }
>  
> -    ret = translate_err(ret);
>      if (ret == -ENOTSUP) {
>          s->has_discard = false;
>      }
> @@ -2095,6 +2110,26 @@ static int raw_co_flush_to_disk(BlockDriverState *bs)
>      return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
>  }
>  
> +static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
> +{
> +#if defined(__APPLE__) && (__MACH__)
> +    BDRVRawState *s = bs->opaque;
> +    struct statfs buf;
> +
> +    if (!fstatfs(s->fd, &buf)) {
> +        bsz->phys = 0;
> +        bsz->log = 0;
> +        bsz->opt_io = buf.f_iosize;
> +        bsz->discard_granularity = buf.f_bsize;
> +        return 0;
> +    }
> +
> +    return -errno;
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
>  static void raw_aio_attach_aio_context(BlockDriverState *bs,
>                                         AioContext *new_context)
>  {
> @@ -3229,6 +3264,7 @@ BlockDriver bdrv_file = {
>      .bdrv_refresh_limits = raw_refresh_limits,
>      .bdrv_io_plug = raw_aio_plug,
>      .bdrv_io_unplug = raw_aio_unplug,
> +    .bdrv_probe_blocksizes = raw_probe_blocksizes,
>      .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>  
>      .bdrv_co_truncate = raw_co_truncate,
> diff --git a/block/nvme.c b/block/nvme.c
> index 5a6fbacf4a5..6d84bbdb632 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -983,6 +983,8 @@ static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
>      uint32_t blocksize = nvme_get_blocksize(bs);
>      bsz->phys = blocksize;
>      bsz->log = blocksize;
> +    bsz->opt_io = 0;
> +    bsz->discard_granularity = -1;
>      return 0;
>  }
>  
> diff --git a/hw/block/block.c b/hw/block/block.c
> index 1e34573da71..c907e5a7722 100644
> --- a/hw/block/block.c
> +++ b/hw/block/block.c
> @@ -70,19 +70,27 @@ bool blkconf_blocksizes(BlockConf *conf, Error **errp)
>      backend_ret = blk_probe_blocksizes(blk, &blocksizes);
>      /* fill in detected values if they are not defined via qemu command line */
>      if (!conf->physical_block_size) {
> -        if (!backend_ret) {
> +        if (!backend_ret && blocksizes.phys) {
>             conf->physical_block_size = blocksizes.phys;
>          } else {
>              conf->physical_block_size = BDRV_SECTOR_SIZE;
>          }
>      }
>      if (!conf->logical_block_size) {
> -        if (!backend_ret) {
> +        if (!backend_ret && blocksizes.log) {
>              conf->logical_block_size = blocksizes.log;
>          } else {
>              conf->logical_block_size = BDRV_SECTOR_SIZE;
>          }
>      }
> +    if (!backend_ret) {
> +        if (!conf->opt_io_size) {
> +            conf->opt_io_size = blocksizes.opt_io;
> +        }
> +        if (conf->discard_granularity == -1) {
> +            conf->discard_granularity = blocksizes.discard_granularity;
> +        }
> +    }
>  
>      if (conf->logical_block_size > conf->physical_block_size) {
>          error_setg(errp,
> diff --git a/include/block/block.h b/include/block/block.h
> index a193545b6ab..31bf3acacad 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -91,6 +91,8 @@ typedef enum {
>  typedef struct BlockSizes {
>      uint32_t phys;
>      uint32_t log;
> +    uint32_t discard_granularity;
> +    uint32_t opt_io;
>  } BlockSizes;
>  
>  typedef struct HDGeometry {
> -- 
> 2.24.3 (Apple Git-128)
>
Kevin Wolf Feb. 24, 2021, 5:30 p.m. UTC | #3
Am 24.02.2021 um 17:27 hat Stefan Hajnoczi geschrieben:
> On Fri, Feb 19, 2021 at 05:51:48PM +0900, Akihiko Odaki wrote:
> > This commit introduces "punch hole" operation and optimizes transfer
> > block size for macOS.
> > 
> > This commit introduces two additional members,
> > discard_granularity and opt_io to BlockSizes type in
> > include/block/block.h. Also, the members of the type are now
> > optional. Set -1 to discard_granularity and 0 to other members
> > for the default values.
> 
> I remember BlockSizes was added specifically for s390 DASD devices.
> Normally QEMU does not automatically expose details of the underlying
> hardware to the guest because it breaks live migration compatibility.
> 
> If a VM is running on host A where the value happens to be 512 bytes and
> is migrated to host B where the value happens to be 4KB then:
> 
> 1. The value reported to the guest by the storage device will change
>    unexpectedly and the guest software is probably not prepared for this
>    to happen.
> 
> 2. I/O requests that violate the constraint imposed by host B's value
>    will suddenly start failing and the VM may no longer be operational.
> 
> I think there was an argument that DASDs are passthrough devices and the
> user always knows what they are doing, so we can ignore this problem for
> DASDs.
> 
> This reasoning does not apply to POSIX files on macOS hosts, so I think
> we need to figure out what to do here. The easiest option is not to
> merge this patch series, but if this feature is important to you then we
> need to think about how to extend the block size probing to be live
> migration friendly or to change the QEMU command-line to support this
> use case without unexpected live migration breakage.

Dave actually made a good point on IRC: Even if we change live migration
so that it doesn't break when we move to a host where different defaults
are autodetected (we could do this by including these values in the
migration stream and letting that override what the user specific on the
command line), it still means that the guest visible device would change
after the next reboot.

The same can happen without live migration, either by copying the image
to a different host, or by changing the hardware of the host.

I'm not sure how critical such changes are now, but I seem to remember
that in the past, one big reason to avoid them was that Windows guests
would require reactivation after a few changes.

(Also adding Peter to CC as the libvirt representative who, I suspect,
might not like the idea of autodetecting by default very much :-))

Kevin
Akihiko Odaki Feb. 25, 2021, 2:09 a.m. UTC | #4
2021年2月25日(木) 2:31 Kevin Wolf <kwolf@redhat.com>:
>
> Am 24.02.2021 um 17:27 hat Stefan Hajnoczi geschrieben:
> > On Fri, Feb 19, 2021 at 05:51:48PM +0900, Akihiko Odaki wrote:
> > > This commit introduces "punch hole" operation and optimizes transfer
> > > block size for macOS.
> > >
> > > This commit introduces two additional members,
> > > discard_granularity and opt_io to BlockSizes type in
> > > include/block/block.h. Also, the members of the type are now
> > > optional. Set -1 to discard_granularity and 0 to other members
> > > for the default values.
> >
> > I remember BlockSizes was added specifically for s390 DASD devices.
> > Normally QEMU does not automatically expose details of the underlying
> > hardware to the guest because it breaks live migration compatibility.
> >
> > If a VM is running on host A where the value happens to be 512 bytes and
> > is migrated to host B where the value happens to be 4KB then:
> >
> > 1. The value reported to the guest by the storage device will change
> >    unexpectedly and the guest software is probably not prepared for this
> >    to happen.
> >
> > 2. I/O requests that violate the constraint imposed by host B's value
> >    will suddenly start failing and the VM may no longer be operational.
> >
> > I think there was an argument that DASDs are passthrough devices and the
> > user always knows what they are doing, so we can ignore this problem for
> > DASDs.
> >
> > This reasoning does not apply to POSIX files on macOS hosts, so I think
> > we need to figure out what to do here. The easiest option is not to
> > merge this patch series, but if this feature is important to you then we
> > need to think about how to extend the block size probing to be live
> > migration friendly or to change the QEMU command-line to support this
> > use case without unexpected live migration breakage.
>
> Dave actually made a good point on IRC: Even if we change live migration
> so that it doesn't break when we move to a host where different defaults
> are autodetected (we could do this by including these values in the
> migration stream and letting that override what the user specific on the
> command line), it still means that the guest visible device would change
> after the next reboot.
>
> The same can happen without live migration, either by copying the image
> to a different host, or by changing the hardware of the host.
>
> I'm not sure how critical such changes are now, but I seem to remember
> that in the past, one big reason to avoid them was that Windows guests
> would require reactivation after a few changes.
>
> (Also adding Peter to CC as the libvirt representative who, I suspect,
> might not like the idea of autodetecting by default very much :-))
>
> Kevin

The "copy" concern perhaps also applies to the host device backend,
which already has some auto detections. When the physical backend
device fails, a user may create a live snapshot, "dd" it to another
disk, and resume with the new disk, which can have different block
sizes.

I wonder if it is worthwhile to have an option to disable any kind of
autodetection depending on the host, including those of the host
device backend and maybe of subsystems other than block devices.

Regards,
Akihiko Odaki
diff mbox series

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index 00cdaaa2d41..defbc04c8e7 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -44,6 +44,7 @@ 
 #if defined(__APPLE__) && (__MACH__)
 #include <paths.h>
 #include <sys/param.h>
+#include <sys/mount.h>
 #include <IOKit/IOKitLib.h>
 #include <IOKit/IOBSD.h>
 #include <IOKit/storage/IOMediaBSDClient.h>
@@ -1274,6 +1275,8 @@  static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
     if (check_for_dasd(s->fd) < 0) {
         return -ENOTSUP;
     }
+    bsz->opt_io = 0;
+    bsz->discard_granularity = -1;
     ret = probe_logical_blocksize(s->fd, &bsz->log);
     if (ret < 0) {
         return ret;
@@ -1568,6 +1571,7 @@  out:
     }
 }
 
+G_GNUC_UNUSED
 static int translate_err(int err)
 {
     if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
@@ -1777,16 +1781,27 @@  static int handle_aiocb_discard(void *opaque)
             }
         } while (errno == EINTR);
 
-        ret = -errno;
+        ret = translate_err(-errno);
 #endif
     } else {
 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
         ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                            aiocb->aio_offset, aiocb->aio_nbytes);
+        ret = translate_err(-errno);
+#elif defined(__APPLE__) && (__MACH__)
+        fpunchhole_t fpunchhole;
+        fpunchhole.fp_flags = 0;
+        fpunchhole.reserved = 0;
+        fpunchhole.fp_offset = aiocb->aio_offset;
+        fpunchhole.fp_length = aiocb->aio_nbytes;
+        if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
+            ret = errno == ENODEV ? -ENOTSUP : -errno;
+        } else {
+            ret = 0;
+        }
 #endif
     }
 
-    ret = translate_err(ret);
     if (ret == -ENOTSUP) {
         s->has_discard = false;
     }
@@ -2095,6 +2110,26 @@  static int raw_co_flush_to_disk(BlockDriverState *bs)
     return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
 }
 
+static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
+{
+#if defined(__APPLE__) && (__MACH__)
+    BDRVRawState *s = bs->opaque;
+    struct statfs buf;
+
+    if (!fstatfs(s->fd, &buf)) {
+        bsz->phys = 0;
+        bsz->log = 0;
+        bsz->opt_io = buf.f_iosize;
+        bsz->discard_granularity = buf.f_bsize;
+        return 0;
+    }
+
+    return -errno;
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static void raw_aio_attach_aio_context(BlockDriverState *bs,
                                        AioContext *new_context)
 {
@@ -3229,6 +3264,7 @@  BlockDriver bdrv_file = {
     .bdrv_refresh_limits = raw_refresh_limits,
     .bdrv_io_plug = raw_aio_plug,
     .bdrv_io_unplug = raw_aio_unplug,
+    .bdrv_probe_blocksizes = raw_probe_blocksizes,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate = raw_co_truncate,
diff --git a/block/nvme.c b/block/nvme.c
index 5a6fbacf4a5..6d84bbdb632 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -983,6 +983,8 @@  static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
     uint32_t blocksize = nvme_get_blocksize(bs);
     bsz->phys = blocksize;
     bsz->log = blocksize;
+    bsz->opt_io = 0;
+    bsz->discard_granularity = -1;
     return 0;
 }
 
diff --git a/hw/block/block.c b/hw/block/block.c
index 1e34573da71..c907e5a7722 100644
--- a/hw/block/block.c
+++ b/hw/block/block.c
@@ -70,19 +70,27 @@  bool blkconf_blocksizes(BlockConf *conf, Error **errp)
     backend_ret = blk_probe_blocksizes(blk, &blocksizes);
     /* fill in detected values if they are not defined via qemu command line */
     if (!conf->physical_block_size) {
-        if (!backend_ret) {
+        if (!backend_ret && blocksizes.phys) {
            conf->physical_block_size = blocksizes.phys;
         } else {
             conf->physical_block_size = BDRV_SECTOR_SIZE;
         }
     }
     if (!conf->logical_block_size) {
-        if (!backend_ret) {
+        if (!backend_ret && blocksizes.log) {
             conf->logical_block_size = blocksizes.log;
         } else {
             conf->logical_block_size = BDRV_SECTOR_SIZE;
         }
     }
+    if (!backend_ret) {
+        if (!conf->opt_io_size) {
+            conf->opt_io_size = blocksizes.opt_io;
+        }
+        if (conf->discard_granularity == -1) {
+            conf->discard_granularity = blocksizes.discard_granularity;
+        }
+    }
 
     if (conf->logical_block_size > conf->physical_block_size) {
         error_setg(errp,
diff --git a/include/block/block.h b/include/block/block.h
index a193545b6ab..31bf3acacad 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -91,6 +91,8 @@  typedef enum {
 typedef struct BlockSizes {
     uint32_t phys;
     uint32_t log;
+    uint32_t discard_granularity;
+    uint32_t opt_io;
 } BlockSizes;
 
 typedef struct HDGeometry {