[v2,1/1] raw-posix.c - use max transfer length / max segement count only for SCSI passthrough
diff mbox series

Message ID 20190704124342.7753-2-mlevitsk@redhat.com
State New
Headers show
Series
  • Don't obey the kernel block device max transfer len / max segments for raw block devices
Related show

Commit Message

Maxim Levitsky July 4, 2019, 12:43 p.m. UTC
Regular kernel block devices (/dev/sda*, /dev/nvme*, etc) don't have
max segment size/max segment count hardware requirements exposed
to the userspace, but rather the kernel block layer
takes care to split the incoming requests that
violate these requirements.

Allowing the kernel to do the splitting allows qemu to avoid
various overheads that arise otherwise from this.

This is especially visible in nbd server,
exposing as a raw file, a mostly empty qcow2 image over the net.
In this case most of the reads by the remote user
won't even hit the underlying kernel block device,
and therefore most of the  overhead will be in the
nbd traffic which increases significantly with lower max transfer size.

In addition to that even for local block device
access the peformance improves a bit due to less
traffic between qemu and the kernel when large
transfer sizes are used (e.g for image conversion)

More info can be found at:
https://bugzilla.redhat.com/show_bug.cgi?id=1647104

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 block/file-posix.c | 54 ++++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

Comments

Maxim Levitsky July 10, 2019, 1:43 p.m. UTC | #1
On Thu, 2019-07-04 at 15:43 +0300, Maxim Levitsky wrote:
> Regular kernel block devices (/dev/sda*, /dev/nvme*, etc) don't have
> max segment size/max segment count hardware requirements exposed
> to the userspace, but rather the kernel block layer
> takes care to split the incoming requests that
> violate these requirements.
> 
> Allowing the kernel to do the splitting allows qemu to avoid
> various overheads that arise otherwise from this.
> 
> This is especially visible in nbd server,
> exposing as a raw file, a mostly empty qcow2 image over the net.
> In this case most of the reads by the remote user
> won't even hit the underlying kernel block device,
> and therefore most of the  overhead will be in the
> nbd traffic which increases significantly with lower max transfer size.
> 
> In addition to that even for local block device
> access the peformance improves a bit due to less
> traffic between qemu and the kernel when large
> transfer sizes are used (e.g for image conversion)
> 
> More info can be found at:
> https://bugzilla.redhat.com/show_bug.cgi?id=1647104
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> Reviewed-by: Eric Blake <eblake@redhat.com>
> ---
>  block/file-posix.c | 54 ++++++++++++++++++++++++----------------------
>  1 file changed, 28 insertions(+), 26 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index ab05b51a66..4479cc7ab4 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1038,15 +1038,13 @@ static void raw_reopen_abort(BDRVReopenState *state)
>      s->reopen_state = NULL;
>  }
>  
> -static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
> +static int sg_get_max_transfer_length(int fd)
>  {
>  #ifdef BLKSECTGET
>      int max_bytes = 0;
> -    short max_sectors = 0;
> -    if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
> +
> +    if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
>          return max_bytes;
> -    } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
> -        return max_sectors << BDRV_SECTOR_BITS;
>      } else {
>          return -errno;
>      }
> @@ -1055,25 +1053,31 @@ static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
>  #endif
>  }
>  
> -static int hdev_get_max_segments(const struct stat *st)
> +static int sg_get_max_segments(int fd)
>  {
>  #ifdef CONFIG_LINUX
>      char buf[32];
>      const char *end;
> -    char *sysfspath;
> +    char *sysfspath = NULL;
>      int ret;
> -    int fd = -1;
> +    int sysfd = -1;
>      long max_segments;
> +    struct stat st;
> +
> +    if (fstat(fd, &st)) {
> +        ret = -errno;
> +        goto out;
> +    }
>  
>      sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> -                                major(st->st_rdev), minor(st->st_rdev));
> -    fd = open(sysfspath, O_RDONLY);
> -    if (fd == -1) {
> +                                major(st.st_rdev), minor(st.st_rdev));
> +    sysfd = open(sysfspath, O_RDONLY);
> +    if (sysfd == -1) {
>          ret = -errno;
>          goto out;
>      }
>      do {
> -        ret = read(fd, buf, sizeof(buf) - 1);
> +        ret = read(sysfd, buf, sizeof(buf) - 1);
>      } while (ret == -1 && errno == EINTR);
>      if (ret < 0) {
>          ret = -errno;
> @@ -1090,8 +1094,8 @@ static int hdev_get_max_segments(const struct stat *st)
>      }
>  
>  out:
> -    if (fd != -1) {
> -        close(fd);
> +    if (sysfd != -1) {
> +        close(sysfd);
>      }
>      g_free(sysfspath);
>      return ret;
> @@ -1103,19 +1107,17 @@ out:
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      BDRVRawState *s = bs->opaque;
> -    struct stat st;
>  
> -    if (!fstat(s->fd, &st)) {
> -        if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
> -            int ret = hdev_get_max_transfer_length(bs, s->fd);
> -            if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
> -                bs->bl.max_transfer = pow2floor(ret);
> -            }
> -            ret = hdev_get_max_segments(&st);
> -            if (ret > 0) {
> -                bs->bl.max_transfer = MIN(bs->bl.max_transfer,
> -                                          ret * getpagesize());
> -            }
> +    if (bs->sg) {
> +        int ret = sg_get_max_transfer_length(s->fd);
> +
> +        if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
> +            bs->bl.max_transfer = pow2floor(ret);
> +        }
> +
> +        ret = sg_get_max_segments(s->fd);
> +        if (ret > 0) {
> +            bs->bl.max_transfer = MIN(bs->bl.max_transfer, ret * getpagesize());
>          }
>      }
>  


Ping.

Best regards,
	Maxim Levitsky

Patch
diff mbox series

diff --git a/block/file-posix.c b/block/file-posix.c
index ab05b51a66..4479cc7ab4 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1038,15 +1038,13 @@  static void raw_reopen_abort(BDRVReopenState *state)
     s->reopen_state = NULL;
 }
 
-static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
+static int sg_get_max_transfer_length(int fd)
 {
 #ifdef BLKSECTGET
     int max_bytes = 0;
-    short max_sectors = 0;
-    if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
+
+    if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
         return max_bytes;
-    } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
-        return max_sectors << BDRV_SECTOR_BITS;
     } else {
         return -errno;
     }
@@ -1055,25 +1053,31 @@  static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
 #endif
 }
 
-static int hdev_get_max_segments(const struct stat *st)
+static int sg_get_max_segments(int fd)
 {
 #ifdef CONFIG_LINUX
     char buf[32];
     const char *end;
-    char *sysfspath;
+    char *sysfspath = NULL;
     int ret;
-    int fd = -1;
+    int sysfd = -1;
     long max_segments;
+    struct stat st;
+
+    if (fstat(fd, &st)) {
+        ret = -errno;
+        goto out;
+    }
 
     sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-                                major(st->st_rdev), minor(st->st_rdev));
-    fd = open(sysfspath, O_RDONLY);
-    if (fd == -1) {
+                                major(st.st_rdev), minor(st.st_rdev));
+    sysfd = open(sysfspath, O_RDONLY);
+    if (sysfd == -1) {
         ret = -errno;
         goto out;
     }
     do {
-        ret = read(fd, buf, sizeof(buf) - 1);
+        ret = read(sysfd, buf, sizeof(buf) - 1);
     } while (ret == -1 && errno == EINTR);
     if (ret < 0) {
         ret = -errno;
@@ -1090,8 +1094,8 @@  static int hdev_get_max_segments(const struct stat *st)
     }
 
 out:
-    if (fd != -1) {
-        close(fd);
+    if (sysfd != -1) {
+        close(sysfd);
     }
     g_free(sysfspath);
     return ret;
@@ -1103,19 +1107,17 @@  out:
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
-    struct stat st;
 
-    if (!fstat(s->fd, &st)) {
-        if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
-            int ret = hdev_get_max_transfer_length(bs, s->fd);
-            if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
-                bs->bl.max_transfer = pow2floor(ret);
-            }
-            ret = hdev_get_max_segments(&st);
-            if (ret > 0) {
-                bs->bl.max_transfer = MIN(bs->bl.max_transfer,
-                                          ret * getpagesize());
-            }
+    if (bs->sg) {
+        int ret = sg_get_max_transfer_length(s->fd);
+
+        if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
+            bs->bl.max_transfer = pow2floor(ret);
+        }
+
+        ret = sg_get_max_segments(s->fd);
+        if (ret > 0) {
+            bs->bl.max_transfer = MIN(bs->bl.max_transfer, ret * getpagesize());
         }
     }