diff mbox series

[3/3] block/io_uring: resubmit short buffered reads

Message ID 20190715201950.9444-4-stefanha@redhat.com
State New
Headers show
Series block/io_uring: fix EINTR and resubmit short reads | expand

Commit Message

Stefan Hajnoczi July 15, 2019, 8:19 p.m. UTC
The io_uring API had unusual read behavior up until recently, where
short reads could occur when the start of the file range was in the page
cache and a later portion was not in the page cache.  Normally read(2)
does not expose this detail to applications and this behavior has been
fixed in Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async
context").

In the meantime Linux distros have shipped kernels where io_uring
exhibits the old behavior and there is no simple way to detect it.

Add a slow path for resubmitting short read requests.  The idea is
simple: shorten the iovecs and increment the file offset each time a
short read occurs and then resubmit the request.  The implementation
requires adding additional fields to LuringAIOCB to keep track of where
we were.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io_uring.c   | 75 +++++++++++++++++++++++++++++++++++++++-------
 block/trace-events |  3 +-
 2 files changed, 67 insertions(+), 11 deletions(-)

Comments

Philippe Mathieu-Daudé July 16, 2019, 7:15 a.m. UTC | #1
On 7/15/19 10:19 PM, Stefan Hajnoczi wrote:
> The io_uring API had unusual read behavior up until recently, where
> short reads could occur when the start of the file range was in the page
> cache and a later portion was not in the page cache.  Normally read(2)
> does not expose this detail to applications and this behavior has been
> fixed in Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async
> context").
> 
> In the meantime Linux distros have shipped kernels where io_uring
> exhibits the old behavior and there is no simple way to detect it.
> 
> Add a slow path for resubmitting short read requests.  The idea is
> simple: shorten the iovecs and increment the file offset each time a
> short read occurs and then resubmit the request.  The implementation
> requires adding additional fields to LuringAIOCB to keep track of where
> we were.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  block/io_uring.c   | 75 +++++++++++++++++++++++++++++++++++++++-------
>  block/trace-events |  3 +-
>  2 files changed, 67 insertions(+), 11 deletions(-)
> 
> diff --git a/block/io_uring.c b/block/io_uring.c
> index 97e4f876d7..12cef71175 100644
> --- a/block/io_uring.c
> +++ b/block/io_uring.c
> @@ -28,6 +28,12 @@ typedef struct LuringAIOCB {
>      QEMUIOVector *qiov;
>      bool is_read;
>      QSIMPLEQ_ENTRY(LuringAIOCB) next;
> +
> +    /* Buffered reads may require resubmission, see
> +     * luring_resubmit_short_read().
> +     */
> +    int total_read;
> +    QEMUIOVector resubmit_qiov;
>  } LuringAIOCB;
>  
>  typedef struct LuringQueue {
> @@ -99,6 +105,43 @@ static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
>      s->io_q.in_queue++;
>  }
>  
> +/* Before Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async
> + * context") a buffered I/O request with the start of the file range in the
> + * page cache could result in a short read.  Applications need to resubmit the
> + * remaining read request.
> + *
> + * This is a slow path but recent kernels never take it.
> + */
> +static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
> +                                       int nread)
> +{
> +    QEMUIOVector *resubmit_qiov;
> +    size_t remaining;
> +
> +    trace_luring_resubmit_short_read(s, luringcb, nread);
> +
> +    /* Update read position */
> +    luringcb->total_read += nread;
> +    remaining = luringcb->qiov->size - luringcb->total_read;
> +
> +    /* Shorten qiov */
> +    resubmit_qiov = &luringcb->resubmit_qiov;
> +    if (resubmit_qiov->iov == NULL) {
> +        qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov);
> +    } else {
> +        qemu_iovec_reset(resubmit_qiov);
> +    }
> +    qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read,
> +                      remaining);
> +
> +    /* Update sqe */
> +    luringcb->sqeq.off += nread;
> +    luringcb->sqeq.addr = (__u64)(uintptr_t)luringcb->resubmit_qiov.iov;
> +    luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
> +
> +    luring_resubmit(s, luringcb);
> +}
> +
>  /**
>   * luring_process_completions:
>   * @s: AIO state
> @@ -135,6 +178,7 @@ static void luring_process_completions(LuringState *s)
>      while (io_uring_peek_cqe(&s->ring, &cqes) == 0) {
>          LuringAIOCB *luringcb;
>          int ret;
> +        int total_bytes;
>  
>          if (!cqes) {
>              break;
> @@ -150,25 +194,36 @@ static void luring_process_completions(LuringState *s)
>  
>          trace_luring_process_completion(s, luringcb, ret);
>  
> -        if (ret == luringcb->qiov->size) {
> +        /* total_read is non-zero only for resubmitted read requests */
> +        total_bytes = ret + luringcb->total_read;
> +
> +        if (ret < 0) {
> +            if (ret == -EINTR) {
> +                luring_resubmit(s, luringcb);
> +                continue;
> +            }

Else fail with ret = -errno. OK.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>

> +        } else if (total_bytes == luringcb->qiov->size) {
>              ret = 0;
> -        } else if (ret >= 0) {
> +        } else {
>              /* Short Read/Write */
>              if (luringcb->is_read) {
> -                /* Read, pad with zeroes */
> -                qemu_iovec_memset(luringcb->qiov, ret, 0,
> -                luringcb->qiov->size - ret);
> -                ret = 0;
> +                if (ret > 0) {
> +                    luring_resubmit_short_read(s, luringcb, ret);
> +                    continue;
> +                } else {
> +                    /* Pad with zeroes */
> +                    qemu_iovec_memset(luringcb->qiov, total_bytes, 0,
> +                                      luringcb->qiov->size - total_bytes);
> +                    ret = 0;
> +                }
>              } else {
>                  ret = -ENOSPC;;
>              }
> -        /* Add to overflow queue to be resubmitted later */
> -        } else if (ret == -EINTR) {
> -            luring_resubmit(s, luringcb);
> -            continue;
>          }
>          luringcb->ret = ret;
>  
> +        qemu_iovec_destroy(&luringcb->resubmit_qiov);
> +
>          /*
>           * If the coroutine is already entered it must be in ioq_submit()
>           * and will notice luringcb->ret has been filled in when it
> diff --git a/block/trace-events b/block/trace-events
> index 02952fe4cb..f434cac634 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -60,7 +60,7 @@ qmp_block_stream(void *bs) "bs %p"
>  file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
>  file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64
>  
> -#io_uring.c
> +# io_uring.c

(left over from patch #1)

>  luring_init_state(void *s, size_t size) "s %p size %zu"
>  luring_cleanup_state(void *s) "%p freed"
>  luring_io_plug(void *s) "LuringState %p plug"
> @@ -70,6 +70,7 @@ luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
>  luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
>  luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
>  luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d"
> +luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d"
>  
>  # qcow2.c
>  qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d"
>
diff mbox series

Patch

diff --git a/block/io_uring.c b/block/io_uring.c
index 97e4f876d7..12cef71175 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -28,6 +28,12 @@  typedef struct LuringAIOCB {
     QEMUIOVector *qiov;
     bool is_read;
     QSIMPLEQ_ENTRY(LuringAIOCB) next;
+
+    /* Buffered reads may require resubmission, see
+     * luring_resubmit_short_read().
+     */
+    int total_read;
+    QEMUIOVector resubmit_qiov;
 } LuringAIOCB;
 
 typedef struct LuringQueue {
@@ -99,6 +105,43 @@  static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
     s->io_q.in_queue++;
 }
 
+/* Before Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async
+ * context") a buffered I/O request with the start of the file range in the
+ * page cache could result in a short read.  Applications need to resubmit the
+ * remaining read request.
+ *
+ * This is a slow path but recent kernels never take it.
+ */
+static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
+                                       int nread)
+{
+    QEMUIOVector *resubmit_qiov;
+    size_t remaining;
+
+    trace_luring_resubmit_short_read(s, luringcb, nread);
+
+    /* Update read position */
+    luringcb->total_read += nread;
+    remaining = luringcb->qiov->size - luringcb->total_read;
+
+    /* Shorten qiov */
+    resubmit_qiov = &luringcb->resubmit_qiov;
+    if (resubmit_qiov->iov == NULL) {
+        qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov);
+    } else {
+        qemu_iovec_reset(resubmit_qiov);
+    }
+    qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read,
+                      remaining);
+
+    /* Update sqe */
+    luringcb->sqeq.off += nread;
+    luringcb->sqeq.addr = (__u64)(uintptr_t)luringcb->resubmit_qiov.iov;
+    luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
+
+    luring_resubmit(s, luringcb);
+}
+
 /**
  * luring_process_completions:
  * @s: AIO state
@@ -135,6 +178,7 @@  static void luring_process_completions(LuringState *s)
     while (io_uring_peek_cqe(&s->ring, &cqes) == 0) {
         LuringAIOCB *luringcb;
         int ret;
+        int total_bytes;
 
         if (!cqes) {
             break;
@@ -150,25 +194,36 @@  static void luring_process_completions(LuringState *s)
 
         trace_luring_process_completion(s, luringcb, ret);
 
-        if (ret == luringcb->qiov->size) {
+        /* total_read is non-zero only for resubmitted read requests */
+        total_bytes = ret + luringcb->total_read;
+
+        if (ret < 0) {
+            if (ret == -EINTR) {
+                luring_resubmit(s, luringcb);
+                continue;
+            }
+        } else if (total_bytes == luringcb->qiov->size) {
             ret = 0;
-        } else if (ret >= 0) {
+        } else {
             /* Short Read/Write */
             if (luringcb->is_read) {
-                /* Read, pad with zeroes */
-                qemu_iovec_memset(luringcb->qiov, ret, 0,
-                luringcb->qiov->size - ret);
-                ret = 0;
+                if (ret > 0) {
+                    luring_resubmit_short_read(s, luringcb, ret);
+                    continue;
+                } else {
+                    /* Pad with zeroes */
+                    qemu_iovec_memset(luringcb->qiov, total_bytes, 0,
+                                      luringcb->qiov->size - total_bytes);
+                    ret = 0;
+                }
             } else {
                 ret = -ENOSPC;;
             }
-        /* Add to overflow queue to be resubmitted later */
-        } else if (ret == -EINTR) {
-            luring_resubmit(s, luringcb);
-            continue;
         }
         luringcb->ret = ret;
 
+        qemu_iovec_destroy(&luringcb->resubmit_qiov);
+
         /*
          * If the coroutine is already entered it must be in ioq_submit()
          * and will notice luringcb->ret has been filled in when it
diff --git a/block/trace-events b/block/trace-events
index 02952fe4cb..f434cac634 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -60,7 +60,7 @@  qmp_block_stream(void *bs) "bs %p"
 file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
 file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64
 
-#io_uring.c
+# io_uring.c
 luring_init_state(void *s, size_t size) "s %p size %zu"
 luring_cleanup_state(void *s) "%p freed"
 luring_io_plug(void *s) "LuringState %p plug"
@@ -70,6 +70,7 @@  luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
 luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
 luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
 luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d"
+luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d"
 
 # qcow2.c
 qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d"