[v2,5/6] block/linux-aio: convert to blk_io_plug_call() API

Message ID: 20230523171300.132347-6-stefanha@redhat.com
State: New
Series: block: add blk_io_plug_call() API

Commit Message

Stefan Hajnoczi May 23, 2023, 5:12 p.m. UTC
Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 include/block/raw-aio.h |  7 -------
 block/file-posix.c      | 28 ----------------------------
 block/linux-aio.c       | 41 +++++++++++------------------------------
 3 files changed, 11 insertions(+), 65 deletions(-)

Comments

Stefano Garzarella May 24, 2023, 8:52 a.m. UTC | #1
On Tue, May 23, 2023 at 01:12:59PM -0400, Stefan Hajnoczi wrote:
>Stop using the .bdrv_co_io_plug() API because it is not multi-queue
>block layer friendly. Use the new blk_io_plug_call() API to batch I/O
>submission instead.
>
>Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
>Reviewed-by: Eric Blake <eblake@redhat.com>
>---
>[...]
>@@ -354,31 +353,11 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
>     return max_batch;
> }
>
>-void laio_io_plug(void)
>+static void laio_unplug_fn(void *opaque)
> {
>-    AioContext *ctx = qemu_get_current_aio_context();
>-    LinuxAioState *s = aio_get_linux_aio(ctx);
>+    LinuxAioState *s = opaque;
>
>-    s->io_q.plugged++;
>-}
>-
>-void laio_io_unplug(uint64_t dev_max_batch)
>-{
>-    AioContext *ctx = qemu_get_current_aio_context();
>-    LinuxAioState *s = aio_get_linux_aio(ctx);
>-
>-    assert(s->io_q.plugged);
>-    s->io_q.plugged--;
>-
>-    /*
>-     * Why max batch checking is performed here:
>-     * Another BDS may have queued requests with a higher dev_max_batch and
>-     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
>-     * batch so we can honor our device's dev_max_batch.
>-     */
>-    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||

Why are we removing this condition?
Could the same situation occur with the new API?

Thanks,
Stefano

>-        (!s->io_q.plugged &&
>-         !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
>+    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
>         ioq_submit(s);
>     }
> }
>[...]
Stefan Hajnoczi May 24, 2023, 7:36 p.m. UTC | #2
On Wed, May 24, 2023 at 10:52:03AM +0200, Stefano Garzarella wrote:
> On Tue, May 23, 2023 at 01:12:59PM -0400, Stefan Hajnoczi wrote:
> > [...]
> > -    /*
> > -     * Why max batch checking is performed here:
> > -     * Another BDS may have queued requests with a higher dev_max_batch and
> > -     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
> > -     * batch so we can honor our device's dev_max_batch.
> > -     */
> > -    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
> 
> Why are we removing this condition?
> Could the same situation occur with the new API?

The semantics of unplug_fn() are different from .bdrv_co_io_unplug():
1. unplug_fn() is only called when the last blk_io_unplug() call occurs,
   not every time blk_io_unplug() is called.
2. unplug_fn() is per-thread, not per-BlockDriverState, so there is no
   way to get per-BlockDriverState fields like dev_max_batch.

Therefore this condition cannot be moved to laio_unplug_fn().
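
A minimal sketch of these semantics (an illustration based on the API
added earlier in this series, not the actual block/plug.c code):

    /* Per-thread plug state; fn/opaque pairs are deduplicated. */
    typedef struct {
        void (*fn)(void *);
        void *opaque;
    } UnplugFn;

    static __thread unsigned plug_count;
    static __thread UnplugFn deferred[16]; /* illustrative fixed size */
    static __thread unsigned num_deferred;

    void blk_io_plug(void)
    {
        plug_count++;
    }

    void blk_io_plug_call(void (*fn)(void *), void *opaque)
    {
        if (plug_count == 0) {
            fn(opaque); /* not inside a plugged section: call now */
            return;
        }
        for (unsigned i = 0; i < num_deferred; i++) {
            if (deferred[i].fn == fn && deferred[i].opaque == opaque) {
                return; /* already queued for this unplug */
            }
        }
        deferred[num_deferred++] = (UnplugFn){ fn, opaque };
    }

    void blk_io_unplug(void)
    {
        if (--plug_count == 0) {
            /* Outermost unplug: run each deferred callback once */
            for (unsigned i = 0; i < num_deferred; i++) {
                deferred[i].fn(deferred[i].opaque);
            }
            num_deferred = 0;
        }
    }

Because the deferred list is thread-local and keyed only by fn/opaque,
laio_unplug_fn() runs at most once per plugged section and has no
per-BlockDriverState context to read dev_max_batch from.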

How important is this condition? I believe that dropping it does not
have much of an effect but maybe I missed something.

Also, does it make sense to define per-BlockDriverState batching limits
when the AIO engine (Linux AIO or io_uring) is thread-local and shared
between all BlockDriverStates? I believe the fundamental reason (that we
discovered later) why dev_max_batch is effective is that the Linux
kernel processes 32 I/O request submissions at a time. Anything above 32
adds latency without a batching benefit.

Stefan
Stefano Garzarella May 29, 2023, 8:50 a.m. UTC | #3
On Wed, May 24, 2023 at 03:36:34PM -0400, Stefan Hajnoczi wrote:
>On Wed, May 24, 2023 at 10:52:03AM +0200, Stefano Garzarella wrote:
>> On Tue, May 23, 2023 at 01:12:59PM -0400, Stefan Hajnoczi wrote:
>> > [...]
>> > -    /*
>> > -     * Why max batch checking is performed here:
>> > -     * Another BDS may have queued requests with a higher dev_max_batch and
>> > -     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
>> > -     * batch so we can honor our device's dev_max_batch.
>> > -     */
>> > -    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
>>
>> Why are we removing this condition?
>> Could the same situation occur with the new API?
>
>The semantics of unplug_fn() are different from .bdrv_co_io_unplug():
>1. unplug_fn() is only called when the last blk_io_unplug() call occurs,
>   not every time blk_io_unplug() is called.
>2. unplug_fn() is per-thread, not per-BlockDriverState, so there is no
>   way to get per-BlockDriverState fields like dev_max_batch.
>
>Therefore this condition cannot be moved to laio_unplug_fn().

I see now.

>
>How important is this condition? I believe that dropping it does not
>have much of an effect but maybe I missed something.

Kevin and I agreed to add it to avoid extra latency in some devices,
but we didn't do much testing on this.

IIRC what solved the performance degradation was the check in
laio_do_submit() that we still have after this change.

So it may not have much effect, but maybe it's worth mentioning in
the commit description.

>
>Also, does it make sense to define per-BlockDriverState batching limits
>when the AIO engine (Linux AIO or io_uring) is thread-local and shared
>between all BlockDriverStates? I believe the fundamental reason (that we
>discovered later) why dev_max_batch is effective is that the Linux
>kernel processes 32 I/O request submissions at a time. Anything above 32
>adds latency without a batching benefit.

This is a good point; maybe we should confirm it with some tests, though.

Thanks,
Stefano
Stefan Hajnoczi May 30, 2023, 5:11 p.m. UTC | #4
On Mon, May 29, 2023 at 10:50:34AM +0200, Stefano Garzarella wrote:
> On Wed, May 24, 2023 at 03:36:34PM -0400, Stefan Hajnoczi wrote:
> > On Wed, May 24, 2023 at 10:52:03AM +0200, Stefano Garzarella wrote:
> > > On Tue, May 23, 2023 at 01:12:59PM -0400, Stefan Hajnoczi wrote:
> > > > [...]
> > > > -    /*
> > > > -     * Why max batch checking is performed here:
> > > > -     * Another BDS may have queued requests with a higher dev_max_batch and
> > > > -     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
> > > > -     * batch so we can honor our device's dev_max_batch.
> > > > -     */
> > > > -    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
> > > 
> > > Why are we removing this condition?
> > > Could the same situation occur with the new API?
> > 
> > The semantics of unplug_fn() are different from .bdrv_co_io_unplug():
> > 1. unplug_fn() is only called when the last blk_io_unplug() call occurs,
> >   not every time blk_io_unplug() is called.
> > 2. unplug_fn() is per-thread, not per-BlockDriverState, so there is no
> >   way to get per-BlockDriverState fields like dev_max_batch.
> > 
> > Therefore this condition cannot be moved to laio_unplug_fn().
> 
> I see now.
> 
> > 
> > How important is this condition? I believe that dropping it does not
> > have much of an effect but maybe I missed something.
> 
> Kevin and I agreed to add it to avoid extra latency in some devices,
> but we didn't do much testing on this.
> 
> IIRC what solved the performance degradation was the check in
> laio_do_submit() that we still have after this change.
> 
> So it may not have much effect, but maybe it's worth mentioning in
> the commit description.

I'll update the commit description.

> > 
> > Also, does it make sense to define per-BlockDriverState batching limits
> > when the AIO engine (Linux AIO or io_uring) is thread-local and shared
> > between all BlockDriverStates? I believe the fundamental reason (that we
> > discovered later) why dev_max_batch is effective is that the Linux
> > kernel processes 32 I/O request submissions at a time. Anything above 32
> > adds latency without a batching benefit.
> 
> This is a good point; maybe we should confirm it with some tests, though.

Yes, I would benchmark it. Also, switching to per-thread max_batch
involves a command-line interface change and we still need to keep
aio-max-batch for compatibility for some time, so it's not urgent.

Stefan
Patch

diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index da60ca13ef..0f63c2800c 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -62,13 +62,6 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
 
 void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
-
-/*
- * laio_io_plug/unplug work in the thread's current AioContext, therefore the
- * caller must ensure that they are paired in the same IOThread.
- */
-void laio_io_plug(void);
-void laio_io_unplug(uint64_t dev_max_batch);
 #endif
 /* io_uring.c - Linux io_uring implementation */
 #ifdef CONFIG_LINUX_IO_URING
diff --git a/block/file-posix.c b/block/file-posix.c
index 7baa8491dd..ac1ed54811 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2550,26 +2550,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
 }
 
-static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
-{
-    BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-    if (s->use_linux_aio) {
-        laio_io_plug();
-    }
-#endif
-}
-
-static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
-{
-    BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-    if (s->use_linux_aio) {
-        laio_io_unplug(s->aio_max_batch);
-    }
-#endif
-}
-
 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
@@ -3914,8 +3894,6 @@ BlockDriver bdrv_file = {
     .bdrv_co_copy_range_from = raw_co_copy_range_from,
     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
     .bdrv_refresh_limits = raw_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate                   = raw_co_truncate,
@@ -4286,8 +4264,6 @@ static BlockDriver bdrv_host_device = {
     .bdrv_co_copy_range_from = raw_co_copy_range_from,
     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
     .bdrv_refresh_limits = raw_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate                   = raw_co_truncate,
@@ -4424,8 +4400,6 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_co_pwritev        = raw_co_pwritev,
     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
     .bdrv_refresh_limits    = cdrom_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate                   = raw_co_truncate,
@@ -4552,8 +4526,6 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_co_pwritev        = raw_co_pwritev,
     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
     .bdrv_refresh_limits    = cdrom_refresh_limits,
-    .bdrv_co_io_plug        = raw_co_io_plug,
-    .bdrv_co_io_unplug      = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate                   = raw_co_truncate,
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 442c86209b..5021aed68f 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -15,6 +15,7 @@
 #include "qemu/event_notifier.h"
 #include "qemu/coroutine.h"
 #include "qapi/error.h"
+#include "sysemu/block-backend.h"
 
 /* Only used for assertions.  */
 #include "qemu/coroutine_int.h"
@@ -46,7 +47,6 @@ struct qemu_laiocb {
 };
 
 typedef struct {
-    int plugged;
     unsigned int in_queue;
     unsigned int in_flight;
     bool blocked;
@@ -236,7 +236,7 @@ static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
 {
     qemu_laio_process_completions(s);
 
-    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+    if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
         ioq_submit(s);
     }
 }
@@ -277,7 +277,6 @@ static void qemu_laio_poll_ready(EventNotifier *opaque)
 static void ioq_init(LaioQueue *io_q)
 {
     QSIMPLEQ_INIT(&io_q->pending);
-    io_q->plugged = 0;
     io_q->in_queue = 0;
     io_q->in_flight = 0;
     io_q->blocked = false;
@@ -354,31 +353,11 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
     return max_batch;
 }
 
-void laio_io_plug(void)
+static void laio_unplug_fn(void *opaque)
 {
-    AioContext *ctx = qemu_get_current_aio_context();
-    LinuxAioState *s = aio_get_linux_aio(ctx);
+    LinuxAioState *s = opaque;
 
-    s->io_q.plugged++;
-}
-
-void laio_io_unplug(uint64_t dev_max_batch)
-{
-    AioContext *ctx = qemu_get_current_aio_context();
-    LinuxAioState *s = aio_get_linux_aio(ctx);
-
-    assert(s->io_q.plugged);
-    s->io_q.plugged--;
-
-    /*
-     * Why max batch checking is performed here:
-     * Another BDS may have queued requests with a higher dev_max_batch and
-     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
-     * batch so we can honor our device's dev_max_batch.
-     */
-    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
-        (!s->io_q.plugged &&
-         !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
+    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
         ioq_submit(s);
     }
 }
@@ -410,10 +389,12 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
 
     QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
     s->io_q.in_queue++;
-    if (!s->io_q.blocked &&
-        (!s->io_q.plugged ||
-         s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
-        ioq_submit(s);
+    if (!s->io_q.blocked) {
+        if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
+            ioq_submit(s);
+        } else {
+            blk_io_plug_call(laio_unplug_fn, s);
+        }
     }
 
     return 0;
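
For context, a hypothetical caller of the converted code path (e.g. an
emulated storage device's request handler) would batch like this;
blk_io_plug()/blk_io_unplug() come from this series, while the loop and
helper names below are illustrative:

    /* Hypothetical caller: batch several requests per plugged section. */
    blk_io_plug();
    for (int i = 0; i < nreqs; i++) {
        /*
         * Each request reaches laio_do_submit(), which appends it to
         * s->io_q.pending and registers laio_unplug_fn() via
         * blk_io_plug_call() instead of submitting immediately, unless
         * in_queue has already reached the laio_max_batch() limit.
         */
        submit_one_request(&reqs[i]); /* hypothetical helper */
    }
    blk_io_unplug(); /* outermost unplug runs laio_unplug_fn() once,
                      * which calls ioq_submit() for the whole batch */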