
[RFC,13/13] HACK: schedule fence return on main AIO context

Message ID 20230421011223.718-14-gurchetansingh@chromium.org
State New
Series gfxstream + rutabaga_gfx: a surprising delight or startling epiphany?

Commit Message

Gurchetan Singh April 21, 2023, 1:12 a.m. UTC
Both gfxstream and cross-domain (and even newer versions of
virglrenderer: see VIRGL_RENDERER_ASYNC_FENCE_CB) like to signal
fence completion on threads ("callback threads") that are
different from the thread that processes the command queue
("main thread").

This is generally possible with locking, and this is what we do
in crosvm and other virtio-gpu 1.1 implementations.  However, on
QEMU a deadlock is observed if virtio_gpu_ctrl_response_nodata(..)
[used in the fence callback] is used from a thread that is not the
main thread.

The reason is the main thread takes the big QEMU lock (bql) somewhere
when processing the command queue, and virtio_gpu_ctrl_response_nodata(..)
needs that lock.  If you add in the lock needed to protect &g->fenceq
from concurrent access by the main thread and the callback threads,
you can end up with deadlocks.

It's possible to work around this by scheduling the return of the fence
descriptors via aio_bh_schedule_oneshot_full(..), but that somewhat
negates the rationale for the asynchronous callbacks.

I also played around with aio_context_acquire()/aio_context_release(),
but that doesn't seem to help.

Is signaling the virtio_queue outside of the main thread possible?  If
so, how?

Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
---
 hw/display/virtio-gpu-rutabaga.c | 29 ++++++++++++++++++++++++++---
 include/hw/virtio/virtio-gpu.h   |  1 +
 2 files changed, 27 insertions(+), 3 deletions(-)

Comments

Stefan Hajnoczi April 21, 2023, 3:59 p.m. UTC | #1
On Thu, 20 Apr 2023 at 21:13, Gurchetan Singh
<gurchetansingh@chromium.org> wrote:
>
> Both gfxstream and cross-domain (and even newer versions of
> virglrenderer: see VIRGL_RENDERER_ASYNC_FENCE_CB) like to signal
> fence completion on threads ("callback threads") that are
> different from the thread that processes the command queue
> ("main thread").
>
> This is generally possible with locking, and this is what we do
> in crosvm and other virtio-gpu 1.1 implementations.  However, on
> QEMU a deadlock is observed if virtio_gpu_ctrl_response_nodata(..)
> [used in the fence callback] is used from a thread that is not the
> main thread.
>
> The reason is the main thread takes the big QEMU lock (bql) somewhere
> when processing the command queue, and virtio_gpu_ctrl_response_nodata(..)
> needs that lock.  If you add in the lock needed to protect &g->fenceq
> from concurrent access by the main thread and the callback threads,
> you can end up with deadlocks.
>
> It's possible to work around this by scheduling the return of the fence
> descriptors via aio_bh_schedule_oneshot_full(..), but that somewhat
> negates the rationale for the asynchronous callbacks.
>
> I also played around with aio_context_acquire()/aio_context_release(),
> but that doesn't seem to help.
>
> Is signaling the virtio_queue outside of the main thread possible?  If
> so, how?

Yes, if you want a specific thread to process a virtqueue, monitor the
virtqueue's host_notifier (aka ioeventfd) from the thread. That's how
--device virtio-blk,iothread= works. It attaches the host_notifier to
the IOThread's AioContext. Whenever the guest kicks the virtqueue, the
IOThread will respond instead of QEMU's main loop thread.
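
For illustration, a minimal sketch of that wiring (loosely modeled on
virtio-blk's dataplane start path; the function name and the hard-coded
queue index below are placeholders, not something this series adds)
could look like this:

/* Illustrative sketch only -- not part of this series. */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-bus.h"
#include "sysemu/iothread.h"

static void attach_ctrl_vq_to_iothread(VirtIODevice *vdev, IOThread *iothread,
                                       Error **errp)
{
    VirtioBusState *bus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev)));
    AioContext *ctx = iothread_get_aio_context(iothread);
    VirtQueue *vq = virtio_get_queue(vdev, 0);
    int r;

    /* Hand queue 0's ioeventfd (host_notifier) over to QEMU/KVM. */
    r = virtio_bus_set_host_notifier(bus, 0, true);
    if (r < 0) {
        error_setg(errp, "failed to set host notifier (%d)", r);
        return;
    }

    /* From now on, guest kicks wake the IOThread, not the main loop. */
    aio_context_acquire(ctx);
    virtio_queue_aio_attach_host_notifier(vq, ctx);
    aio_context_release(ctx);
}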

That said, I don't know the threading model of your code. Could you
explain which threads are involved? Do you want to process all buffers
from the virtqueue in a specific thread or only these fence buffers?

>
> Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> ---
>  hw/display/virtio-gpu-rutabaga.c | 29 ++++++++++++++++++++++++++---
>  include/hw/virtio/virtio-gpu.h   |  1 +
>  2 files changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/hw/display/virtio-gpu-rutabaga.c b/hw/display/virtio-gpu-rutabaga.c
> index 196267aac2..5c296aeef1 100644
> --- a/hw/display/virtio-gpu-rutabaga.c
> +++ b/hw/display/virtio-gpu-rutabaga.c
> @@ -31,6 +31,11 @@ static int virtio_gpu_rutabaga_init(VirtIOGPU *g);
>
>  #define CHECK_RESULT(result, cmd) CHECK(result == 0, cmd)
>
> +struct rutabaga_aio_data {
> +    struct VirtIOGPUGL *virtio_gpu;
> +    struct rutabaga_fence fence;
> +};
> +
>  static void
>  virtio_gpu_rutabaga_update_cursor(VirtIOGPU *g, struct virtio_gpu_scanout *s,
>                                    uint32_t resource_id)
> @@ -823,10 +828,11 @@ virtio_gpu_rutabaga_process_cmd(VirtIOGPU *g,
>  }
>
>  static void
> -virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
> -                             struct rutabaga_fence fence_data)
> +virtio_gpu_rutabaga_aio_cb(void *opaque)
>  {
> -    VirtIOGPU *g = (VirtIOGPU *)(void*)(uintptr_t)user_data;
> +    struct rutabaga_aio_data *data =  (struct rutabaga_aio_data *)opaque;
> +    VirtIOGPU *g = (VirtIOGPU *)data->virtio_gpu;
> +    struct rutabaga_fence fence_data = data->fence;
>      struct virtio_gpu_ctrl_command *cmd, *tmp;
>
>      bool signaled_ctx_specific = fence_data.flags & RUTABAGA_FLAG_INFO_RING_IDX;
> @@ -856,6 +862,22 @@ virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
>          QTAILQ_REMOVE(&g->fenceq, cmd, next);
>          g_free(cmd);
>      }
> +
> +    g_free(data);
> +}
> +
> +static void
> +virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
> +                             struct rutabaga_fence fence_data) {
> +    struct rutabaga_aio_data *data;
> +    VirtIOGPU *g = (VirtIOGPU *)(void*)(uintptr_t)user_data;
> +    GET_VIRTIO_GPU_GL(g);
> +
> +    data = g_new0(struct rutabaga_aio_data, 1);
> +    data->virtio_gpu = virtio_gpu;
> +    data->fence = fence_data;
> +    aio_bh_schedule_oneshot_full(virtio_gpu->ctx, virtio_gpu_rutabaga_aio_cb,
> +                                 (void *)data, "aio");
>  }
>
>  static int virtio_gpu_rutabaga_init(VirtIOGPU *g)
> @@ -912,6 +934,7 @@ static int virtio_gpu_rutabaga_init(VirtIOGPU *g)
>          free(channels.channels);
>      }
>
> +    virtio_gpu->ctx = qemu_get_aio_context();
>      return result;
>  }
>
> diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
> index 034c71e8f5..b33ad0c68f 100644
> --- a/include/hw/virtio/virtio-gpu.h
> +++ b/include/hw/virtio/virtio-gpu.h
> @@ -225,6 +225,7 @@ struct VirtIOGPUGL {
>      char *wayland_socket_path;
>      uint32_t num_capsets;
>      void *rutabaga;
> +    AioContext *ctx;
>  };
>
>  struct VhostUserGPU {
> --
> 2.40.0.634.g4ca3ef3211-goog
>
>
Gurchetan Singh April 21, 2023, 11:20 p.m. UTC | #2
On Fri, Apr 21, 2023 at 9:00 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
>
> On Thu, 20 Apr 2023 at 21:13, Gurchetan Singh
> <gurchetansingh@chromium.org> wrote:
> >
> > Both gfxstream and cross-domain (and even newer versions of
> > virglrenderer: see VIRGL_RENDERER_ASYNC_FENCE_CB) like to signal
> > fence completion on threads ("callback threads") that are
> > different from the thread that processes the command queue
> > ("main thread").
> >
> > This is generally possible with locking, and this is what we do
> > in crosvm and other virtio-gpu 1.1 implementations.  However, on
> > QEMU a deadlock is observed if virtio_gpu_ctrl_response_nodata(..)
> > [used in the fence callback] is used from a thread that is not the
> > main thread.
> >
> > The reason is the main thread takes the big QEMU lock (bql) somewhere
> > when processing the command queue, and virtio_gpu_ctrl_response_nodata(..)
> > needs that lock.  If you add in the lock needed to protect &g->fenceq
> > from concurrent access by the main thread and the callback threads,
> > you can end up with deadlocks.
> >
> > It's possible to work around this by scheduling the return of the fence
> > descriptors via aio_bh_schedule_oneshot_full(..), but that somewhat
> > negates the rationale for the asynchronous callbacks.
> >
> > I also played around with aio_context_acquire()/aio_context_release(),
> > but that doesn't seem to help.
> >
> > Is signaling the virtio_queue outside of the main thread possible?  If
> > so, how?
>
> Yes, if you want a specific thread to process a virtqueue, monitor the
> virtqueue's host_notifier (aka ioeventfd) from the thread. That's how
> --device virtio-blk,iothread= works. It attaches the host_notifier to
> the IOThread's AioContext. Whenever the guest kicks the virtqueue, the
> IOThread will respond instead of QEMU's main loop thread.
>
> That said, I don't know the threading model of your code. Could you
> explain which threads are involved? Do you want to process all buffers
> from the virtqueue in a specific thread or only these fence buffers?

Only the fence callback would be signalled via these callback threads.
The virtqueue would not be processed by the callback thread.

There can be multiple callback threads: for example, one for the
gfxstream context and another for the Wayland context.  These threads
could be a C++ thread, a Rust thread or any other.

The strategy used by crosvm is to have a mutex around the fence state
to synchronize between multiple callback threads (which signal fences)
and the main thread (which generates fences).
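
For illustration, a minimal sketch of that strategy on the QEMU side
(the fence_lock below is hypothetical and not part of this series)
would be something like:

/* Illustrative sketch only -- mirrors the crosvm strategy described above. */
#include "qemu/osdep.h"
#include "qemu/thread.h"
#include "hw/virtio/virtio-gpu.h"

static QemuMutex fence_lock;  /* would live in the device state in practice */

/* Main thread: queue a command that is waiting for its fence. */
static void fenceq_push(VirtIOGPU *g, struct virtio_gpu_ctrl_command *cmd)
{
    qemu_mutex_lock(&fence_lock);
    QTAILQ_INSERT_TAIL(&g->fenceq, cmd, next);
    qemu_mutex_unlock(&fence_lock);
}

/* Callback thread: retire commands whose fence has signaled.  Completing
 * them here (virtio_gpu_ctrl_response_nodata()) is exactly what runs into
 * the BQL problem described in the commit message. */
static void fenceq_retire(VirtIOGPU *g, uint64_t fence_id)
{
    struct virtio_gpu_ctrl_command *cmd, *tmp;

    qemu_mutex_lock(&fence_lock);
    QTAILQ_FOREACH_SAFE(cmd, &g->fenceq, next, tmp) {
        if (cmd->cmd_hdr.fence_id > fence_id) {
            continue;
        }
        QTAILQ_REMOVE(&g->fenceq, cmd, next);
        g_free(cmd);
    }
    qemu_mutex_unlock(&fence_lock);
}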

I tried to use aio_context_acquire(..)/aio_context_release(..) to
synchronize these threads, but that results in a deadlock.  I think
those functions may assume an IOThread and not necessarily any thread.
It seems aio_context_acquire(..) succeeds for the callback thread
though.

Here's what I tried (rather than this patch, which works but is less
ideal than the solution below):

diff --git a/hw/display/virtio-gpu-rutabaga.c b/hw/display/virtio-gpu-rutabaga.c
index 196267aac2..0993b09090 100644
--- a/hw/display/virtio-gpu-rutabaga.c
+++ b/hw/display/virtio-gpu-rutabaga.c
@@ -798,6 +798,7 @@ virtio_gpu_rutabaga_process_cmd(VirtIOGPU *g,
     }

     if (cmd->finished) {
+        g_free(cmd);
         return;
     }
     if (cmd->error) {
@@ -811,6 +812,8 @@ virtio_gpu_rutabaga_process_cmd(VirtIOGPU *g,
         return;
     }

+    QTAILQ_INSERT_TAIL(&g->fenceq, cmd, next);
+
     fence.flags = cmd->cmd_hdr.flags;
     fence.ctx_id = cmd->cmd_hdr.ctx_id;
     fence.fence_id = cmd->cmd_hdr.fence_id;
@@ -827,9 +830,11 @@ virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
                              struct rutabaga_fence fence_data)
 {
     VirtIOGPU *g = (VirtIOGPU *)(void*)(uintptr_t)user_data;
+    GET_VIRTIO_GPU_GL(g);
     struct virtio_gpu_ctrl_command *cmd, *tmp;

     bool signaled_ctx_specific = fence_data.flags &
RUTABAGA_FLAG_INFO_RING_IDX;
+    aio_context_acquire(virtio_gpu->ctx);

     QTAILQ_FOREACH_SAFE(cmd, &g->fenceq, next, tmp) {
         /*
@@ -856,6 +861,7 @@ virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
         QTAILQ_REMOVE(&g->fenceq, cmd, next);
         g_free(cmd);
     }
+    aio_context_release(virtio_gpu->ctx);
 }

 static int virtio_gpu_rutabaga_init(VirtIOGPU *g)
@@ -912,6 +918,7 @@ static int virtio_gpu_rutabaga_init(VirtIOGPU *g)
         free(channels.channels);
     }

+    virtio_gpu->ctx = qemu_get_aio_context();
     return result;
 }

@@ -942,6 +949,30 @@ static int
virtio_gpu_rutabaga_get_num_capsets(VirtIOGPU *g)
     return (int)(num_capsets);
 }

+static void virtio_gpu_rutabaga_process_cmdq(VirtIOGPU *g)
+{
+    struct virtio_gpu_ctrl_command *cmd;
+    VirtIOGPUClass *vgc = VIRTIO_GPU_GET_CLASS(g);
+
+    if (g->processing_cmdq) {
+        return;
+    }
+    g->processing_cmdq = true;
+    while (!QTAILQ_EMPTY(&g->cmdq)) {
+        cmd = QTAILQ_FIRST(&g->cmdq);
+
+        if (g->parent_obj.renderer_blocked) {
+            break;
+        }
+
+        QTAILQ_REMOVE(&g->cmdq, cmd, next);
+
+        /* process command */
+        vgc->process_cmd(g, cmd);
+    }
+    g->processing_cmdq = false;
+}
+
 static void virtio_gpu_rutabaga_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
 {
     VirtIOGPU *g = VIRTIO_GPU(vdev);
@@ -972,7 +1003,7 @@ static void
virtio_gpu_rutabaga_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
         cmd = virtqueue_pop(vq, sizeof(struct virtio_gpu_ctrl_command));
     }

-    virtio_gpu_process_cmdq(g);
+    virtio_gpu_rutabaga_process_cmdq(g);
 }

 void virtio_gpu_rutabaga_device_realize(DeviceState *qdev, Error **errp)
diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index 034c71e8f5..b33ad0c68f 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -225,6 +225,7 @@ struct VirtIOGPUGL {
     char *wayland_socket_path;
     uint32_t num_capsets;
     void *rutabaga;
+    AioContext *ctx;
 };

 struct VhostUserGPU {

--------------------------------------------------------------------------------------------------------

Main Thread deadlocked with above patch:

[Switching to thread 1 (Thread 0x7ffff5ced600 (LWP 8584))]
#0  futex_wait (private=0, expected=2, futex_word=0x5555569893a0
<qemu_global_mutex>) at ../sysdeps/nptl/futex-internal.h:146
146 in ../sysdeps/nptl/futex-internal.h
(gdb) bt
#0  futex_wait (private=0, expected=2, futex_word=0x5555569893a0
<qemu_global_mutex>) at ../sysdeps/nptl/futex-internal.h:146
#1  __GI___lll_lock_wait (futex=futex@entry=0x5555569893a0
<qemu_global_mutex>, private=0) at ./nptl/lowlevellock.c:49
#2  0x00007ffff7098082 in lll_mutex_lock_optimized
(mutex=0x5555569893a0 <qemu_global_mutex>) at
./nptl/pthread_mutex_lock.c:48
#3  ___pthread_mutex_lock (mutex=0x5555569893a0 <qemu_global_mutex>)
at ./nptl/pthread_mutex_lock.c:93
#4  0x00005555560019da in qemu_mutex_lock_impl (mutex=0x5555569893a0
<qemu_global_mutex>, file=0x55555626eaeb "../util/main-loop.c",
line=311) at ../util/qemu-thread-posix.c:94
#5  0x0000555555afdf2d in qemu_mutex_lock_iothread_impl
(file=0x55555626eaeb "../util/main-loop.c", line=311) at
../softmmu/cpus.c:504
#6  0x000055555601c39e in os_host_main_loop_wait (timeout=1471024) at
../util/main-loop.c:311
#7  0x000055555601c4bd in main_loop_wait (nonblocking=0) at
../util/main-loop.c:592
#8  0x0000555555b070a3 in qemu_main_loop () at ../softmmu/runstate.c:731
#9  0x0000555555e11dbd in qemu_default_main () at ../softmmu/main.c:37
#10 0x0000555555e11df7 in main (argc=23, argv=0x7fffffffde28) at
../softmmu/main.c:48

Callback Thread deadlocked with above patch:

[Switching to thread 56 (Thread 0x7ffd2c9ff640 (LWP 8649))]
#0  futex_wait (private=0, expected=2, futex_word=0x5555569893a0
<qemu_global_mutex>) at ../sysdeps/nptl/futex-internal.h:146
146 in ../sysdeps/nptl/futex-internal.h
(gdb) bt
#0  futex_wait (private=0, expected=2, futex_word=0x5555569893a0
<qemu_global_mutex>) at ../sysdeps/nptl/futex-internal.h:146
#1  __GI___lll_lock_wait (futex=futex@entry=0x5555569893a0
<qemu_global_mutex>, private=0) at ./nptl/lowlevellock.c:49
#2  0x00007ffff7098082 in lll_mutex_lock_optimized
(mutex=0x5555569893a0 <qemu_global_mutex>) at
./nptl/pthread_mutex_lock.c:48
#3  ___pthread_mutex_lock (mutex=0x5555569893a0 <qemu_global_mutex>)
at ./nptl/pthread_mutex_lock.c:93
#4  0x00005555560019da in qemu_mutex_lock_impl (mutex=0x5555569893a0
<qemu_global_mutex>, file=0x5555561df60c "../softmmu/physmem.c",
line=2581) at ../util/qemu-thread-posix.c:94
#5  0x0000555555afdf2d in qemu_mutex_lock_iothread_impl
(file=0x5555561df60c "../softmmu/physmem.c", line=2581) at
../softmmu/cpus.c:504
#6  0x0000555555d69a1a in prepare_mmio_access (mr=0x555556c10ca0) at
../softmmu/physmem.c:2581
#7  0x0000555555d6b7a8 in address_space_stl_internal
(as=0x5555578270f8, addr=4276125700, val=33, attrs=..., result=0x0,
endian=DEVICE_LITTLE_ENDIAN) at
/home/lenovo/qemu/memory_ldst.c.inc:318
#8  0x0000555555d6b91d in address_space_stl_le (as=0x5555578270f8,
addr=4276125700, val=33, attrs=..., result=0x0) at
/home/lenovo/qemu/memory_ldst.c.inc:357
#9  0x0000555555a0657c in pci_msi_trigger (dev=0x555557826ec0,
msg=...) at ../hw/pci/pci.c:356
#10 0x0000555555a02a5a in msi_send_message (dev=0x555557826ec0,
msg=...) at ../hw/pci/msi.c:379
#11 0x0000555555a045a0 in msix_notify (dev=0x555557826ec0, vector=1)
at ../hw/pci/msix.c:542
#12 0x0000555555ac8780 in virtio_pci_notify (d=0x555557826ec0,
vector=1) at ../hw/virtio/virtio-pci.c:77
#13 0x0000555555d15ddc in virtio_notify_vector (vdev=0x55555783ffd0,
vector=1) at ../hw/virtio/virtio.c:1985
#14 0x0000555555d17393 in virtio_irq (vq=0x555558716d80) at
../hw/virtio/virtio.c:2461
#15 0x0000555555d17439 in virtio_notify (vdev=0x55555783ffd0,
vq=0x555558716d80) at ../hw/virtio/virtio.c:2473
#16 0x0000555555b98732 in virtio_gpu_ctrl_response (g=0x55555783ffd0,
cmd=0x7ffdd80068b0, resp=0x7ffd2c9fe150, resp_len=24) at
../hw/display/virtio-gpu.c:177
#17 0x0000555555b9879c in virtio_gpu_ctrl_response_nodata
(g=0x55555783ffd0, cmd=0x7ffdd80068b0, type=VIRTIO_GPU_RESP_OK_NODATA)
at ../hw/display/virtio-gpu.c:189
#18 0x0000555555ba6b82 in virtio_gpu_rutabaga_fence_cb
(user_data=93825028849616, fence_data=...) at
../hw/display/virtio-gpu-rutabaga.c:860
#19 0x00007ffff75b9381 in
_ZN131_$LT$rutabaga_gfx..rutabaga_utils..RutabagaFenceClosure$LT$T$GT$$u20$as$u20$rutabaga_gfx..rutabaga_utils..RutabagaFenceCallback$GT$4call17hcbf1b4ac094a2a60E.llvm.14356420492591683714
() at /usr/local/lib/librutabaga_gfx_ffi.so
#20 0x00007ffff75d501b in
rutabaga_gfx::cross_domain::cross_domain::CrossDomainWorker::run () at
/usr/local/lib/librutabaga_gfx_ffi.so
#21 0x00007ffff75dd651 in
std::sys_common::backtrace::__rust_begin_short_backtrace () at
/usr/local/lib/librutabaga_gfx_ffi.so
#22 0x00007ffff75c62ba in
core::ops::function::FnOnce::call_once{{vtable.shim}} () at
/usr/local/lib/librutabaga_gfx_ffi.so
#23 0x00007ffff75e3b73 in alloc::boxed::{impl#44}::call_once<(), dyn
core::ops::function::FnOnce<(), Output=()>, alloc::alloc::Global>
(self=..., args=()) at library/alloc/src/boxed.rs:1940
#24 alloc::boxed::{impl#44}::call_once<(), alloc::boxed::Box<dyn
core::ops::function::FnOnce<(), Output=()>, alloc::alloc::Global>,
alloc::alloc::Global> (self=0x5555572e27d0, args=())
    at library/alloc/src/boxed.rs:1940
#25 std::sys::unix::thread::{impl#2}::new::thread_start
(main=0x5555572e27d0) at library/std/src/sys/unix/thread.rs:108
#26 0x00007ffff7094b43 in start_thread (arg=<optimized out>) at
./nptl/pthread_create.c:442
#27 0x00007ffff7126a00 in clone3 () at
../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

>
>
> >
> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> > ---
> >  hw/display/virtio-gpu-rutabaga.c | 29 ++++++++++++++++++++++++++---
> >  include/hw/virtio/virtio-gpu.h   |  1 +
> >  2 files changed, 27 insertions(+), 3 deletions(-)
> >
> > diff --git a/hw/display/virtio-gpu-rutabaga.c b/hw/display/virtio-gpu-rutabaga.c
> > index 196267aac2..5c296aeef1 100644
> > --- a/hw/display/virtio-gpu-rutabaga.c
> > +++ b/hw/display/virtio-gpu-rutabaga.c
> > @@ -31,6 +31,11 @@ static int virtio_gpu_rutabaga_init(VirtIOGPU *g);
> >
> >  #define CHECK_RESULT(result, cmd) CHECK(result == 0, cmd)
> >
> > +struct rutabaga_aio_data {
> > +    struct VirtIOGPUGL *virtio_gpu;
> > +    struct rutabaga_fence fence;
> > +};
> > +
> >  static void
> >  virtio_gpu_rutabaga_update_cursor(VirtIOGPU *g, struct virtio_gpu_scanout *s,
> >                                    uint32_t resource_id)
> > @@ -823,10 +828,11 @@ virtio_gpu_rutabaga_process_cmd(VirtIOGPU *g,
> >  }
> >
> >  static void
> > -virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
> > -                             struct rutabaga_fence fence_data)
> > +virtio_gpu_rutabaga_aio_cb(void *opaque)
> >  {
> > -    VirtIOGPU *g = (VirtIOGPU *)(void*)(uintptr_t)user_data;
> > +    struct rutabaga_aio_data *data =  (struct rutabaga_aio_data *)opaque;
> > +    VirtIOGPU *g = (VirtIOGPU *)data->virtio_gpu;
> > +    struct rutabaga_fence fence_data = data->fence;
> >      struct virtio_gpu_ctrl_command *cmd, *tmp;
> >
> >      bool signaled_ctx_specific = fence_data.flags & RUTABAGA_FLAG_INFO_RING_IDX;
> > @@ -856,6 +862,22 @@ virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
> >          QTAILQ_REMOVE(&g->fenceq, cmd, next);
> >          g_free(cmd);
> >      }
> > +
> > +    g_free(data);
> > +}
> > +
> > +static void
> > +virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
> > +                             struct rutabaga_fence fence_data) {
> > +    struct rutabaga_aio_data *data;
> > +    VirtIOGPU *g = (VirtIOGPU *)(void*)(uintptr_t)user_data;
> > +    GET_VIRTIO_GPU_GL(g);
> > +
> > +    data = g_new0(struct rutabaga_aio_data, 1);
> > +    data->virtio_gpu = virtio_gpu;
> > +    data->fence = fence_data;
> > +    aio_bh_schedule_oneshot_full(virtio_gpu->ctx, virtio_gpu_rutabaga_aio_cb,
> > +                                 (void *)data, "aio");
> >  }
> >
> >  static int virtio_gpu_rutabaga_init(VirtIOGPU *g)
> > @@ -912,6 +934,7 @@ static int virtio_gpu_rutabaga_init(VirtIOGPU *g)
> >          free(channels.channels);
> >      }
> >
> > +    virtio_gpu->ctx = qemu_get_aio_context();
> >      return result;
> >  }
> >
> > diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
> > index 034c71e8f5..b33ad0c68f 100644
> > --- a/include/hw/virtio/virtio-gpu.h
> > +++ b/include/hw/virtio/virtio-gpu.h
> > @@ -225,6 +225,7 @@ struct VirtIOGPUGL {
> >      char *wayland_socket_path;
> >      uint32_t num_capsets;
> >      void *rutabaga;
> > +    AioContext *ctx;
> >  };
> >
> >  struct VhostUserGPU {
> > --
> > 2.40.0.634.g4ca3ef3211-goog
> >
> >
Stefan Hajnoczi April 25, 2023, 12:04 p.m. UTC | #3
On Fri, 21 Apr 2023 at 19:21, Gurchetan Singh
<gurchetansingh@chromium.org> wrote:
>
> On Fri, Apr 21, 2023 at 9:00 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> >
> > On Thu, 20 Apr 2023 at 21:13, Gurchetan Singh
> > <gurchetansingh@chromium.org> wrote:
> > >
> > > Both gfxstream and cross-domain (and even newer versions of
> > > virglrenderer: see VIRGL_RENDERER_ASYNC_FENCE_CB) like to signal
> > > fence completion on threads ("callback threads") that are
> > > different from the thread that processes the command queue
> > > ("main thread").
> > >
> > > This is generally possible with locking, and this is what we do
> > > in crosvm and other virtio-gpu 1.1 implementations.  However, on
> > > QEMU a deadlock is observed if virtio_gpu_ctrl_response_nodata(..)
> > > [used in the fence callback] is used from a thread that is not the
> > > main thread.
> > >
> > > The reason is the main thread takes the big QEMU lock (bql) somewhere
> > > when processing the command queue, and virtio_gpu_ctrl_response_nodata(..)
> > > needs that lock.  If you add in the lock needed to protect &g->fenceq
> > > from concurrent access by the main thread and the callback threads,
> > > you can end up with deadlocks.
> > >
> > > It's possible to work around this by scheduling the return of the fence
> > > descriptors via aio_bh_schedule_oneshot_full(..), but that somewhat
> > > negates the rationale for the asynchronous callbacks.
> > >
> > > I also played around with aio_context_acquire()/aio_context_release(),
> > > but that doesn't seem to help.
> > >
> > > Is signaling the virtio_queue outside of the main thread possible?  If
> > > so, how?
> >
> > Yes, if you want a specific thread to process a virtqueue, monitor the
> > virtqueue's host_notifier (aka ioeventfd) from the thread. That's how
> > --device virtio-blk,iothread= works. It attaches the host_notifier to
> > the IOThread's AioContext. Whenever the guest kicks the virtqueue, the
> > IOThread will respond instead of QEMU's main loop thread.
> >
> > That said, I don't know the threading model of your code. Could you
> > explain which threads are involved? Do you want to process all buffers
> > from the virtqueue in a specific thread or only these fence buffers?
>
> Only the fence callback would be signalled via these callback threads.
> The virtqueue would not be processed by the callback thread.
>
> There can be multiple callback threads: for example, one for the
> gfxstream context and another for the Wayland context.  These threads
> could be a C++ thread, a Rust thread or any other.
>
> The strategy used by crosvm is to have a mutex around the fence state
> to synchronize between multiple callback threads (which signal fences)
> and the main thread (which generates fences).
>
> I tried to use aio_context_acquire(..)/aio_context_release(..) to
> synchronize these threads, but that results in a deadlock.  I think
> those functions may assume they are called from an IOThread rather
> than from an arbitrary thread.
> It seems aio_context_acquire(..) succeeds for the callback thread
> though.
>
> Here's what I tried (rather than this patch, which works but is less
> ideal than the solution below):

I don't have time to study the code and understand the threading
model, but one thing stood out:

> Callback Thread deadlocked with above patch:
>
> [Switching to thread 56 (Thread 0x7ffd2c9ff640 (LWP 8649))]
> #0  futex_wait (private=0, expected=2, futex_word=0x5555569893a0
> <qemu_global_mutex>) at ../sysdeps/nptl/futex-internal.h:146
> 146 in ../sysdeps/nptl/futex-internal.h
> (gdb) bt
> #0  futex_wait (private=0, expected=2, futex_word=0x5555569893a0
> <qemu_global_mutex>) at ../sysdeps/nptl/futex-internal.h:146
> #1  __GI___lll_lock_wait (futex=futex@entry=0x5555569893a0
> <qemu_global_mutex>, private=0) at ./nptl/lowlevellock.c:49
> #2  0x00007ffff7098082 in lll_mutex_lock_optimized
> (mutex=0x5555569893a0 <qemu_global_mutex>) at
> ./nptl/pthread_mutex_lock.c:48
> #3  ___pthread_mutex_lock (mutex=0x5555569893a0 <qemu_global_mutex>)
> at ./nptl/pthread_mutex_lock.c:93
> #4  0x00005555560019da in qemu_mutex_lock_impl (mutex=0x5555569893a0
> <qemu_global_mutex>, file=0x5555561df60c "../softmmu/physmem.c",
> line=2581) at ../util/qemu-thread-posix.c:94
> #5  0x0000555555afdf2d in qemu_mutex_lock_iothread_impl
> (file=0x5555561df60c "../softmmu/physmem.c", line=2581) at
> ../softmmu/cpus.c:504
> #6  0x0000555555d69a1a in prepare_mmio_access (mr=0x555556c10ca0) at
> ../softmmu/physmem.c:2581
> #7  0x0000555555d6b7a8 in address_space_stl_internal
> (as=0x5555578270f8, addr=4276125700, val=33, attrs=..., result=0x0,
> endian=DEVICE_LITTLE_ENDIAN) at
> /home/lenovo/qemu/memory_ldst.c.inc:318
> #8  0x0000555555d6b91d in address_space_stl_le (as=0x5555578270f8,
> addr=4276125700, val=33, attrs=..., result=0x0) at
> /home/lenovo/qemu/memory_ldst.c.inc:357
> #9  0x0000555555a0657c in pci_msi_trigger (dev=0x555557826ec0,
> msg=...) at ../hw/pci/pci.c:356
> #10 0x0000555555a02a5a in msi_send_message (dev=0x555557826ec0,
> msg=...) at ../hw/pci/msi.c:379
> #11 0x0000555555a045a0 in msix_notify (dev=0x555557826ec0, vector=1)
> at ../hw/pci/msix.c:542
> #12 0x0000555555ac8780 in virtio_pci_notify (d=0x555557826ec0,
> vector=1) at ../hw/virtio/virtio-pci.c:77
> #13 0x0000555555d15ddc in virtio_notify_vector (vdev=0x55555783ffd0,
> vector=1) at ../hw/virtio/virtio.c:1985
> #14 0x0000555555d17393 in virtio_irq (vq=0x555558716d80) at
> ../hw/virtio/virtio.c:2461
> #15 0x0000555555d17439 in virtio_notify (vdev=0x55555783ffd0,
> vq=0x555558716d80) at ../hw/virtio/virtio.c:2473
> #16 0x0000555555b98732 in virtio_gpu_ctrl_response (g=0x55555783ffd0,
> cmd=0x7ffdd80068b0, resp=0x7ffd2c9fe150, resp_len=24) at
> ../hw/display/virtio-gpu.c:177
> #17 0x0000555555b9879c in virtio_gpu_ctrl_response_nodata
> (g=0x55555783ffd0, cmd=0x7ffdd80068b0, type=VIRTIO_GPU_RESP_OK_NODATA)
> at ../hw/display/virtio-gpu.c:189
> #18 0x0000555555ba6b82 in virtio_gpu_rutabaga_fence_cb
> (user_data=93825028849616, fence_data=...) at
> ../hw/display/virtio-gpu-rutabaga.c:860
> #19 0x00007ffff75b9381 in
> _ZN131_$LT$rutabaga_gfx..rutabaga_utils..RutabagaFenceClosure$LT$T$GT$$u20$as$u20$rutabaga_gfx..rutabaga_utils..RutabagaFenceCallback$GT$4call17hcbf1b4ac094a2a60E.llvm.14356420492591683714
> () at /usr/local/lib/librutabaga_gfx_ffi.so
> #20 0x00007ffff75d501b in
> rutabaga_gfx::cross_domain::cross_domain::CrossDomainWorker::run () at
> /usr/local/lib/librutabaga_gfx_ffi.so
> #21 0x00007ffff75dd651 in
> std::sys_common::backtrace::__rust_begin_short_backtrace () at
> /usr/local/lib/librutabaga_gfx_ffi.so
> #22 0x00007ffff75c62ba in
> core::ops::function::FnOnce::call_once{{vtable.shim}} () at
> /usr/local/lib/librutabaga_gfx_ffi.so
> #23 0x00007ffff75e3b73 in alloc::boxed::{impl#44}::call_once<(), dyn
> core::ops::function::FnOnce<(), Output=()>, alloc::alloc::Global>
> (self=..., args=()) at library/alloc/src/boxed.rs:1940
> #24 alloc::boxed::{impl#44}::call_once<(), alloc::boxed::Box<dyn
> core::ops::function::FnOnce<(), Output=()>, alloc::alloc::Global>,
> alloc::alloc::Global> (self=0x5555572e27d0, args=())
>     at library/alloc/src/boxed.rs:1940
> #25 std::sys::unix::thread::{impl#2}::new::thread_start
> (main=0x5555572e27d0) at library/std/src/sys/unix/thread.rs:108
> #26 0x00007ffff7094b43 in start_thread (arg=<optimized out>) at
> ./nptl/pthread_create.c:442
> #27 0x00007ffff7126a00 in clone3 () at
> ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

This Rust thread is calling QEMU emulation code that is not designed
to run outside the big QEMU lock (virtio_notify()).

I think prepare_mmio_access() is deadlocking on the big QEMU lock.
Another thread must be holding it (maybe waiting for this thread?) and
therefore the hang occurs.

There is also some additional setup, like rcu_register_thread(), that
may be missing in this Rust thread.

The conservative approach is to only call QEMU emulation functions
like virtio_notify() from a QEMU thread. Some parts of QEMU are
thread-safe and don't need the big QEMU lock, but that requires
carefully calling only safe functions and at first glance I guess this
patch series isn't doing that.
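
For reference, the HACK in this patch is one instance of that
conservative approach: the rutabaga callback thread only packages the
fence and schedules a bottom half, and the virtqueue/IRQ work then runs
in the main loop thread, which holds the BQL while dispatching bottom
halves.  Condensed from the diff below (struct rutabaga_aio_data and
virtio_gpu_rutabaga_aio_cb() are defined there):

static void
virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
                             struct rutabaga_fence fence_data)
{
    VirtIOGPU *g = (VirtIOGPU *)(void *)(uintptr_t)user_data;
    GET_VIRTIO_GPU_GL(g);
    struct rutabaga_aio_data *data = g_new0(struct rutabaga_aio_data, 1);

    data->virtio_gpu = virtio_gpu;
    data->fence = fence_data;
    aio_bh_schedule_oneshot_full(virtio_gpu->ctx, virtio_gpu_rutabaga_aio_cb,
                                 data, "aio");
}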

Stefan

Patch

diff --git a/hw/display/virtio-gpu-rutabaga.c b/hw/display/virtio-gpu-rutabaga.c
index 196267aac2..5c296aeef1 100644
--- a/hw/display/virtio-gpu-rutabaga.c
+++ b/hw/display/virtio-gpu-rutabaga.c
@@ -31,6 +31,11 @@  static int virtio_gpu_rutabaga_init(VirtIOGPU *g);
 
 #define CHECK_RESULT(result, cmd) CHECK(result == 0, cmd)
 
+struct rutabaga_aio_data {
+    struct VirtIOGPUGL *virtio_gpu;
+    struct rutabaga_fence fence;
+};
+
 static void
 virtio_gpu_rutabaga_update_cursor(VirtIOGPU *g, struct virtio_gpu_scanout *s,
                                   uint32_t resource_id)
@@ -823,10 +828,11 @@  virtio_gpu_rutabaga_process_cmd(VirtIOGPU *g,
 }
 
 static void
-virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
-                             struct rutabaga_fence fence_data)
+virtio_gpu_rutabaga_aio_cb(void *opaque)
 {
-    VirtIOGPU *g = (VirtIOGPU *)(void*)(uintptr_t)user_data;
+    struct rutabaga_aio_data *data =  (struct rutabaga_aio_data *)opaque;
+    VirtIOGPU *g = (VirtIOGPU *)data->virtio_gpu;
+    struct rutabaga_fence fence_data = data->fence;
     struct virtio_gpu_ctrl_command *cmd, *tmp;
 
     bool signaled_ctx_specific = fence_data.flags & RUTABAGA_FLAG_INFO_RING_IDX;
@@ -856,6 +862,22 @@  virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
         QTAILQ_REMOVE(&g->fenceq, cmd, next);
         g_free(cmd);
     }
+
+    g_free(data);
+}
+
+static void
+virtio_gpu_rutabaga_fence_cb(uint64_t user_data,
+                             struct rutabaga_fence fence_data) {
+    struct rutabaga_aio_data *data;
+    VirtIOGPU *g = (VirtIOGPU *)(void*)(uintptr_t)user_data;
+    GET_VIRTIO_GPU_GL(g);
+
+    data = g_new0(struct rutabaga_aio_data, 1);
+    data->virtio_gpu = virtio_gpu;
+    data->fence = fence_data;
+    aio_bh_schedule_oneshot_full(virtio_gpu->ctx, virtio_gpu_rutabaga_aio_cb,
+                                 (void *)data, "aio");
 }
 
 static int virtio_gpu_rutabaga_init(VirtIOGPU *g)
@@ -912,6 +934,7 @@  static int virtio_gpu_rutabaga_init(VirtIOGPU *g)
         free(channels.channels);
     }
 
+    virtio_gpu->ctx = qemu_get_aio_context();
     return result;
 }
 
diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index 034c71e8f5..b33ad0c68f 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -225,6 +225,7 @@  struct VirtIOGPUGL {
     char *wayland_socket_path;
     uint32_t num_capsets;
     void *rutabaga;
+    AioContext *ctx;
 };
 
 struct VhostUserGPU {