monitor: avoid potential dead-lock when cleaning up

Message ID 20180801093401.9160-1-marcandre.lureau@redhat.com
State New
Series monitor: avoid potential dead-lock when cleaning up

Commit Message

Marc-André Lureau Aug. 1, 2018, 9:34 a.m. UTC
When a monitor is connected to a Spice chardev, the monitor cleanup
can dead-lock:

 #0  0x00007f43446637fd in __lll_lock_wait () at /lib64/libpthread.so.0
 #1  0x00007f434465ccf4 in pthread_mutex_lock () at /lib64/libpthread.so.0
 #2  0x0000556dd79f22ba in qemu_mutex_lock_impl (mutex=0x556dd81c9220 <monitor_lock>, file=0x556dd7ae3648 "/home/elmarco/src/qq/monitor.c", line=645) at /home/elmarco/src/qq/util/qemu-thread-posix.c:66
 #3  0x0000556dd7431bd5 in monitor_qapi_event_queue (event=QAPI_EVENT_SPICE_DISCONNECTED, qdict=0x556dd9abc850, errp=0x7fffb7bbddd8) at /home/elmarco/src/qq/monitor.c:645
 #4  0x0000556dd79d476b in qapi_event_send_spice_disconnected (server=0x556dd98ee760, client=0x556ddaaa8560, errp=0x556dd82180d0 <error_abort>) at qapi/qapi-events-ui.c:149
 #5  0x0000556dd7870fc1 in channel_event (event=3, info=0x556ddad1b590) at /home/elmarco/src/qq/ui/spice-core.c:235
 #6  0x00007f434560a6bb in reds_handle_channel_event (reds=<optimized out>, event=3, info=0x556ddad1b590) at reds.c:316
 #7  0x00007f43455f393b in main_dispatcher_self_handle_channel_event (info=0x556ddad1b590, event=3, self=0x556dd9a7d8c0) at main-dispatcher.c:197
 #8  0x00007f43455f393b in main_dispatcher_channel_event (self=0x556dd9a7d8c0, event=event@entry=3, info=0x556ddad1b590) at main-dispatcher.c:197
 #9  0x00007f4345612833 in red_stream_push_channel_event (s=s@entry=0x556ddae2ef40, event=event@entry=3) at red-stream.c:414
 #10 0x00007f434561286b in red_stream_free (s=0x556ddae2ef40) at red-stream.c:388
 #11 0x00007f43455f9ddc in red_channel_client_finalize (object=0x556dd9bb21a0) at red-channel-client.c:347
 #12 0x00007f434b5f9fb9 in g_object_unref () at /lib64/libgobject-2.0.so.0
 #13 0x00007f43455fc212 in red_channel_client_push (rcc=0x556dd9bb21a0) at red-channel-client.c:1341
 #14 0x0000556dd76081ba in spice_port_set_fe_open (chr=0x556dd9925e20, fe_open=0) at /home/elmarco/src/qq/chardev/spice.c:241
 #15 0x0000556dd796d74a in qemu_chr_fe_set_open (be=0x556dd9a37c00, fe_open=0) at /home/elmarco/src/qq/chardev/char-fe.c:340
 #16 0x0000556dd796d4d9 in qemu_chr_fe_set_handlers (b=0x556dd9a37c00, fd_can_read=0x0, fd_read=0x0, fd_event=0x0, be_change=0x0, opaque=0x0, context=0x0, set_open=true) at /home/elmarco/src/qq/chardev/char-fe.c:280
 #17 0x0000556dd796d359 in qemu_chr_fe_deinit (b=0x556dd9a37c00, del=false) at /home/elmarco/src/qq/chardev/char-fe.c:233
 #18 0x0000556dd7432240 in monitor_data_destroy (mon=0x556dd9a37c00) at /home/elmarco/src/qq/monitor.c:786
 #19 0x0000556dd743b968 in monitor_cleanup () at /home/elmarco/src/qq/monitor.c:4683
 #20 0x0000556dd75ce776 in main (argc=3, argv=0x7fffb7bbe458, envp=0x7fffb7bbe478) at /home/elmarco/src/qq/vl.c:4660

This happens because the Spice code tries to emit a "disconnected"
event on the monitors while monitor_cleanup() still holds monitor_lock.
Fix this by narrowing the monitor_lock critical section to the list
removal itself.
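
In outline, this is a self-deadlock on a non-recursive mutex: cleanup
holds monitor_lock while tearing down the chardev, and the resulting
Spice event tries to take the same lock again on the same thread. A
minimal sketch of the old code path (simplified, not the literal code):

    qemu_mutex_lock(&monitor_lock);
    QTAILQ_FOREACH_SAFE(mon, &mon_list, entry, next) {
        QTAILQ_REMOVE(&mon_list, mon, entry);
        monitor_flush(mon);
        monitor_data_destroy(mon);  /* tears down the Spice chardev...  */
        /* ...which emits SPICE_DISCONNECTED; monitor_qapi_event_queue()
         * then calls qemu_mutex_lock(&monitor_lock): frame #2 above */
        g_free(mon);
    }
    qemu_mutex_unlock(&monitor_lock);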

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 monitor.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

Comments

Markus Armbruster Aug. 1, 2018, 1:19 p.m. UTC | #1
Marc-André Lureau <marcandre.lureau@redhat.com> writes:

> When a monitor is connected to a Spice chardev, the monitor cleanup
> can dead-lock:
>
>  [backtrace snipped]
>
> This happens because the Spice code tries to emit a "disconnected"
> event on the monitors while monitor_cleanup() still holds monitor_lock.
> Fix this by narrowing the monitor_lock critical section to the list
> removal itself.
>
> Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>

Do you think this should go into 3.0?

> ---
>  monitor.c | 22 +++++++++++++++-------
>  1 file changed, 15 insertions(+), 7 deletions(-)
>
> diff --git a/monitor.c b/monitor.c
> index 0fa0910a2a..a16a6c5311 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -4702,8 +4702,6 @@ void monitor_init(Chardev *chr, int flags)
>  
>  void monitor_cleanup(void)
>  {
> -    Monitor *mon, *next;
> -
>      /*
>       * We need to explicitly stop the I/O thread (but not destroy it),
>       * clean up the monitor resources, then destroy the I/O thread since
> @@ -4719,14 +4717,24 @@ void monitor_cleanup(void)
>      monitor_qmp_bh_responder(NULL);
>  
>      /* Flush output buffers and destroy monitors */
> -    qemu_mutex_lock(&monitor_lock);
> -    QTAILQ_FOREACH_SAFE(mon, &mon_list, entry, next) {
> -        QTAILQ_REMOVE(&mon_list, mon, entry);
> +    do {

for (;;), please.
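
For reference, a sketch of the same loop restyled with for (;;); the
logic is unchanged, only the loop form differs:

    for (;;) {
        Monitor *mon;

        qemu_mutex_lock(&monitor_lock);
        mon = QTAILQ_FIRST(&mon_list);
        if (mon) {
            QTAILQ_REMOVE(&mon_list, mon, entry);
        }
        qemu_mutex_unlock(&monitor_lock);

        if (!mon) {
            break;
        }
        monitor_flush(mon);
        monitor_data_destroy(mon);
        g_free(mon);
    }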

> +        Monitor *mon;
> +
> +        qemu_mutex_lock(&monitor_lock);
> +        mon = QTAILQ_FIRST(&mon_list);
> +        if (mon) {
> +            QTAILQ_REMOVE(&mon_list, mon, entry);
> +        }
> +        qemu_mutex_unlock(&monitor_lock);
> +
> +        if (!mon) {
> +            break;
> +        }
> +
>          monitor_flush(mon);
>          monitor_data_destroy(mon);
>          g_free(mon);
> -    }
> -    qemu_mutex_unlock(&monitor_lock);
> +    } while (true);
>  
>      /* QEMUBHs needs to be deleted before destroying the I/O thread */
>      qemu_bh_delete(qmp_dispatcher_bh);

Iterating safely over a list protected by a lock should be easier than
that.  Sad.

Hmm, what about this:

diff --git a/monitor.c b/monitor.c
index 77861e96af..4a23f6c7bc 100644
--- a/monitor.c
+++ b/monitor.c
@@ -4721,9 +4721,11 @@ void monitor_cleanup(void)
     qemu_mutex_lock(&monitor_lock);
     QTAILQ_FOREACH_SAFE(mon, &mon_list, entry, next) {
         QTAILQ_REMOVE(&mon_list, mon, entry);
+        qemu_mutex_unlock(&monitor_lock);
         monitor_flush(mon);
         monitor_data_destroy(mon);
         g_free(mon);
+        qemu_mutex_lock(&monitor_lock);
     }
     qemu_mutex_unlock(&monitor_lock);
Marc-André Lureau Aug. 1, 2018, 1:32 p.m. UTC | #2
Hi

On Wed, Aug 1, 2018 at 3:19 PM, Markus Armbruster <armbru@redhat.com> wrote:
> Marc-André Lureau <marcandre.lureau@redhat.com> writes:
>
> [quoted commit message, backtrace, and patch snipped]
>
> Iterating safely over a list protected by a lock should be easier than
> that.  Sad.
>
> Hmm, what about this:
>
> diff --git a/monitor.c b/monitor.c
> index 77861e96af..4a23f6c7bc 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -4721,9 +4721,11 @@ void monitor_cleanup(void)
>      qemu_mutex_lock(&monitor_lock);
>      QTAILQ_FOREACH_SAFE(mon, &mon_list, entry, next) {
>          QTAILQ_REMOVE(&mon_list, mon, entry);
> +        qemu_mutex_unlock(&monitor_lock);
>          monitor_flush(mon);
>          monitor_data_destroy(mon);
>          g_free(mon);
> +        qemu_mutex_lock(&monitor_lock);

> Although unlikely, there is a chance the monitor list could be
> modified while we are flushing/cleaning up with the lock dropped. In
> that case we could miss the new monitors (if next is NULL).
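
The reason: QTAILQ_FOREACH_SAFE samples the successor before the loop
body runs, so a monitor appended while the lock is dropped is never
reached once next has been read as NULL. Roughly, the macro expands to
this (a sketch, not the exact header text):

    for (mon = QTAILQ_FIRST(&mon_list);
         mon && ((next = QTAILQ_NEXT(mon, entry)), 1);
         mon = next) {
        /* loop body: if monitor_lock is dropped here and a new monitor
         * is appended after next was sampled as NULL, the loop ends
         * without ever visiting it */
    }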

>      }
>      qemu_mutex_unlock(&monitor_lock);
>
>
Markus Armbruster Aug. 1, 2018, 3:09 p.m. UTC | #3
Marc-André Lureau <marcandre.lureau@gmail.com> writes:

> Hi
>
> On Wed, Aug 1, 2018 at 3:19 PM, Markus Armbruster <armbru@redhat.com> wrote:
>> Marc-André Lureau <marcandre.lureau@redhat.com> writes:
>>
>> [quoted commit message, backtrace, and patch snipped]
>>
>> Iterating safely over a list protected by a lock should be easier than
>> that.  Sad.
>>
>> Hmm, what about this:
>>
>> diff --git a/monitor.c b/monitor.c
>> index 77861e96af..4a23f6c7bc 100644
>> --- a/monitor.c
>> +++ b/monitor.c
>> @@ -4721,9 +4721,11 @@ void monitor_cleanup(void)
>>      qemu_mutex_lock(&monitor_lock);
>>      QTAILQ_FOREACH_SAFE(mon, &mon_list, entry, next) {
>>          QTAILQ_REMOVE(&mon_list, mon, entry);
>> +        qemu_mutex_unlock(&monitor_lock);
>>          monitor_flush(mon);
>>          monitor_data_destroy(mon);
>>          g_free(mon);
>> +        qemu_mutex_lock(&monitor_lock);
>
> > Although unlikely, there is a chance the monitor list could be
> > modified while we are flushing/cleaning up with the lock dropped. In
> > that case we could miss the new monitors (if next is NULL).

Your loop prevents that from happening while it runs, but does nothing
to stop it from happening afterwards.  If we want to lock out new
monitors, we need to make monitor_init() fail or impossible to call.
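
One conceivable shape, purely illustrative (the monitors_shutting_down
flag and the early return are assumptions, not code from the tree):

    static bool monitors_shutting_down;  /* hypothetical; protected by monitor_lock */

    void monitor_init(Chardev *chr, int flags)
    {
        qemu_mutex_lock(&monitor_lock);
        if (monitors_shutting_down) {
            qemu_mutex_unlock(&monitor_lock);
            return;  /* or report an error to the caller */
        }
        /* ... existing setup and mon_list insertion ... */
        qemu_mutex_unlock(&monitor_lock);
    }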

>
>>      }
>>      qemu_mutex_unlock(&monitor_lock);
>>
>>
Marc-André Lureau Aug. 17, 2018, 5:50 p.m. UTC | #4
Hi
On Wed, Aug 1, 2018 at 5:09 PM Markus Armbruster <armbru@redhat.com> wrote:
>
> Marc-André Lureau <marcandre.lureau@gmail.com> writes:
>
> > Hi
> >
> > On Wed, Aug 1, 2018 at 3:19 PM, Markus Armbruster <armbru@redhat.com> wrote:
> >> Marc-André Lureau <marcandre.lureau@redhat.com> writes:
> >>
> [quoted commit message, backtrace, and patch snipped]
> >
> > Although unlikely, there is a chance the monitor list could be
> > modified while we are flushing/cleaning up with the lock dropped. In
> > that case we could miss the new monitors (if next is NULL).
>
> Your loop prevents that from happening while it runs, but does nothing
> to stop it from happening afterwards.  If we want to lock out new
> monitors, we need to make monitor_init() fail or impossible to call.

Not so trivial. Are there other threads capable of calling
monitor_init() by the time monitor_cleanup() is called? It looks like
monitor_init() may only be called from the main thread.

>
> >
> >>      }
> >>      qemu_mutex_unlock(&monitor_lock);
> >>
> >>
Markus Armbruster Aug. 20, 2018, 6:57 a.m. UTC | #5
Marc-André Lureau <marcandre.lureau@gmail.com> writes:

> Hi
> On Wed, Aug 1, 2018 at 5:09 PM Markus Armbruster <armbru@redhat.com> wrote:
>> [quoted commit message, backtrace, and patch snipped]
>> > Although unlikely, there is a chance the monitor list could be
>> > modified while we are flushing/cleaning up with the lock dropped. In
>> > that case we could miss the new monitors (if next is NULL).
>>
>> Your loop prevents that from happening while it runs, but does nothing
>> to stop it from happening afterwards.  If we want to lock out new
>> monitors, we need to make monitor_init() fail or impossible to call.
>
> Not so trivial. Are there other threads capable of calling
> monitor_init() by the time monitor_cleanup() is called? It looks like
> monitor_init() may only be called from the main thread.

Callers:

* gdbserver_start()

  CLI option -gdb, HMP command gdbserver, linux user CLI option -g and
  environment variable QEMU_GDB

  The interesting one is the HMP command.  Does your loop lock it out?
  If we run it only in the main thread, and we run the HMP command only
  in the main thread, it obviously does.

* mon_init_func()

  CLI option -mon and its convenience buddies -monitor, -qmp,
  -qmp-pretty

  We don't have a monitor command to spawn off a new monitor, but we
  could have.

* qemu_chr_new_noreplay()

  gdbserver_start() again, and qemu_chr_new(), which is called all over
  the place.  I lack the time to review these calls.  Are you sure this
  one can only run in the main thread?

Synchronizing monitor creation and cleanup explicitly might be cleaner.
I guess monitor_lock kind of sort of almost does that before your patch,
but it can deadlock because it's too coarse.

I'm afraid we need to rethink the set of locks protecting shared monitor
state.
Marc-André Lureau Aug. 20, 2018, 6:13 p.m. UTC | #6
Hi

On Mon, Aug 20, 2018 at 8:57 AM Markus Armbruster <armbru@redhat.com> wrote:
>
> Marc-André Lureau <marcandre.lureau@gmail.com> writes:
>
> > Hi
> [quoted thread snipped]
> >> >
> >> > Although unlikely, there is a chance the monitor list could be
> >> > modified while we are flushing/cleaning up with the lock dropped. In
> >> > that case we could miss the new monitors (if next is NULL).
> >>
> >> Your loop prevents that from happening while it runs, but does nothing
> >> to stop it from happening afterwards.  If we want to lock out new
> >> monitors, we need to make monitor_init() fail or impossible to call.
> >
> > Not so trivial. Are there other threads capable of calling
> > monitor_init() by the time monitor_cleanup() is called? It looks like
> > monitor_init() may only be called from the main thread.
>
> Callers:
>
> * gdbserver_start()
>
>   CLI option -gdb, HMP command gdbserver, linux user CLI option -g and
>   environment variable QEMU_GDB
>
>   The interesting one is the HMP command.  Does your loop lock it out?
>   If we run it only in the main thread, and we run the HMP command only
>   in the main thread, it obviously does.
>
> * mon_init_func()
>
>   CLI option -mon and its convenience buddies -monitor, -qmp,
>   -qmp-pretty
>
>   We don't have a monitor command to spawn off a new monitor, but we
>   could have.
>
> * qemu_chr_new_noreplay()
>
>   gdbserver_start() again, and qemu_chr_new(), which is called all over
>   the place.  I lack the time to review these calls.  Are you sure this
>   one can only run in the main thread?
>

No, I am not sure, but I would consider it a bug today. However, if
it's possible to keep using the monitor or to create new monitors after
monitor_cleanup() is called, we probably have more issues to solve.

However, this problem is not directly related to the dead-lock fixed
here, and it is pre-existing.

Probably the cleanup code would have to look different if we want to
solve the init/cleanup races, but that's a different fix. Do we have
to solve it first, or can we add a FIXME?

> Synchronizing monitor creation and cleanup explicitly might be cleaner.
> I guess monitor_lock kind of sort of almost does that before your patch,
> but it can deadlock because it's too coarse.
>
> I'm afraid we need to rethink the set of locks protecting shared monitor
> state.

Yes, and probably change monitor/chardev creation a bit to be under
tighter control...
Markus Armbruster Aug. 21, 2018, 5:07 a.m. UTC | #7
Marc-André Lureau <marcandre.lureau@gmail.com> writes:

> Hi
>
> [quoted thread snipped]
>> * qemu_chr_new_noreplay()
>>
>>   gdbserver_start() again, and qemu_chr_new(), which is called all over
>>   the place.  I lack the time to review these calls.  Are you sure this
>>   one can only run in the main thread?
>>
>
> No, I am not sure, but I would consider it a bug today. However, if
> it's possible to keep using the monitor or to create new monitors after
> monitor_cleanup() is called, we probably have more issues to solve.
>
> However, this problem is not directly related to the dead-lock fixed
> here, and it is pre-existing.
>
> Probably the cleanup code would have to look different if we want to
> solve the init/cleanup races, but that's a different fix. Do we have
> to solve it first, or can we add a FIXME?

FIXME is fine.

Can we go with the simpler fix I sketched?
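
That is, the unlock/relock variant from upthread, with the race
recorded as a FIXME; a sketch of the combination:

    /* Flush output buffers and destroy monitors */
    qemu_mutex_lock(&monitor_lock);
    QTAILQ_FOREACH_SAFE(mon, &mon_list, entry, next) {
        QTAILQ_REMOVE(&mon_list, mon, entry);
        /* FIXME: dropping the lock lets monitor_init() run concurrently;
         * monitors created from here on may be missed */
        qemu_mutex_unlock(&monitor_lock);
        monitor_flush(mon);
        monitor_data_destroy(mon);
        g_free(mon);
        qemu_mutex_lock(&monitor_lock);
    }
    qemu_mutex_unlock(&monitor_lock);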

>> Synchronizing monitor creation and cleanup explicitly might be cleaner.
>> I guess monitor_lock kind of sort of almost does that before your patch,
>> but it can deadlock because it's too coarse.
>>
>> I'm afraid we need to rethink the set of locks protecting shared monitor
>> state.
>
> Yes, and probably change monitor/chardev creation a bit to be under
> tighter control...

We're going to have fun...

Patch

diff --git a/monitor.c b/monitor.c
index 0fa0910a2a..a16a6c5311 100644
--- a/monitor.c
+++ b/monitor.c
@@ -4702,8 +4702,6 @@ void monitor_init(Chardev *chr, int flags)
 
 void monitor_cleanup(void)
 {
-    Monitor *mon, *next;
-
     /*
      * We need to explicitly stop the I/O thread (but not destroy it),
      * clean up the monitor resources, then destroy the I/O thread since
@@ -4719,14 +4717,24 @@ void monitor_cleanup(void)
     monitor_qmp_bh_responder(NULL);
 
     /* Flush output buffers and destroy monitors */
-    qemu_mutex_lock(&monitor_lock);
-    QTAILQ_FOREACH_SAFE(mon, &mon_list, entry, next) {
-        QTAILQ_REMOVE(&mon_list, mon, entry);
+    do {
+        Monitor *mon;
+
+        qemu_mutex_lock(&monitor_lock);
+        mon = QTAILQ_FIRST(&mon_list);
+        if (mon) {
+            QTAILQ_REMOVE(&mon_list, mon, entry);
+        }
+        qemu_mutex_unlock(&monitor_lock);
+
+        if (!mon) {
+            break;
+        }
+
         monitor_flush(mon);
         monitor_data_destroy(mon);
         g_free(mon);
-    }
-    qemu_mutex_unlock(&monitor_lock);
+    } while (true);
 
     /* QEMUBHs needs to be deleted before destroying the I/O thread */
     qemu_bh_delete(qmp_dispatcher_bh);