diff mbox

char: do not use atexit cleanup handler

Message ID 20160704153823.16879-1-marcandre.lureau@redhat.com
State New
Headers show

Commit Message

Marc-André Lureau July 4, 2016, 3:38 p.m. UTC
From: Marc-André Lureau <marcandre.lureau@redhat.com>

It turns out qemu is calling exit() in various places from various
threads without taking much care of resources state. The atexit()
cleanup handlers cannot easily destroy resources that are in use (by
the same thread or other).

Since c1111a24a3, TCG arm guests run into the following abort() when
running tests, the chardev mutex is locked during the write, so
qemu_mutex_destroy() returns an error:

 #0  0x00007fffdbb806f5 in raise () at /lib64/libc.so.6
 #1  0x00007fffdbb822fa in abort () at /lib64/libc.so.6
 #2  0x00005555557616fe in error_exit (err=<optimized out>, msg=msg@entry=0x555555c38c30 <__func__.14622> "qemu_mutex_destroy")
     at /home/drjones/code/qemu/util/qemu-thread-posix.c:39
 #3  0x0000555555b0be20 in qemu_mutex_destroy (mutex=mutex@entry=0x5555566aa0e0) at /home/drjones/code/qemu/util/qemu-thread-posix.c:57
 #4  0x00005555558aab00 in qemu_chr_free_common (chr=0x5555566aa0e0) at /home/drjones/code/qemu/qemu-char.c:4029
 #5  0x00005555558b05f9 in qemu_chr_delete (chr=<optimized out>) at /home/drjones/code/qemu/qemu-char.c:4038
 #6  0x00005555558b05f9 in qemu_chr_delete (chr=<optimized out>) at /home/drjones/code/qemu/qemu-char.c:4044
 #7  0x00005555558b062c in qemu_chr_cleanup () at /home/drjones/code/qemu/qemu-char.c:4557
 #8  0x00007fffdbb851e8 in __run_exit_handlers () at /lib64/libc.so.6
 #9  0x00007fffdbb85235 in  () at /lib64/libc.so.6
 #10 0x00005555558d1b39 in testdev_write (testdev=0x5555566aa0a0) at /home/drjones/code/qemu/backends/testdev.c:71
 #11 0x00005555558d1b39 in testdev_write (chr=<optimized out>, buf=0x7fffc343fd9a "", len=0) at /home/drjones/code/qemu/backends/testdev.c:95
 #12 0x00005555558adced in qemu_chr_fe_write (s=0x5555566aa0e0, buf=buf@entry=0x7fffc343fd98 "0q", len=len@entry=2) at /home/drjones/code/qemu/qemu-char.c:282

Instead of using a atexit() handler, only run the chardev cleanup as
initially proposed at the end of main(), where there are less chances
(hic) of conflicts or other races.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reported-by: Andrew Jones <drjones@redhat.com>
---
 include/sysemu/char.h | 7 +++++++
 qemu-char.c           | 4 +---
 vl.c                  | 1 +
 3 files changed, 9 insertions(+), 3 deletions(-)

Comments

Paolo Bonzini July 4, 2016, 3:49 p.m. UTC | #1
On 04/07/2016 17:38, marcandre.lureau@redhat.com wrote:
> From: Marc-André Lureau <marcandre.lureau@redhat.com>
> 
> It turns out qemu is calling exit() in various places from various
> threads without taking much care of resources state. The atexit()
> cleanup handlers cannot easily destroy resources that are in use (by
> the same thread or other).
> 
> Since c1111a24a3, TCG arm guests run into the following abort() when
> running tests, the chardev mutex is locked during the write, so
> qemu_mutex_destroy() returns an error:
> 
>  #0  0x00007fffdbb806f5 in raise () at /lib64/libc.so.6
>  #1  0x00007fffdbb822fa in abort () at /lib64/libc.so.6
>  #2  0x00005555557616fe in error_exit (err=<optimized out>, msg=msg@entry=0x555555c38c30 <__func__.14622> "qemu_mutex_destroy")
>      at /home/drjones/code/qemu/util/qemu-thread-posix.c:39
>  #3  0x0000555555b0be20 in qemu_mutex_destroy (mutex=mutex@entry=0x5555566aa0e0) at /home/drjones/code/qemu/util/qemu-thread-posix.c:57
>  #4  0x00005555558aab00 in qemu_chr_free_common (chr=0x5555566aa0e0) at /home/drjones/code/qemu/qemu-char.c:4029
>  #5  0x00005555558b05f9 in qemu_chr_delete (chr=<optimized out>) at /home/drjones/code/qemu/qemu-char.c:4038
>  #6  0x00005555558b05f9 in qemu_chr_delete (chr=<optimized out>) at /home/drjones/code/qemu/qemu-char.c:4044
>  #7  0x00005555558b062c in qemu_chr_cleanup () at /home/drjones/code/qemu/qemu-char.c:4557
>  #8  0x00007fffdbb851e8 in __run_exit_handlers () at /lib64/libc.so.6
>  #9  0x00007fffdbb85235 in  () at /lib64/libc.so.6
>  #10 0x00005555558d1b39 in testdev_write (testdev=0x5555566aa0a0) at /home/drjones/code/qemu/backends/testdev.c:71
>  #11 0x00005555558d1b39 in testdev_write (chr=<optimized out>, buf=0x7fffc343fd9a "", len=0) at /home/drjones/code/qemu/backends/testdev.c:95
>  #12 0x00005555558adced in qemu_chr_fe_write (s=0x5555566aa0e0, buf=buf@entry=0x7fffc343fd98 "0q", len=len@entry=2) at /home/drjones/code/qemu/qemu-char.c:282
> 
> Instead of using a atexit() handler, only run the chardev cleanup as
> initially proposed at the end of main(), where there are less chances
> (hic) of conflicts or other races.
> 
> Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> Reported-by: Andrew Jones <drjones@redhat.com>
> ---
>  include/sysemu/char.h | 7 +++++++
>  qemu-char.c           | 4 +---
>  vl.c                  | 1 +
>  3 files changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/include/sysemu/char.h b/include/sysemu/char.h
> index 57df10a..0ea9eac 100644
> --- a/include/sysemu/char.h
> +++ b/include/sysemu/char.h
> @@ -151,6 +151,13 @@ CharDriverState *qemu_chr_new(const char *label, const char *filename,
>   */
>  void qemu_chr_disconnect(CharDriverState *chr);
>  
> +/**
> + * @qemu_chr_cleanup:
> + *
> + * Delete all chardevs (when leaving qemu)
> + */
> +void qemu_chr_cleanup(void);
> +
>  /**
>   * @qemu_chr_new_noreplay:
>   *
> diff --git a/qemu-char.c b/qemu-char.c
> index b73969d..a542192 100644
> --- a/qemu-char.c
> +++ b/qemu-char.c
> @@ -4549,7 +4549,7 @@ void qmp_chardev_remove(const char *id, Error **errp)
>      qemu_chr_delete(chr);
>  }
>  
> -static void qemu_chr_cleanup(void)
> +void qemu_chr_cleanup(void)
>  {
>      CharDriverState *chr, *tmp;
>  
> @@ -4604,8 +4604,6 @@ static void register_types(void)
>       * is specified
>       */
>      qemu_add_machine_init_done_notifier(&muxes_realize_notify);
> -
> -    atexit(qemu_chr_cleanup);
>  }
>  
>  type_init(register_types);
> diff --git a/vl.c b/vl.c
> index 9bb7f4c..d0b9ff9 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -4596,6 +4596,7 @@ int main(int argc, char **argv, char **envp)
>  #ifdef CONFIG_TPM
>      tpm_cleanup();
>  #endif
> +    qemu_chr_cleanup();
>  
>      return 0;
>  }
> 

Queued, thanks.

Paolo
Daniel P. Berrangé July 4, 2016, 4:31 p.m. UTC | #2
On Mon, Jul 04, 2016 at 05:38:23PM +0200, marcandre.lureau@redhat.com wrote:
> From: Marc-André Lureau <marcandre.lureau@redhat.com>
> 
> It turns out qemu is calling exit() in various places from various
> threads without taking much care of resources state. The atexit()
> cleanup handlers cannot easily destroy resources that are in use (by
> the same thread or other).

[snip]

> Instead of using a atexit() handler, only run the chardev cleanup as
> initially proposed at the end of main(), where there are less chances
> (hic) of conflicts or other races.

This doesn't really seem all that much safer. There's still plenty of
chance that threads are running in the background at the end of the
main() method, so plenty of scope for the qemu_chr_cleanup() call to
cause threads to segv by destroying the chardevs they're using behind
their back.

IIUC, the original intent here was that we call unlink() on the UNIX
socket paths when QEMU exits.

Surely we can come up with a way to that, and only that, upon exit,
without actually having to free the chardev memory with all the risks
that entails.

eg, have a qemu_chr_close() method that closes & cleans up resources,
separately from actually free'ing the Chardev struct with all the
risk of crashing concurrent threads that entails.

> Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> Reported-by: Andrew Jones <drjones@redhat.com>
> ---
>  include/sysemu/char.h | 7 +++++++
>  qemu-char.c           | 4 +---
>  vl.c                  | 1 +
>  3 files changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/include/sysemu/char.h b/include/sysemu/char.h
> index 57df10a..0ea9eac 100644
> --- a/include/sysemu/char.h
> +++ b/include/sysemu/char.h
> @@ -151,6 +151,13 @@ CharDriverState *qemu_chr_new(const char *label, const char *filename,
>   */
>  void qemu_chr_disconnect(CharDriverState *chr);
>  
> +/**
> + * @qemu_chr_cleanup:
> + *
> + * Delete all chardevs (when leaving qemu)
> + */
> +void qemu_chr_cleanup(void);
> +
>  /**
>   * @qemu_chr_new_noreplay:
>   *
> diff --git a/qemu-char.c b/qemu-char.c
> index b73969d..a542192 100644
> --- a/qemu-char.c
> +++ b/qemu-char.c
> @@ -4549,7 +4549,7 @@ void qmp_chardev_remove(const char *id, Error **errp)
>      qemu_chr_delete(chr);
>  }
>  
> -static void qemu_chr_cleanup(void)
> +void qemu_chr_cleanup(void)
>  {
>      CharDriverState *chr, *tmp;
>  
> @@ -4604,8 +4604,6 @@ static void register_types(void)
>       * is specified
>       */
>      qemu_add_machine_init_done_notifier(&muxes_realize_notify);
> -
> -    atexit(qemu_chr_cleanup);
>  }
>  
>  type_init(register_types);
> diff --git a/vl.c b/vl.c
> index 9bb7f4c..d0b9ff9 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -4596,6 +4596,7 @@ int main(int argc, char **argv, char **envp)
>  #ifdef CONFIG_TPM
>      tpm_cleanup();
>  #endif
> +    qemu_chr_cleanup();
>  
>      return 0;

Regards,
Daniel
Marc-André Lureau July 4, 2016, 4:43 p.m. UTC | #3
Hi

On Mon, Jul 4, 2016 at 6:31 PM, Daniel P. Berrange <berrange@redhat.com> wrote:
> On Mon, Jul 04, 2016 at 05:38:23PM +0200, marcandre.lureau@redhat.com wrote:
>> From: Marc-André Lureau <marcandre.lureau@redhat.com>
>>
>> It turns out qemu is calling exit() in various places from various
>> threads without taking much care of resources state. The atexit()
>> cleanup handlers cannot easily destroy resources that are in use (by
>> the same thread or other).
>
> [snip]
>
>> Instead of using a atexit() handler, only run the chardev cleanup as
>> initially proposed at the end of main(), where there are less chances
>> (hic) of conflicts or other races.
>
> This doesn't really seem all that much safer. There's still plenty of
> chance that threads are running in the background at the end of the
> main() method, so plenty of scope for the qemu_chr_cleanup() call to
> cause threads to segv by destroying the chardevs they're using behind
> their back.

Yep yep

Note, just before the end of main() all CPU threads should be frozen.

Is there any reason why there is no support for joining an cleaning up
all the running threads at this point?

> IIUC, the original intent here was that we call unlink() on the UNIX
> socket paths when QEMU exits.
>
> Surely we can come up with a way to that, and only that, upon exit,
> without actually having to free the chardev memory with all the risks
> that entails.

Doesn't sound that simple if you can't take the lock. And when you do
you can't destroy it. It sounds hard to avoid races, so I might be
missing something.

> eg, have a qemu_chr_close() method that closes & cleans up resources,
> separately from actually free'ing the Chardev struct with all the
> risk of crashing concurrent threads that entails.

Closing half of the chardev resources could also lead to other kind of
errors (IO etc)..

>
>> Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
>> Reported-by: Andrew Jones <drjones@redhat.com>
>> ---
>>  include/sysemu/char.h | 7 +++++++
>>  qemu-char.c           | 4 +---
>>  vl.c                  | 1 +
>>  3 files changed, 9 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/sysemu/char.h b/include/sysemu/char.h
>> index 57df10a..0ea9eac 100644
>> --- a/include/sysemu/char.h
>> +++ b/include/sysemu/char.h
>> @@ -151,6 +151,13 @@ CharDriverState *qemu_chr_new(const char *label, const char *filename,
>>   */
>>  void qemu_chr_disconnect(CharDriverState *chr);
>>
>> +/**
>> + * @qemu_chr_cleanup:
>> + *
>> + * Delete all chardevs (when leaving qemu)
>> + */
>> +void qemu_chr_cleanup(void);
>> +
>>  /**
>>   * @qemu_chr_new_noreplay:
>>   *
>> diff --git a/qemu-char.c b/qemu-char.c
>> index b73969d..a542192 100644
>> --- a/qemu-char.c
>> +++ b/qemu-char.c
>> @@ -4549,7 +4549,7 @@ void qmp_chardev_remove(const char *id, Error **errp)
>>      qemu_chr_delete(chr);
>>  }
>>
>> -static void qemu_chr_cleanup(void)
>> +void qemu_chr_cleanup(void)
>>  {
>>      CharDriverState *chr, *tmp;
>>
>> @@ -4604,8 +4604,6 @@ static void register_types(void)
>>       * is specified
>>       */
>>      qemu_add_machine_init_done_notifier(&muxes_realize_notify);
>> -
>> -    atexit(qemu_chr_cleanup);
>>  }
>>
>>  type_init(register_types);
>> diff --git a/vl.c b/vl.c
>> index 9bb7f4c..d0b9ff9 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -4596,6 +4596,7 @@ int main(int argc, char **argv, char **envp)
>>  #ifdef CONFIG_TPM
>>      tpm_cleanup();
>>  #endif
>> +    qemu_chr_cleanup();
>>
>>      return 0;
>
> Regards,
> Daniel
> --
> |: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
> |: http://libvirt.org              -o-             http://virt-manager.org :|
> |: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
> |: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|
>
Paolo Bonzini July 4, 2016, 4:46 p.m. UTC | #4
On 04/07/2016 18:31, Daniel P. Berrange wrote:
>> > Instead of using a atexit() handler, only run the chardev cleanup as
>> > initially proposed at the end of main(), where there are less chances
>> > (hic) of conflicts or other races.
> This doesn't really seem all that much safer. There's still plenty of
> chance that threads are running in the background at the end of the
> main() method, so plenty of scope for the qemu_chr_cleanup() call to
> cause threads to segv by destroying the chardevs they're using behind
> their back.

At this point you have stopped all CPUs and block devices.  There is not
much else that is going on in QEMU at all, at this point.  The solution
would be to stop those threads.

Paolo

> IIUC, the original intent here was that we call unlink() on the UNIX
> socket paths when QEMU exits.
> 
> Surely we can come up with a way to that, and only that, upon exit,
> without actually having to free the chardev memory with all the risks
> that entails.
> 
> eg, have a qemu_chr_close() method that closes & cleans up resources,
> separately from actually free'ing the Chardev struct with all the
> risk of crashing concurrent threads that entails.
>
Daniel P. Berrangé July 4, 2016, 5:07 p.m. UTC | #5
On Mon, Jul 04, 2016 at 06:46:47PM +0200, Paolo Bonzini wrote:
> 
> 
> On 04/07/2016 18:31, Daniel P. Berrange wrote:
> >> > Instead of using a atexit() handler, only run the chardev cleanup as
> >> > initially proposed at the end of main(), where there are less chances
> >> > (hic) of conflicts or other races.
> > This doesn't really seem all that much safer. There's still plenty of
> > chance that threads are running in the background at the end of the
> > main() method, so plenty of scope for the qemu_chr_cleanup() call to
> > cause threads to segv by destroying the chardevs they're using behind
> > their back.
> 
> At this point you have stopped all CPUs and block devices.  There is not
> much else that is going on in QEMU at all, at this point.  The solution
> would be to stop those threads.

What about graphics threads ? In particular I'd be thinking of spice
which uses threads and chardevs.

Regards,
Daniel
Paolo Bonzini July 4, 2016, 5:08 p.m. UTC | #6
On 04/07/2016 19:07, Daniel P. Berrange wrote:
> > At this point you have stopped all CPUs and block devices.  There is not
> > much else that is going on in QEMU at all, at this point.  The solution
> > would be to stop those threads.
>
> What about graphics threads ? In particular I'd be thinking of spice
> which uses threads and chardevs.

I think it should be quiesced after pause_all_vcpus returns.  Marc-André
should know, but it's better to check with Gerd.

(FWIW, it's not so easy to shut down the threads because for example
some threads might belong to non-hotunpluggable devices).

Paolo
Daniel P. Berrangé July 4, 2016, 5:12 p.m. UTC | #7
On Mon, Jul 04, 2016 at 06:43:42PM +0200, Marc-André Lureau wrote:
> Hi
> 
> On Mon, Jul 4, 2016 at 6:31 PM, Daniel P. Berrange <berrange@redhat.com> wrote:
> > On Mon, Jul 04, 2016 at 05:38:23PM +0200, marcandre.lureau@redhat.com wrote:
> >> From: Marc-André Lureau <marcandre.lureau@redhat.com>
> >>
> >> It turns out qemu is calling exit() in various places from various
> >> threads without taking much care of resources state. The atexit()
> >> cleanup handlers cannot easily destroy resources that are in use (by
> >> the same thread or other).
> >
> > [snip]
> >
> >> Instead of using a atexit() handler, only run the chardev cleanup as
> >> initially proposed at the end of main(), where there are less chances
> >> (hic) of conflicts or other races.
> >
> > This doesn't really seem all that much safer. There's still plenty of
> > chance that threads are running in the background at the end of the
> > main() method, so plenty of scope for the qemu_chr_cleanup() call to
> > cause threads to segv by destroying the chardevs they're using behind
> > their back.
> 
> Yep yep
> 
> Note, just before the end of main() all CPU threads should be frozen.
> 
> Is there any reason why there is no support for joining an cleaning up
> all the running threads at this point?
> 
> > IIUC, the original intent here was that we call unlink() on the UNIX
> > socket paths when QEMU exits.
> >
> > Surely we can come up with a way to that, and only that, upon exit,
> > without actually having to free the chardev memory with all the risks
> > that entails.
> 
> Doesn't sound that simple if you can't take the lock. And when you do
> you can't destroy it. It sounds hard to avoid races, so I might be
> missing something.

The other option is to not try to do this at the chardev level at all.
Instead we could have the QIOChannelSocket impl maintain a list of UNIX
socket paths that need unlinking, and have it register an atexit() handler
itself, whose sole purpose is to unlink the paths. We could simply have a
global GList of char* UNIX socket paths, so the atexit handler would not
touch the QIOChannelSocket objects at all.  This avoiding all races or
issues with concurrent access.

Regards,
Daniel
Marc-Andre Lureau July 4, 2016, 5:19 p.m. UTC | #8
Hi

----- Original Message -----
> 
> 
> On 04/07/2016 19:07, Daniel P. Berrange wrote:
> > > At this point you have stopped all CPUs and block devices.  There is not
> > > much else that is going on in QEMU at all, at this point.  The solution
> > > would be to stop those threads.
> >
> > What about graphics threads ? In particular I'd be thinking of spice
> > which uses threads and chardevs.
> 
> I think it should be quiesced after pause_all_vcpus returns.  Marc-André
> should know, but it's better to check with Gerd.

In theory, spice_server_vm_stop() should be called at this point, and all chardev in spice are stopped too there, as well as the qxl worker processing thread (although the thread is not joined here neither..).

> (FWIW, it's not so easy to shut down the threads because for example
> some threads might belong to non-hotunpluggable devices).

ok
Gerd Hoffmann July 4, 2016, 7:53 p.m. UTC | #9
Hi,

> > > What about graphics threads ? In particular I'd be thinking of spice
> > > which uses threads and chardevs.
> > 
> > I think it should be quiesced after pause_all_vcpus returns.  Marc-André
> > should know, but it's better to check with Gerd.
> 
> In theory, spice_server_vm_stop() should be called at this point,

Yes, that should handle the qxl worker thread.

> and all chardev in spice are stopped too there, as well as the qxl
> worker processing thread (although the thread is not joined here
> neither..).

The chardevs are handled in iothread context anyway, so I don't think
they need any special care.

cheers,
  Gerd
Paolo Bonzini July 5, 2016, 8:24 a.m. UTC | #10
On 04/07/2016 21:53, Gerd Hoffmann wrote:
>   Hi,
> 
>>>> What about graphics threads ? In particular I'd be thinking of spice
>>>> which uses threads and chardevs.
>>>
>>> I think it should be quiesced after pause_all_vcpus returns.  Marc-André
>>> should know, but it's better to check with Gerd.
>>
>> In theory, spice_server_vm_stop() should be called at this point,
> 
> Yes, that should handle the qxl worker thread.
> 
>> and all chardev in spice are stopped too there, as well as the qxl
>> worker processing thread (although the thread is not joined here
>> neither..).
> 
> The chardevs are handled in iothread context anyway, so I don't think
> they need any special care.

Writes to chardevs can be done from non-iothread threads, but SPICE
doesn't do that.

Paolo
diff mbox

Patch

diff --git a/include/sysemu/char.h b/include/sysemu/char.h
index 57df10a..0ea9eac 100644
--- a/include/sysemu/char.h
+++ b/include/sysemu/char.h
@@ -151,6 +151,13 @@  CharDriverState *qemu_chr_new(const char *label, const char *filename,
  */
 void qemu_chr_disconnect(CharDriverState *chr);
 
+/**
+ * @qemu_chr_cleanup:
+ *
+ * Delete all chardevs (when leaving qemu)
+ */
+void qemu_chr_cleanup(void);
+
 /**
  * @qemu_chr_new_noreplay:
  *
diff --git a/qemu-char.c b/qemu-char.c
index b73969d..a542192 100644
--- a/qemu-char.c
+++ b/qemu-char.c
@@ -4549,7 +4549,7 @@  void qmp_chardev_remove(const char *id, Error **errp)
     qemu_chr_delete(chr);
 }
 
-static void qemu_chr_cleanup(void)
+void qemu_chr_cleanup(void)
 {
     CharDriverState *chr, *tmp;
 
@@ -4604,8 +4604,6 @@  static void register_types(void)
      * is specified
      */
     qemu_add_machine_init_done_notifier(&muxes_realize_notify);
-
-    atexit(qemu_chr_cleanup);
 }
 
 type_init(register_types);
diff --git a/vl.c b/vl.c
index 9bb7f4c..d0b9ff9 100644
--- a/vl.c
+++ b/vl.c
@@ -4596,6 +4596,7 @@  int main(int argc, char **argv, char **envp)
 #ifdef CONFIG_TPM
     tpm_cleanup();
 #endif
+    qemu_chr_cleanup();
 
     return 0;
 }