diff mbox series

[v3,3/3] char-socket: Lock tcp_chr_disconnect() and socket_reconnect_timeout()

Message ID 48f7e50413f6f222acb1bf3a9de773e387a2855a.1550842915.git.berto@igalia.com
State New
Headers show
Series char-socket: Fix race condition | expand

Commit Message

Alberto Garcia Feb. 22, 2019, 1:46 p.m. UTC
There's a race condition in which the tcp_chr_read() ioc handler can
close a connection that is being written to from another thread.

Running iotest 136 in a loop triggers this problem and crashes QEMU.

 (gdb) bt
 #0  0x00005558b842902d in object_get_class (obj=0x0) at qom/object.c:860
 #1  0x00005558b84f92db in qio_channel_writev_full (ioc=0x0, iov=0x7ffc355decf0, niov=1, fds=0x0, nfds=0, errp=0x0) at io/channel.c:76
 #2  0x00005558b84e0e9e in io_channel_send_full (ioc=0x0, buf=0x5558baf5beb0, len=138, fds=0x0, nfds=0) at chardev/char-io.c:123
 #3  0x00005558b84e4a69 in tcp_chr_write (chr=0x5558ba460380, buf=0x5558baf5beb0 "...", len=138) at chardev/char-socket.c:135
 #4  0x00005558b84dca55 in qemu_chr_write_buffer (s=0x5558ba460380, buf=0x5558baf5beb0 "...", len=138, offset=0x7ffc355dedd0, write_all=false) at chardev/char.c:112
 #5  0x00005558b84dcbc2 in qemu_chr_write (s=0x5558ba460380, buf=0x5558baf5beb0 "...", len=138, write_all=false) at chardev/char.c:147
 #6  0x00005558b84dfb26 in qemu_chr_fe_write (be=0x5558ba476610, buf=0x5558baf5beb0 "...", len=138) at chardev/char-fe.c:42
 #7  0x00005558b8088c86 in monitor_flush_locked (mon=0x5558ba476610) at monitor.c:406
 #8  0x00005558b8088e8c in monitor_puts (mon=0x5558ba476610, str=0x5558ba921e49 "") at monitor.c:449
 #9  0x00005558b8089178 in qmp_send_response (mon=0x5558ba476610, rsp=0x5558bb161600) at monitor.c:498
 #10 0x00005558b808920c in monitor_qapi_event_emit (event=QAPI_EVENT_SHUTDOWN, qdict=0x5558bb161600) at monitor.c:526
 #11 0x00005558b8089307 in monitor_qapi_event_queue_no_reenter (event=QAPI_EVENT_SHUTDOWN, qdict=0x5558bb161600) at monitor.c:551
 #12 0x00005558b80896c0 in qapi_event_emit (event=QAPI_EVENT_SHUTDOWN, qdict=0x5558bb161600) at monitor.c:626
 #13 0x00005558b855f23b in qapi_event_send_shutdown (guest=false, reason=SHUTDOWN_CAUSE_HOST_QMP_QUIT) at qapi/qapi-events-run-state.c:43
 #14 0x00005558b81911ef in qemu_system_shutdown (cause=SHUTDOWN_CAUSE_HOST_QMP_QUIT) at vl.c:1837
 #15 0x00005558b8191308 in main_loop_should_exit () at vl.c:1885
 #16 0x00005558b819140d in main_loop () at vl.c:1924
 #17 0x00005558b8198c84 in main (argc=18, argv=0x7ffc355df3f8, envp=0x7ffc355df490) at vl.c:4665

This patch adds a lock to protect tcp_chr_disconnect() and
socket_reconnect_timeout()

Signed-off-by: Alberto Garcia <berto@igalia.com>
---
 chardev/char-socket.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

Comments

Daniel P. Berrangé Feb. 22, 2019, 1:51 p.m. UTC | #1
On Fri, Feb 22, 2019 at 03:46:26PM +0200, Alberto Garcia wrote:
> There's a race condition in which the tcp_chr_read() ioc handler can
> close a connection that is being written to from another thread.
> 
> Running iotest 136 in a loop triggers this problem and crashes QEMU.
> 
>  (gdb) bt
>  #0  0x00005558b842902d in object_get_class (obj=0x0) at qom/object.c:860
>  #1  0x00005558b84f92db in qio_channel_writev_full (ioc=0x0, iov=0x7ffc355decf0, niov=1, fds=0x0, nfds=0, errp=0x0) at io/channel.c:76
>  #2  0x00005558b84e0e9e in io_channel_send_full (ioc=0x0, buf=0x5558baf5beb0, len=138, fds=0x0, nfds=0) at chardev/char-io.c:123
>  #3  0x00005558b84e4a69 in tcp_chr_write (chr=0x5558ba460380, buf=0x5558baf5beb0 "...", len=138) at chardev/char-socket.c:135
>  #4  0x00005558b84dca55 in qemu_chr_write_buffer (s=0x5558ba460380, buf=0x5558baf5beb0 "...", len=138, offset=0x7ffc355dedd0, write_all=false) at chardev/char.c:112
>  #5  0x00005558b84dcbc2 in qemu_chr_write (s=0x5558ba460380, buf=0x5558baf5beb0 "...", len=138, write_all=false) at chardev/char.c:147
>  #6  0x00005558b84dfb26 in qemu_chr_fe_write (be=0x5558ba476610, buf=0x5558baf5beb0 "...", len=138) at chardev/char-fe.c:42
>  #7  0x00005558b8088c86 in monitor_flush_locked (mon=0x5558ba476610) at monitor.c:406
>  #8  0x00005558b8088e8c in monitor_puts (mon=0x5558ba476610, str=0x5558ba921e49 "") at monitor.c:449
>  #9  0x00005558b8089178 in qmp_send_response (mon=0x5558ba476610, rsp=0x5558bb161600) at monitor.c:498
>  #10 0x00005558b808920c in monitor_qapi_event_emit (event=QAPI_EVENT_SHUTDOWN, qdict=0x5558bb161600) at monitor.c:526
>  #11 0x00005558b8089307 in monitor_qapi_event_queue_no_reenter (event=QAPI_EVENT_SHUTDOWN, qdict=0x5558bb161600) at monitor.c:551
>  #12 0x00005558b80896c0 in qapi_event_emit (event=QAPI_EVENT_SHUTDOWN, qdict=0x5558bb161600) at monitor.c:626
>  #13 0x00005558b855f23b in qapi_event_send_shutdown (guest=false, reason=SHUTDOWN_CAUSE_HOST_QMP_QUIT) at qapi/qapi-events-run-state.c:43
>  #14 0x00005558b81911ef in qemu_system_shutdown (cause=SHUTDOWN_CAUSE_HOST_QMP_QUIT) at vl.c:1837
>  #15 0x00005558b8191308 in main_loop_should_exit () at vl.c:1885
>  #16 0x00005558b819140d in main_loop () at vl.c:1924
>  #17 0x00005558b8198c84 in main (argc=18, argv=0x7ffc355df3f8, envp=0x7ffc355df490) at vl.c:4665
> 
> This patch adds a lock to protect tcp_chr_disconnect() and
> socket_reconnect_timeout()
> 
> Signed-off-by: Alberto Garcia <berto@igalia.com>
> ---
>  chardev/char-socket.c | 25 +++++++++++++++++++++----
>  1 file changed, 21 insertions(+), 4 deletions(-)

Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>


Regards,
Daniel
Max Reitz July 16, 2019, 9:56 a.m. UTC | #2
On 22.02.19 14:46, Alberto Garcia wrote:
> There's a race condition in which the tcp_chr_read() ioc handler can
> close a connection that is being written to from another thread.
> 
> Running iotest 136 in a loop triggers this problem and crashes QEMU.
> 
>  (gdb) bt
>  #0  0x00005558b842902d in object_get_class (obj=0x0) at qom/object.c:860
>  #1  0x00005558b84f92db in qio_channel_writev_full (ioc=0x0, iov=0x7ffc355decf0, niov=1, fds=0x0, nfds=0, errp=0x0) at io/channel.c:76
>  #2  0x00005558b84e0e9e in io_channel_send_full (ioc=0x0, buf=0x5558baf5beb0, len=138, fds=0x0, nfds=0) at chardev/char-io.c:123
>  #3  0x00005558b84e4a69 in tcp_chr_write (chr=0x5558ba460380, buf=0x5558baf5beb0 "...", len=138) at chardev/char-socket.c:135
>  #4  0x00005558b84dca55 in qemu_chr_write_buffer (s=0x5558ba460380, buf=0x5558baf5beb0 "...", len=138, offset=0x7ffc355dedd0, write_all=false) at chardev/char.c:112
>  #5  0x00005558b84dcbc2 in qemu_chr_write (s=0x5558ba460380, buf=0x5558baf5beb0 "...", len=138, write_all=false) at chardev/char.c:147
>  #6  0x00005558b84dfb26 in qemu_chr_fe_write (be=0x5558ba476610, buf=0x5558baf5beb0 "...", len=138) at chardev/char-fe.c:42
>  #7  0x00005558b8088c86 in monitor_flush_locked (mon=0x5558ba476610) at monitor.c:406
>  #8  0x00005558b8088e8c in monitor_puts (mon=0x5558ba476610, str=0x5558ba921e49 "") at monitor.c:449
>  #9  0x00005558b8089178 in qmp_send_response (mon=0x5558ba476610, rsp=0x5558bb161600) at monitor.c:498
>  #10 0x00005558b808920c in monitor_qapi_event_emit (event=QAPI_EVENT_SHUTDOWN, qdict=0x5558bb161600) at monitor.c:526
>  #11 0x00005558b8089307 in monitor_qapi_event_queue_no_reenter (event=QAPI_EVENT_SHUTDOWN, qdict=0x5558bb161600) at monitor.c:551
>  #12 0x00005558b80896c0 in qapi_event_emit (event=QAPI_EVENT_SHUTDOWN, qdict=0x5558bb161600) at monitor.c:626
>  #13 0x00005558b855f23b in qapi_event_send_shutdown (guest=false, reason=SHUTDOWN_CAUSE_HOST_QMP_QUIT) at qapi/qapi-events-run-state.c:43
>  #14 0x00005558b81911ef in qemu_system_shutdown (cause=SHUTDOWN_CAUSE_HOST_QMP_QUIT) at vl.c:1837
>  #15 0x00005558b8191308 in main_loop_should_exit () at vl.c:1885
>  #16 0x00005558b819140d in main_loop () at vl.c:1924
>  #17 0x00005558b8198c84 in main (argc=18, argv=0x7ffc355df3f8, envp=0x7ffc355df490) at vl.c:4665
> 
> This patch adds a lock to protect tcp_chr_disconnect() and
> socket_reconnect_timeout()
> 
> Signed-off-by: Alberto Garcia <berto@igalia.com>
> ---
>  chardev/char-socket.c | 25 +++++++++++++++++++++----
>  1 file changed, 21 insertions(+), 4 deletions(-)
> 
> diff --git a/chardev/char-socket.c b/chardev/char-socket.c
> index 4fcdd8aedd..94522777dc 100644
> --- a/chardev/char-socket.c
> +++ b/chardev/char-socket.c

[...]

> @@ -464,11 +464,19 @@ static void update_disconnected_filename(SocketChardev *s)
>      }
>  }
>  
> +static gboolean tcp_chr_be_event_closed(gpointer opaque)
> +{
> +    Chardev *chr = opaque;
> +    qemu_chr_be_event(chr, CHR_EVENT_CLOSED);
> +    return FALSE;
> +}
> +
>  /* NB may be called even if tcp_chr_connect has not been
>   * reached, due to TLS or telnet initialization failure,
>   * so can *not* assume s->state == TCP_CHARDEV_STATE_CONNECTED
> + * This must be called with chr->chr_write_lock held.
>   */
> -static void tcp_chr_disconnect(Chardev *chr)
> +static void tcp_chr_disconnect_locked(Chardev *chr)
>  {
>      SocketChardev *s = SOCKET_CHARDEV(chr);
>      bool emit_close = s->state == TCP_CHARDEV_STATE_CONNECTED;
> @@ -481,13 +489,20 @@ static void tcp_chr_disconnect(Chardev *chr)
>      }
>      update_disconnected_filename(s);
>      if (emit_close) {
> -        qemu_chr_be_event(chr, CHR_EVENT_CLOSED);
> +        qemu_idle_add(tcp_chr_be_event_closed, chr, chr->gcontext);

How does this guarantee that *chr is still valid when the callback is
invoked?

If I run tests/char-test, it fails.  With a bit of debugging output, it
looks like the the callback is scheduled with this function, then the
chardev is finalized, and only then is the callback invoked, accessing a
dangling pointer.

Or just with valgrind:

> /char/socket/server/mainloop/tcp: ==2114== Invalid write of size 4
> ==2114==    at 0x139F86: qemu_chr_be_event (char.c:70)
> ==2114==    by 0x13FE3D: tcp_chr_be_event_closed (char-socket.c:472)
> ==2114==    by 0x48ED7DA: ??? (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x48F0EDC: g_main_context_dispatch (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x158DC7: glib_pollfds_poll (main-loop.c:222)
> ==2114==    by 0x158DC7: os_host_main_loop_wait (main-loop.c:245)
> ==2114==    by 0x158DC7: main_loop_wait (main-loop.c:521)
> ==2114==    by 0x11EA06: char_socket_server_test (test-char.c:808)
> ==2114==    by 0x4918F9D: ??? (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x4918D43: ??? (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x4918D43: ??? (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x4918D43: ??? (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x4918D43: ??? (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x4919489: g_test_run_suite (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==  Address 0x5694434 is 116 bytes inside a block of size 312 free'd
> ==2114==    at 0x4839A0C: free (vg_replace_malloc.c:540)
> ==2114==    by 0x48F6D8C: g_free (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x1360E9: object_finalize (object.c:620)
> ==2114==    by 0x1360E9: object_unref (object.c:1070)
> ==2114==    by 0x134EC4: object_property_del_child.isra.0 (object.c:576)
> ==2114==    by 0x11F1EF: char_websock_test (test-char.c:461)
> ==2114==    by 0x4918F9D: ??? (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x4918D43: ??? (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x4919489: g_test_run_suite (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x49194A4: g_test_run (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x11A249: main (test-char.c:1459)
> ==2114==  Block was alloc'd at
> ==2114==    at 0x483880B: malloc (vg_replace_malloc.c:309)
> ==2114==    by 0x48F6C98: g_malloc (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x1355D0: object_new_with_type (object.c:631)
> ==2114==    by 0x13AEEA: qemu_chardev_new (char.c:959)
> ==2114==    by 0x13B227: qemu_chr_new_from_opts (char.c:680)
> ==2114==    by 0x13B3B3: qemu_chr_new_noreplay (char.c:726)
> ==2114==    by 0x13B469: qemu_chr_new_permit_mux_mon (char.c:748)
> ==2114==    by 0x11EF17: char_websock_test (test-char.c:415)
> ==2114==    by 0x4918F9D: ??? (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x4918D43: ??? (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x4919489: g_test_run_suite (in /usr/lib64/libglib-2.0.so.0.6000.4)
> ==2114==    by 0x49194A4: g_test_run (in /usr/lib64/libglib-2.0.so.0.6000.4)

(plus more of the same.)

Now I don’t know whether the test is just wrong and in normal use, the
callback would always be invoked before the object is destroyed (i.e.
the test just needs to poll or something).  But in any case, it is fishy
enough that there should be an explanatory comment at least.

Max

>      }
>      if (s->reconnect_time) {
>          qemu_chr_socket_restart_timer(chr);
>      }
>  }
>  
> +static void tcp_chr_disconnect(Chardev *chr)
> +{
> +    qemu_mutex_lock(&chr->chr_write_lock);
> +    tcp_chr_disconnect_locked(chr);
> +    qemu_mutex_unlock(&chr->chr_write_lock);
> +}
> +
>  static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
>  {
>      Chardev *chr = CHARDEV(opaque);
diff mbox series

Patch

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 4fcdd8aedd..94522777dc 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -148,7 +148,7 @@  static void tcp_chr_accept(QIONetListener *listener,
                            void *opaque);
 
 static int tcp_chr_read_poll(void *opaque);
-static void tcp_chr_disconnect(Chardev *chr);
+static void tcp_chr_disconnect_locked(Chardev *chr);
 
 /* Called with chr_write_lock held.  */
 static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len)
@@ -172,7 +172,7 @@  static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len)
 
         if (ret < 0 && errno != EAGAIN) {
             if (tcp_chr_read_poll(chr) <= 0) {
-                tcp_chr_disconnect(chr);
+                tcp_chr_disconnect_locked(chr);
                 return len;
             } /* else let the read handler finish it properly */
         }
@@ -464,11 +464,19 @@  static void update_disconnected_filename(SocketChardev *s)
     }
 }
 
+static gboolean tcp_chr_be_event_closed(gpointer opaque)
+{
+    Chardev *chr = opaque;
+    qemu_chr_be_event(chr, CHR_EVENT_CLOSED);
+    return FALSE;
+}
+
 /* NB may be called even if tcp_chr_connect has not been
  * reached, due to TLS or telnet initialization failure,
  * so can *not* assume s->state == TCP_CHARDEV_STATE_CONNECTED
+ * This must be called with chr->chr_write_lock held.
  */
-static void tcp_chr_disconnect(Chardev *chr)
+static void tcp_chr_disconnect_locked(Chardev *chr)
 {
     SocketChardev *s = SOCKET_CHARDEV(chr);
     bool emit_close = s->state == TCP_CHARDEV_STATE_CONNECTED;
@@ -481,13 +489,20 @@  static void tcp_chr_disconnect(Chardev *chr)
     }
     update_disconnected_filename(s);
     if (emit_close) {
-        qemu_chr_be_event(chr, CHR_EVENT_CLOSED);
+        qemu_idle_add(tcp_chr_be_event_closed, chr, chr->gcontext);
     }
     if (s->reconnect_time) {
         qemu_chr_socket_restart_timer(chr);
     }
 }
 
+static void tcp_chr_disconnect(Chardev *chr)
+{
+    qemu_mutex_lock(&chr->chr_write_lock);
+    tcp_chr_disconnect_locked(chr);
+    qemu_mutex_unlock(&chr->chr_write_lock);
+}
+
 static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
 {
     Chardev *chr = CHARDEV(opaque);
@@ -1128,8 +1143,10 @@  static gboolean socket_reconnect_timeout(gpointer opaque)
     Chardev *chr = CHARDEV(opaque);
     SocketChardev *s = SOCKET_CHARDEV(opaque);
 
+    qemu_mutex_lock(&chr->chr_write_lock);
     g_source_unref(s->reconnect_timer);
     s->reconnect_timer = NULL;
+    qemu_mutex_unlock(&chr->chr_write_lock);
 
     if (chr->be_open) {
         return false;