[RFC,v7,4/9] migration: fix some segmentation faults when using multifd

Message ID 20181101101715.9443-5-fli@suse.com
State New
Series qemu_thread_create: propagate errors to callers to check

Commit Message

Fei Li Nov. 1, 2018, 10:17 a.m. UTC
When multifd is used during migration, a segmentation fault will
occur in the source when multifd_save_cleanup() is called again if
multifd_send_state has already been freed by earlier error handling. This
can happen when migrate_fd_connect() fails and multifd_fd_cleanup()
is called, and then multifd_new_send_channel_async() fails and
multifd_save_cleanup() is called again.

If the QIOChannel *c of multifd_recv_state->params[i] (p->c) has not
been initialized, there is no need to close the channel; otherwise a
segmentation fault will occur in multifd_recv_terminate_threads()
when multifd_recv_initial_packet() fails.

Signed-off-by: Fei Li <fli@suse.com>
---
 migration/ram.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)
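
As a purely illustrative, standalone sketch of the guard this patch adds
(hypothetical names, not the actual QEMU code): cleanup that may run twice
simply returns once the shared state pointer has already been cleared.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for multifd_send_state: a global that an earlier
 * error path may already have freed. */
static struct { int channels; } *send_state;

static int save_cleanup(void)
{
    if (!send_state) {           /* mirrors the new !multifd_send_state check */
        return 0;                /* already cleaned up: nothing to do */
    }
    free(send_state);
    send_state = NULL;
    return 0;
}

int main(void)
{
    send_state = calloc(1, sizeof(*send_state));
    save_cleanup();              /* first error path frees the state */
    save_cleanup();              /* later failure path: guard avoids the crash */
    return 0;
}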

Comments

Peter Xu Nov. 2, 2018, 2:31 a.m. UTC | #1
On Thu, Nov 01, 2018 at 06:17:10PM +0800, Fei Li wrote:
> When multifd is used during migration, a segmentation fault will
> occur in the source when multifd_save_cleanup() is called again if
> the multifd_send_state has been freed in earlier error handling. This
> can happen when migrate_fd_connect() fails and multifd_fd_cleanup()
> is called, and then multifd_new_send_channel_async() fails and
> multifd_save_cleanup() is called again.
> 
> If the QIOChannel *c of multifd_recv_state->params[i] (p->c) is not
> initialized, there is no need to close the channel. Or else a
> segmentation fault will occur in multifd_recv_terminate_threads()
> when multifd_recv_initial_packet() fails.

It's a bit odd to me when I see that multifd_send_thread() calls
multifd_send_terminate_threads().  Is that the reason that you
encountered the problem?

Instead of checking all these null pointers, IMHO we should just let
multifd_send_terminate_threads() be called only in the main thread...
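
For illustration only, a minimal standalone sketch of that isolation, with
hypothetical names rather than the real QEMU helpers: the worker thread
merely records the failure, and only the main thread frees the shared state,
so the cleanup can never be re-entered from a worker.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for multifd_send_state. */
static struct { int channels; } *send_state;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool worker_failed;

static void *send_thread(void *opaque)
{
    (void)opaque;
    /* On error, do not call the cleanup routine from the worker ... */
    pthread_mutex_lock(&lock);
    worker_failed = true;        /* ... only flag the failure */
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t tid;

    send_state = calloc(1, sizeof(*send_state));
    pthread_create(&tid, NULL, send_thread, NULL);
    pthread_join(&tid, NULL);

    /* Only the main thread tears down the shared state. */
    if (worker_failed && send_state) {
        free(send_state);
        send_state = NULL;
        printf("main thread performed the cleanup\n");
    }
    return 0;
}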

> 
> Signed-off-by: Fei Li <fli@suse.com>
> ---
>  migration/ram.c | 28 +++++++++++++++++++++-------
>  1 file changed, 21 insertions(+), 7 deletions(-)
> 
> diff --git a/migration/ram.c b/migration/ram.c
> index 7e7deec4d8..4db3b3e8f4 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -907,6 +907,11 @@ static void multifd_send_terminate_threads(Error *err)
>          }
>      }
>  
> +    /* in case multifd_send_state has been freed earlier */
> +    if (!multifd_send_state) {
> +        return;
> +    }
> +
>      for (i = 0; i < migrate_multifd_channels(); i++) {
>          MultiFDSendParams *p = &multifd_send_state->params[i];
>  
> @@ -922,7 +927,7 @@ int multifd_save_cleanup(Error **errp)
>      int i;
>      int ret = 0;
>  
> -    if (!migrate_use_multifd()) {
> +    if (!migrate_use_multifd() || !multifd_send_state) {
>          return 0;
>      }
>      multifd_send_terminate_threads(NULL);
> @@ -960,7 +965,7 @@ static void multifd_send_sync_main(void)
>  {
>      int i;
>  
> -    if (!migrate_use_multifd()) {
> +    if (!migrate_use_multifd() || !multifd_send_state) {
>          return;
>      }
>      if (multifd_send_state->pages->used) {
> @@ -1070,6 +1075,10 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
>      QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
>      Error *local_err = NULL;
>  
> +    if (!multifd_send_state) {
> +        return;
> +    }
> +
>      if (qio_task_propagate_error(task, &local_err)) {
>          if (multifd_save_cleanup(&local_err) != 0) {
>              migrate_set_error(migrate_get_current(), local_err);
> @@ -1131,7 +1140,7 @@ struct {
>      uint64_t packet_num;
>  } *multifd_recv_state;
>  
> -static void multifd_recv_terminate_threads(Error *err)
> +static void multifd_recv_terminate_threads(Error *err, bool channel)
>  {
>      int i;
>  
> @@ -1145,6 +1154,11 @@ static void multifd_recv_terminate_threads(Error *err)
>          }
>      }
>  
> +    /* in case p->c is not initialized */
> +    if (!channel) {
> +        return;
> +    }
> +
>      for (i = 0; i < migrate_multifd_channels(); i++) {
>          MultiFDRecvParams *p = &multifd_recv_state->params[i];
>  
> @@ -1166,7 +1180,7 @@ int multifd_load_cleanup(Error **errp)
>      if (!migrate_use_multifd()) {
>          return 0;
>      }
> -    multifd_recv_terminate_threads(NULL);
> +    multifd_recv_terminate_threads(NULL, true);
>      for (i = 0; i < migrate_multifd_channels(); i++) {
>          MultiFDRecvParams *p = &multifd_recv_state->params[i];
>  
> @@ -1269,7 +1283,7 @@ static void *multifd_recv_thread(void *opaque)
>      }
>  
>      if (local_err) {
> -        multifd_recv_terminate_threads(local_err);
> +        multifd_recv_terminate_threads(local_err, true);
>      }
>      qemu_mutex_lock(&p->mutex);
>      p->running = false;
> @@ -1331,7 +1345,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc)
>  
>      id = multifd_recv_initial_packet(ioc, &local_err);
>      if (id < 0) {
> -        multifd_recv_terminate_threads(local_err);
> +        multifd_recv_terminate_threads(local_err, false);
>          return false;
>      }
>  
> @@ -1339,7 +1353,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc)
>      if (p->c != NULL) {
>          error_setg(&local_err, "multifd: received id '%d' already setup'",
>                     id);
> -        multifd_recv_terminate_threads(local_err);
> +        multifd_recv_terminate_threads(local_err, true);
>          return false;
>      }
>      p->c = ioc;
> -- 
> 2.13.7
> 

Regards,
Fei Li Nov. 2, 2018, 6:03 a.m. UTC | #2
On 11/02/2018 10:31 AM, Peter Xu wrote:
> On Thu, Nov 01, 2018 at 06:17:10PM +0800, Fei Li wrote:
>> When multifd is used during migration, a segmentation fault will
>> occur in the source when multifd_save_cleanup() is called again if
>> the multifd_send_state has been freed in earlier error handling. This
>> can happen when migrate_fd_connect() fails and multifd_fd_cleanup()
>> is called, and then multifd_new_send_channel_async() fails and
>> multifd_save_cleanup() is called again.
>>
>> If the QIOChannel *c of multifd_recv_state->params[i] (p->c) is not
>> initialized, there is no need to close the channel. Or else a
>> segmentation fault will occur in multifd_recv_terminate_threads()
>> when multifd_recv_initial_packet() fails.
> It's a bit odd to me when I see that multifd_send_thread() calls
> multifd_send_terminate_threads().  Is that the reason that you
> encountered the problem?
Yes, this is one of the reasons. Actually this problem almost never occurs
before this patch series, but since the series tries to make
qemu_thread_create() more robust, I found the problem while debugging. ;)

The second situation is when multifd is used (so that
multifd_new_send_channel_async()[1] is called several times): once one
channel fails in [1], the later channels hit the segmentation fault when
they enter [1] again.

The third is after applying the last patch: multifd_save_setup() =>
socket_send_channel_create(multifd_new_send_channel_async, p) succeeds,
but then qemu_thread_create(migration_thread) fails. I assume we need to
do some migration cleanup here, like migrate_fd_cleanup() or something
similar to the vm_start() in migration_iteration_finish()? (See the small
sketch below.)

The fourth is when multifd_new_send_channel_async()[1] fails and
multifd_save_cleanup() is called, and then multifd_send_sync_main() <=
qemu_savevm_state_setup() <= migration_thread()[2] is called. (BTW, I find
that [2] is sometimes called earlier than [1] and sometimes later than the
first channel.)
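
(To make the third case concrete, here is a minimal, self-contained sketch
of checking thread creation and falling back to a cleanup path; the names
are illustrative stand-ins, not the real qemu_thread_create() or
migrate_fd_cleanup().)

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static void *migration_thread(void *opaque)
{
    (void)opaque;
    return NULL;
}

/* Stand-in for the migration cleanup that would be needed on failure. */
static void fd_cleanup(void)
{
    printf("cleaning up after failed thread creation\n");
}

int main(void)
{
    pthread_t tid;
    int err = pthread_create(&tid, NULL, migration_thread, NULL);

    if (err) {
        /* Thread creation failed: report the error and run the cleanup
         * path instead of pretending the migration thread is running. */
        fprintf(stderr, "thread create failed: %s\n", strerror(err));
        fd_cleanup();
        return 1;
    }
    pthread_join(&tid, NULL);
    return 0;
}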
> Instead of checking all these null pointers, IMHO we should just let
> multifd_send_terminate_threads() be called only in the main thread...
Ok, from your reply to patch 5/9, I see that we should offer better
isolation and just let the main thread handle the cleanup. :)
>> Signed-off-by: Fei Li <fli@suse.com>
>> ---
>>   migration/ram.c | 28 +++++++++++++++++++++-------
>>   1 file changed, 21 insertions(+), 7 deletions(-)
>>
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 7e7deec4d8..4db3b3e8f4 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -907,6 +907,11 @@ static void multifd_send_terminate_threads(Error *err)
>>           }
>>       }
>>   
>> +    /* in case multifd_send_state has been freed earlier */
>> +    if (!multifd_send_state) {
>> +        return;
>> +    }
>> +
>>       for (i = 0; i < migrate_multifd_channels(); i++) {
>>           MultiFDSendParams *p = &multifd_send_state->params[i];
The above one is the first case.
>>   
>> @@ -922,7 +927,7 @@ int multifd_save_cleanup(Error **errp)
>>       int i;
>>       int ret = 0;
>>   
>> -    if (!migrate_use_multifd()) {
>> +    if (!migrate_use_multifd() || !multifd_send_state) {
>>           return 0;
>>       }
>>       multifd_send_terminate_threads(NULL);
The above one is the third case.
>> @@ -960,7 +965,7 @@ static void multifd_send_sync_main(void)
>>   {
>>       int i;
>>   
>> -    if (!migrate_use_multifd()) {
>> +    if (!migrate_use_multifd() || !multifd_send_state) {
>>           return;
>>       }
>>       if (multifd_send_state->pages->used) {
The above one is the fourth case.
>> @@ -1070,6 +1075,10 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
>>       QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
>>       Error *local_err = NULL;
>>   
>> +    if (!multifd_send_state) {
>> +        return;
>> +    }
>> +
>>       if (qio_task_propagate_error(task, &local_err)) {
>>           if (multifd_save_cleanup(&local_err) != 0) {
>>               migrate_set_error(migrate_get_current(), local_err);
The above one is the second case.
>> @@ -1131,7 +1140,7 @@ struct {
>>       uint64_t packet_num;
>>   } *multifd_recv_state;
The change below fixes the second paragraph (the p->c issue) of the commit message. :)
>>   
>> -static void multifd_recv_terminate_threads(Error *err)
>> +static void multifd_recv_terminate_threads(Error *err, bool channel)
>>   {
>>       int i;
>>   
>> @@ -1145,6 +1154,11 @@ static void multifd_recv_terminate_threads(Error *err)
>>           }
>>       }
>>   
>> +    /* in case p->c is not initialized */
>> +    if (!channel) {
>> +        return;
>> +    }
>> +
>>       for (i = 0; i < migrate_multifd_channels(); i++) {
>>           MultiFDRecvParams *p = &multifd_recv_state->params[i];
>>   
>> @@ -1166,7 +1180,7 @@ int multifd_load_cleanup(Error **errp)
>>       if (!migrate_use_multifd()) {
>>           return 0;
>>       }
>> -    multifd_recv_terminate_threads(NULL);
>> +    multifd_recv_terminate_threads(NULL, true);
>>       for (i = 0; i < migrate_multifd_channels(); i++) {
>>           MultiFDRecvParams *p = &multifd_recv_state->params[i];
>>   
>> @@ -1269,7 +1283,7 @@ static void *multifd_recv_thread(void *opaque)
>>       }
>>   
>>       if (local_err) {
>> -        multifd_recv_terminate_threads(local_err);
>> +        multifd_recv_terminate_threads(local_err, true);
>>       }
>>       qemu_mutex_lock(&p->mutex);
>>       p->running = false;
>> @@ -1331,7 +1345,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc)
>>   
>>       id = multifd_recv_initial_packet(ioc, &local_err);
>>       if (id < 0) {
>> -        multifd_recv_terminate_threads(local_err);
>> +        multifd_recv_terminate_threads(local_err, false);
>>           return false;
>>       }
>>   
>> @@ -1339,7 +1353,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc)
>>       if (p->c != NULL) {
>>           error_setg(&local_err, "multifd: received id '%d' already setup'",
>>                      id);
>> -        multifd_recv_terminate_threads(local_err);
>> +        multifd_recv_terminate_threads(local_err, true);
>>           return false;
>>       }
>>       p->c = ioc;
>> -- 
>> 2.13.7
>>
> Regards,
>
Have a nice day, thanks for the comment. :)
Fei

Patch

diff --git a/migration/ram.c b/migration/ram.c
index 7e7deec4d8..4db3b3e8f4 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -907,6 +907,11 @@  static void multifd_send_terminate_threads(Error *err)
         }
     }
 
+    /* in case multifd_send_state has been freed earlier */
+    if (!multifd_send_state) {
+        return;
+    }
+
     for (i = 0; i < migrate_multifd_channels(); i++) {
         MultiFDSendParams *p = &multifd_send_state->params[i];
 
@@ -922,7 +927,7 @@  int multifd_save_cleanup(Error **errp)
     int i;
     int ret = 0;
 
-    if (!migrate_use_multifd()) {
+    if (!migrate_use_multifd() || !multifd_send_state) {
         return 0;
     }
     multifd_send_terminate_threads(NULL);
@@ -960,7 +965,7 @@  static void multifd_send_sync_main(void)
 {
     int i;
 
-    if (!migrate_use_multifd()) {
+    if (!migrate_use_multifd() || !multifd_send_state) {
         return;
     }
     if (multifd_send_state->pages->used) {
@@ -1070,6 +1075,10 @@  static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
     Error *local_err = NULL;
 
+    if (!multifd_send_state) {
+        return;
+    }
+
     if (qio_task_propagate_error(task, &local_err)) {
         if (multifd_save_cleanup(&local_err) != 0) {
             migrate_set_error(migrate_get_current(), local_err);
@@ -1131,7 +1140,7 @@  struct {
     uint64_t packet_num;
 } *multifd_recv_state;
 
-static void multifd_recv_terminate_threads(Error *err)
+static void multifd_recv_terminate_threads(Error *err, bool channel)
 {
     int i;
 
@@ -1145,6 +1154,11 @@  static void multifd_recv_terminate_threads(Error *err)
         }
     }
 
+    /* in case p->c is not initialized */
+    if (!channel) {
+        return;
+    }
+
     for (i = 0; i < migrate_multifd_channels(); i++) {
         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 
@@ -1166,7 +1180,7 @@  int multifd_load_cleanup(Error **errp)
     if (!migrate_use_multifd()) {
         return 0;
     }
-    multifd_recv_terminate_threads(NULL);
+    multifd_recv_terminate_threads(NULL, true);
     for (i = 0; i < migrate_multifd_channels(); i++) {
         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 
@@ -1269,7 +1283,7 @@  static void *multifd_recv_thread(void *opaque)
     }
 
     if (local_err) {
-        multifd_recv_terminate_threads(local_err);
+        multifd_recv_terminate_threads(local_err, true);
     }
     qemu_mutex_lock(&p->mutex);
     p->running = false;
@@ -1331,7 +1345,7 @@  bool multifd_recv_new_channel(QIOChannel *ioc)
 
     id = multifd_recv_initial_packet(ioc, &local_err);
     if (id < 0) {
-        multifd_recv_terminate_threads(local_err);
+        multifd_recv_terminate_threads(local_err, false);
         return false;
     }
 
@@ -1339,7 +1353,7 @@  bool multifd_recv_new_channel(QIOChannel *ioc)
     if (p->c != NULL) {
         error_setg(&local_err, "multifd: received id '%d' already setup'",
                    id);
-        multifd_recv_terminate_threads(local_err);
+        multifd_recv_terminate_threads(local_err, true);
         return false;
     }
     p->c = ioc;