diff mbox series

[v9,02/12] migration: Improve migration thread error handling

Message ID 20171004104636.7963-3-quintela@redhat.com
State New
Headers show
Series Multifd | expand

Commit Message

Juan Quintela Oct. 4, 2017, 10:46 a.m. UTC
We now also report errors when migration finishes, not only on "info
migrate".  We plan to set this error from several places, and we want
the first error that happens to win, so we add a mutex to serialize it.

Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 migration/migration.c | 19 ++++++++++++++++---
 migration/migration.h |  7 ++++++-
 migration/tls.c       |  1 -
 3 files changed, 22 insertions(+), 5 deletions(-)

Comments

Peter Xu Oct. 9, 2017, 9:28 a.m. UTC | #1
On Wed, Oct 04, 2017 at 12:46:26PM +0200, Juan Quintela wrote:

[...]

> diff --git a/migration/tls.c b/migration/tls.c
> index 596e8790bd..026a008667 100644
> --- a/migration/tls.c
> +++ b/migration/tls.c
> @@ -119,7 +119,6 @@ static void migration_tls_outgoing_handshake(QIOTask *task,
>      if (qio_task_propagate_error(task, &err)) {
>          trace_migration_tls_outgoing_handshake_error(error_get_pretty(err));
>          migrate_fd_error(s, err);
> -        error_free(err);

Would err be leaked if this line is removed?

>      } else {
>          trace_migration_tls_outgoing_handshake_complete();
>          migration_channel_connect(s, ioc, NULL);
> -- 
> 2.13.5
>
Dr. David Alan Gilbert Oct. 16, 2017, 5:34 p.m. UTC | #2
* Juan Quintela (quintela@redhat.com) wrote:
> We now report errors also when we finish migration, not only on info
> migrate.  We plan to use this error from several places, and we want
> the first error to happen to win, so we add an mutex to order it.
> 
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  migration/migration.c | 19 ++++++++++++++++---
>  migration/migration.h |  7 ++++++-
>  migration/tls.c       |  1 -
>  3 files changed, 22 insertions(+), 5 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 98429dc843..468f51cfa7 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1071,19 +1071,30 @@ static void migrate_fd_cleanup(void *opaque)
>                            MIGRATION_STATUS_CANCELLED);
>      }
>  
> +    if (s->error) {
> +        /* It is used on info migrate.  We can't free it */
> +        error_report_err(error_copy(s->error));
> +    }
>      notifier_list_notify(&migration_state_notifiers, s);
>      block_cleanup_parameters(s);
>  }
>  
> +void migrate_set_error(MigrationState *s, const Error *error)

If you find you need to resend this, please add a comment on this
function saying it takes a copy and it's up to the caller to free
the error they pass in.

Dave

> +{
> +    qemu_mutex_lock(&s->error_mutex);
> +    if (!s->error) {
> +        s->error = error_copy(error);
> +    }
> +    qemu_mutex_unlock(&s->error_mutex);
> +}
> +
>  void migrate_fd_error(MigrationState *s, const Error *error)
>  {
>      trace_migrate_fd_error(error_get_pretty(error));
>      assert(s->to_dst_file == NULL);
>      migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
>                        MIGRATION_STATUS_FAILED);
> -    if (!s->error) {
> -        s->error = error_copy(error);
> -    }
> +    migrate_set_error(s, error);
>      notifier_list_notify(&migration_state_notifiers, s);
>      block_cleanup_parameters(s);
>  }
> @@ -2355,6 +2366,7 @@ static void migration_instance_finalize(Object *obj)
>      MigrationState *ms = MIGRATION_OBJ(obj);
>      MigrationParameters *params = &ms->parameters;
>  
> +    qemu_mutex_destroy(&ms->error_mutex);
>      g_free(params->tls_hostname);
>      g_free(params->tls_creds);
>  }
> @@ -2367,6 +2379,7 @@ static void migration_instance_init(Object *obj)
>      ms->state = MIGRATION_STATUS_NONE;
>      ms->xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE;
>      ms->mbps = -1;
> +    qemu_mutex_init(&ms->error_mutex);
>  
>      params->tls_hostname = g_strdup("");
>      params->tls_creds = g_strdup("");
> diff --git a/migration/migration.h b/migration/migration.h
> index b83cceadc4..51c0ac2e71 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -129,8 +129,12 @@ struct MigrationState
>      int64_t colo_checkpoint_time;
>      QEMUTimer *colo_delay_timer;
>  
> -    /* The last error that occurred */
> +    /* The first error that has occurred.
> +       We used the mutex to be able to return the 1st error message */
>      Error *error;
> +    /* mutex to protect errp */
> +    QemuMutex error_mutex;
> +
>      /* Do we have to clean up -b/-i from old migrate parameters */
>      /* This feature is deprecated and will be removed */
>      bool must_remove_block_options;
> @@ -159,6 +163,7 @@ bool  migration_has_all_channels(void);
>  
>  uint64_t migrate_max_downtime(void);
>  
> +void migrate_set_error(MigrationState *s, const Error *error);
>  void migrate_fd_error(MigrationState *s, const Error *error);
>  
>  void migrate_fd_connect(MigrationState *s);
> diff --git a/migration/tls.c b/migration/tls.c
> index 596e8790bd..026a008667 100644
> --- a/migration/tls.c
> +++ b/migration/tls.c
> @@ -119,7 +119,6 @@ static void migration_tls_outgoing_handshake(QIOTask *task,
>      if (qio_task_propagate_error(task, &err)) {
>          trace_migration_tls_outgoing_handshake_error(error_get_pretty(err));
>          migrate_fd_error(s, err);
> -        error_free(err);
>      } else {
>          trace_migration_tls_outgoing_handshake_complete();
>          migration_channel_connect(s, ioc, NULL);
> -- 
> 2.13.5
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert Oct. 16, 2017, 5:48 p.m. UTC | #3
* Dr. David Alan Gilbert (dgilbert@redhat.com) wrote:
> * Juan Quintela (quintela@redhat.com) wrote:
> > We now report errors also when we finish migration, not only on info
> > migrate.  We plan to use this error from several places, and we want
> > the first error to happen to win, so we add an mutex to order it.
> > 
> > Signed-off-by: Juan Quintela <quintela@redhat.com>
> > Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  migration/migration.c | 19 ++++++++++++++++---
> >  migration/migration.h |  7 ++++++-
> >  migration/tls.c       |  1 -
> >  3 files changed, 22 insertions(+), 5 deletions(-)
> > 
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 98429dc843..468f51cfa7 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -1071,19 +1071,30 @@ static void migrate_fd_cleanup(void *opaque)
> >                            MIGRATION_STATUS_CANCELLED);
> >      }
> >  
> > +    if (s->error) {
> > +        /* It is used on info migrate.  We can't free it */
> > +        error_report_err(error_copy(s->error));
> > +    }
> >      notifier_list_notify(&migration_state_notifiers, s);
> >      block_cleanup_parameters(s);
> >  }
> >  
> > +void migrate_set_error(MigrationState *s, const Error *error)
> 
> If you find you need to resend this, please add a comment on this
> function saying it takes a copy and it's upto the caller to free
> the error they pass in.

Oops, ignore that, I see you change it in the next one.

> Dave
> 
> > +{
> > +    qemu_mutex_lock(&s->error_mutex);
> > +    if (!s->error) {
> > +        s->error = error_copy(error);
> > +    }
> > +    qemu_mutex_unlock(&s->error_mutex);
> > +}
> > +
> >  void migrate_fd_error(MigrationState *s, const Error *error)
> >  {
> >      trace_migrate_fd_error(error_get_pretty(error));
> >      assert(s->to_dst_file == NULL);
> >      migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
> >                        MIGRATION_STATUS_FAILED);
> > -    if (!s->error) {
> > -        s->error = error_copy(error);
> > -    }
> > +    migrate_set_error(s, error);
> >      notifier_list_notify(&migration_state_notifiers, s);
> >      block_cleanup_parameters(s);
> >  }
> > @@ -2355,6 +2366,7 @@ static void migration_instance_finalize(Object *obj)
> >      MigrationState *ms = MIGRATION_OBJ(obj);
> >      MigrationParameters *params = &ms->parameters;
> >  
> > +    qemu_mutex_destroy(&ms->error_mutex);
> >      g_free(params->tls_hostname);
> >      g_free(params->tls_creds);
> >  }
> > @@ -2367,6 +2379,7 @@ static void migration_instance_init(Object *obj)
> >      ms->state = MIGRATION_STATUS_NONE;
> >      ms->xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE;
> >      ms->mbps = -1;
> > +    qemu_mutex_init(&ms->error_mutex);
> >  
> >      params->tls_hostname = g_strdup("");
> >      params->tls_creds = g_strdup("");
> > diff --git a/migration/migration.h b/migration/migration.h
> > index b83cceadc4..51c0ac2e71 100644
> > --- a/migration/migration.h
> > +++ b/migration/migration.h
> > @@ -129,8 +129,12 @@ struct MigrationState
> >      int64_t colo_checkpoint_time;
> >      QEMUTimer *colo_delay_timer;
> >  
> > -    /* The last error that occurred */
> > +    /* The first error that has occurred.
> > +       We used the mutex to be able to return the 1st error message */
> >      Error *error;
> > +    /* mutex to protect errp */
> > +    QemuMutex error_mutex;
> > +
> >      /* Do we have to clean up -b/-i from old migrate parameters */
> >      /* This feature is deprecated and will be removed */
> >      bool must_remove_block_options;
> > @@ -159,6 +163,7 @@ bool  migration_has_all_channels(void);
> >  
> >  uint64_t migrate_max_downtime(void);
> >  
> > +void migrate_set_error(MigrationState *s, const Error *error);
> >  void migrate_fd_error(MigrationState *s, const Error *error);
> >  
> >  void migrate_fd_connect(MigrationState *s);
> > diff --git a/migration/tls.c b/migration/tls.c
> > index 596e8790bd..026a008667 100644
> > --- a/migration/tls.c
> > +++ b/migration/tls.c
> > @@ -119,7 +119,6 @@ static void migration_tls_outgoing_handshake(QIOTask *task,
> >      if (qio_task_propagate_error(task, &err)) {
> >          trace_migration_tls_outgoing_handshake_error(error_get_pretty(err));
> >          migrate_fd_error(s, err);
> > -        error_free(err);
> >      } else {
> >          trace_migration_tls_outgoing_handshake_complete();
> >          migration_channel_connect(s, ioc, NULL);
> > -- 
> > 2.13.5
> > 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox series

Patch

diff --git a/migration/migration.c b/migration/migration.c
index 98429dc843..468f51cfa7 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1071,19 +1071,30 @@  static void migrate_fd_cleanup(void *opaque)
                           MIGRATION_STATUS_CANCELLED);
     }
 
+    if (s->error) {
+        /* It is used on info migrate.  We can't free it */
+        error_report_err(error_copy(s->error));
+    }
     notifier_list_notify(&migration_state_notifiers, s);
     block_cleanup_parameters(s);
 }
 
+void migrate_set_error(MigrationState *s, const Error *error)
+{
+    qemu_mutex_lock(&s->error_mutex);
+    if (!s->error) {
+        s->error = error_copy(error);
+    }
+    qemu_mutex_unlock(&s->error_mutex);
+}
+
 void migrate_fd_error(MigrationState *s, const Error *error)
 {
     trace_migrate_fd_error(error_get_pretty(error));
     assert(s->to_dst_file == NULL);
     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                       MIGRATION_STATUS_FAILED);
-    if (!s->error) {
-        s->error = error_copy(error);
-    }
+    migrate_set_error(s, error);
     notifier_list_notify(&migration_state_notifiers, s);
     block_cleanup_parameters(s);
 }
@@ -2355,6 +2366,7 @@  static void migration_instance_finalize(Object *obj)
     MigrationState *ms = MIGRATION_OBJ(obj);
     MigrationParameters *params = &ms->parameters;
 
+    qemu_mutex_destroy(&ms->error_mutex);
     g_free(params->tls_hostname);
     g_free(params->tls_creds);
 }
@@ -2367,6 +2379,7 @@  static void migration_instance_init(Object *obj)
     ms->state = MIGRATION_STATUS_NONE;
     ms->xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE;
     ms->mbps = -1;
+    qemu_mutex_init(&ms->error_mutex);
 
     params->tls_hostname = g_strdup("");
     params->tls_creds = g_strdup("");
diff --git a/migration/migration.h b/migration/migration.h
index b83cceadc4..51c0ac2e71 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -129,8 +129,12 @@  struct MigrationState
     int64_t colo_checkpoint_time;
     QEMUTimer *colo_delay_timer;
 
-    /* The last error that occurred */
+    /* The first error that has occurred.
+       We used the mutex to be able to return the 1st error message */
     Error *error;
+    /* mutex to protect errp */
+    QemuMutex error_mutex;
+
     /* Do we have to clean up -b/-i from old migrate parameters */
     /* This feature is deprecated and will be removed */
     bool must_remove_block_options;
@@ -159,6 +163,7 @@  bool  migration_has_all_channels(void);
 
 uint64_t migrate_max_downtime(void);
 
+void migrate_set_error(MigrationState *s, const Error *error);
 void migrate_fd_error(MigrationState *s, const Error *error);
 
 void migrate_fd_connect(MigrationState *s);
diff --git a/migration/tls.c b/migration/tls.c
index 596e8790bd..026a008667 100644
--- a/migration/tls.c
+++ b/migration/tls.c
@@ -119,7 +119,6 @@  static void migration_tls_outgoing_handshake(QIOTask *task,
     if (qio_task_propagate_error(task, &err)) {
         trace_migration_tls_outgoing_handshake_error(error_get_pretty(err));
         migrate_fd_error(s, err);
-        error_free(err);
     } else {
         trace_migration_tls_outgoing_handshake_complete();
         migration_channel_connect(s, ioc, NULL);