diff mbox series

[RFC,1/1] migration: Update error description whenever migration fails

Message ID 20230503203116.42429-2-tejus.gk@nutanix.com
State New
Headers show
Series migration: Update error description whenever migration fails | expand

Commit Message

Tejus GK May 3, 2023, 8:31 p.m. UTC
There are places in the code where the migration is marked failed with
MIGRATION_STATUS_FAILED, but the failure reason is never updated. Hence
libvirt doesn't know why the migration failed when it queries for it.

Signed-off-by: tejus.gk <tejus.gk@nutanix.com>
---
 migration/migration.c | 8 ++++++++
 1 file changed, 8 insertions(+)

Comments

Daniel P. Berrangé May 4, 2023, 8:16 a.m. UTC | #1
On Wed, May 03, 2023 at 08:31:16PM +0000, tejus.gk wrote:
> There are places in the code where the migration is marked failed with
> MIGRATION_STATUS_FAILED, but the failure reason is never updated. Hence
> libvirt doesn't know why the migration failed when it queries for it.
> 
> Signed-off-by: tejus.gk <tejus.gk@nutanix.com>
> ---
>  migration/migration.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index feb5ab7493..0d7d34bf4d 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1665,8 +1665,11 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
>          }
>          error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
>                     "a valid migration protocol");
> +        error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri",
> +                   "a valid migration protocol");
>          migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
>                            MIGRATION_STATUS_FAILED);
> +        migrate_set_error(s, local_err);
>          block_cleanup_parameters();
>          return;

Most of this "} else {" block is duplicating what is done in
the following "if (local_err)" block. As such I think this
should be deleted and replaced with merely

   } else {
        error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri",
                   "a valid migration protocol");
        block_cleanup_parameters();
   }

...so we just fall through to the local_err cleanup block.

>      }
> @@ -2059,6 +2062,7 @@ static int postcopy_start(MigrationState *ms)
>      int64_t bandwidth = migrate_max_postcopy_bandwidth();
>      bool restart_block = false;
>      int cur_state = MIGRATION_STATUS_ACTIVE;
> +    Error *local_err = NULL;
>  
>      if (migrate_postcopy_preempt()) {
>          migration_wait_main_channel(ms);
> @@ -2203,8 +2207,10 @@ static int postcopy_start(MigrationState *ms)
>      ret = qemu_file_get_error(ms->to_dst_file);
>      if (ret) {
>          error_report("postcopy_start: Migration stream errored");
> +        error_setg(&local_err, "postcopy_start: Migration stream errored");

There is an earlier place in this method which also calls
error_report which you've not changed to call migrate_set_error.

Even more crazy is that the caller of postcopy_start() also
calls error_report() but with a useless error message.

Also nothing is freeing the local_err object once set.

IMHO, the postcopy_start() method should be changed to accept
an "Error **errp" parameter, and then the caller should be
responsible for calling error_report_err and migrate_set_error


>          migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
>                                MIGRATION_STATUS_FAILED);
> +        migrate_set_error(ms, local_err);
>      }
>  
>      trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
> @@ -3233,7 +3239,9 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>      if (migrate_postcopy_ram() || migrate_return_path()) {
>          if (open_return_path_on_source(s, !resume)) {
>              error_report("Unable to open return-path for postcopy");
> +            error_setg(&local_err, "Unable to open return-path");

Having two different error messages is bad and again nothing frees
the local_err object. Remove the error_report call and have it call
error_report_err(&local_err) which does free the object.

>              migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
> +            migrate_set_error(s, local_err);
>              migrate_fd_cleanup(s);
>              return;
>          }
> -- 
> 2.22.3
> 
> 

With regards,
Daniel
Tejus GK May 5, 2023, 2:44 p.m. UTC | #2
On 04/05/23 1:46 pm, Daniel P. Berrangé wrote:
> On Wed, May 03, 2023 at 08:31:16PM +0000, tejus.gk wrote:
>> There are places in the code where the migration is marked failed with
>> MIGRATION_STATUS_FAILED, but the failure reason is never updated. Hence
>> libvirt doesn't know why the migration failed when it queries for it.
>>
>> Signed-off-by: tejus.gk <tejus.gk@nutanix.com>
>> ---
>>  migration/migration.c | 8 ++++++++
>>  1 file changed, 8 insertions(+)
>>
>> diff --git a/migration/migration.c b/migration/migration.c
>> index feb5ab7493..0d7d34bf4d 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -1665,8 +1665,11 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
>>          }
>>          error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
>>                     "a valid migration protocol");
>> +        error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri",
>> +                   "a valid migration protocol");
>>          migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
>>                            MIGRATION_STATUS_FAILED);
>> +        migrate_set_error(s, local_err);
>>          block_cleanup_parameters();
>>          return;
> 
> Most of this "} else {" block is duplicating what is done in
> the following "if (local_err)" block. As such I think this
> should be deleted and replaced with merely
> 
>    } else {
>         error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri",
>                    "a valid migration protocol");
>         block_cleanup_parameters();
>    }
> 
> ...so we just fall through to the local_err cleanup block.
Ack. Will modify this in the next patch.
> 
>>      }
>> @@ -2059,6 +2062,7 @@ static int postcopy_start(MigrationState *ms)
>>      int64_t bandwidth = migrate_max_postcopy_bandwidth();
>>      bool restart_block = false;
>>      int cur_state = MIGRATION_STATUS_ACTIVE;
>> +    Error *local_err = NULL;
>>  
>>      if (migrate_postcopy_preempt()) {
>>          migration_wait_main_channel(ms);
>> @@ -2203,8 +2207,10 @@ static int postcopy_start(MigrationState *ms)
>>      ret = qemu_file_get_error(ms->to_dst_file);
>>      if (ret) {
>>          error_report("postcopy_start: Migration stream errored");
>> +        error_setg(&local_err, "postcopy_start: Migration stream errored");
> 
> There is an earlier place in this method which also calls
> error_report which you've not changed to call migrate_set_error.
> 
Ack, will fix this in the next patch. 
> Even more crazy is that the caller of postcopy_start() also
> calls error_report() but with a useless error message.
> 
> Also nothing is freeing the local_err object once set.
> 
> IMHO, the postcopy_start() method should be changed to accept
> an "Error **errp" parameter, and then the caller should be
> responsible for calling error_report_err and migrate_set_error
Ack, will modify this in the next patch. 
> 
> 
>>          migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
>>                                MIGRATION_STATUS_FAILED);
>> +        migrate_set_error(ms, local_err);
>>      }
>>  
>>      trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
>> @@ -3233,7 +3239,9 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>      if (migrate_postcopy_ram() || migrate_return_path()) {
>>          if (open_return_path_on_source(s, !resume)) {
>>              error_report("Unable to open return-path for postcopy");
>> +            error_setg(&local_err, "Unable to open return-path");
> 
> Having two different error messages is bad and again nothing frees
> the local_err object. Remove the error_report call and have it call
> error_report_err(&local_err) which does free the object.
My bad, missed this. Will fix this in the next patch. 
> 
>>              migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
>> +            migrate_set_error(s, local_err);
>>              migrate_fd_cleanup(s);
>>              return;
>>          }
>> -- 
>> 2.22.3
>>
>>
> 
> With regards,
> Daniel

Hi, 
Thanks for the reviews. I'll be sending a revision with the fixes shortly. Meanwhile, I wanted to get something clarified. Apart from the places this patch covers, there are also places in the code where the migration is marked as failed, yet an error_report() call either does not happen or happens in a different file. An example of the latter can be seen in the function migration_completion() in migration.c, where:

        ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
                                                         s->block_inactive);
            }
        }
        qemu_mutex_unlock_iothread();

        if (ret < 0) {
            goto fail;
        }

and if we take a look at fail:

        fail:
              migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_FAILED);

In this instance, the error_report() call for a possible failure while saving the vmstate is being done in the file vmstate.c. I wanted to ask if doing a migrate_set_error() in a different file (vmstate.c in this case) is permissible?

regards,
tejus
diff mbox series

Patch

diff --git a/migration/migration.c b/migration/migration.c
index feb5ab7493..0d7d34bf4d 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1665,8 +1665,11 @@  void qmp_migrate(const char *uri, bool has_blk, bool blk,
         }
         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
                    "a valid migration protocol");
+        error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri",
+                   "a valid migration protocol");
         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                           MIGRATION_STATUS_FAILED);
+        migrate_set_error(s, local_err);
         block_cleanup_parameters();
         return;
     }
@@ -2059,6 +2062,7 @@  static int postcopy_start(MigrationState *ms)
     int64_t bandwidth = migrate_max_postcopy_bandwidth();
     bool restart_block = false;
     int cur_state = MIGRATION_STATUS_ACTIVE;
+    Error *local_err = NULL;
 
     if (migrate_postcopy_preempt()) {
         migration_wait_main_channel(ms);
@@ -2203,8 +2207,10 @@  static int postcopy_start(MigrationState *ms)
     ret = qemu_file_get_error(ms->to_dst_file);
     if (ret) {
         error_report("postcopy_start: Migration stream errored");
+        error_setg(&local_err, "postcopy_start: Migration stream errored");
         migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                               MIGRATION_STATUS_FAILED);
+        migrate_set_error(ms, local_err);
     }
 
     trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
@@ -3233,7 +3239,9 @@  void migrate_fd_connect(MigrationState *s, Error *error_in)
     if (migrate_postcopy_ram() || migrate_return_path()) {
         if (open_return_path_on_source(s, !resume)) {
             error_report("Unable to open return-path for postcopy");
+            error_setg(&local_err, "Unable to open return-path");
             migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
+            migrate_set_error(s, local_err);
             migrate_fd_cleanup(s);
             return;
         }