diff mbox series

[V3,4/7] Migration/colo.c: Add new COLOExitReason to handle all failover state

Message ID 20190303145021.2962-5-chen.zhang@intel.com
State New
Headers show
Series Migration/colo: Fix upstream bugs when occur failover | expand

Commit Message

Zhang, Chen March 3, 2019, 2:50 p.m. UTC
From: Zhang Chen <chen.zhang@intel.com>

In this patch we add the processing state for COLOExitReason,
because we have to identify COLO in the failover processing state or
failover error state. In the way, we can handle all the failover state.
We have improved the description of the COLOExitReason by the way.

Signed-off-by: Zhang Chen <chen.zhang@intel.com>
---
 migration/colo.c    | 24 +++++++++++++-----------
 qapi/migration.json | 15 +++++++++------
 2 files changed, 22 insertions(+), 17 deletions(-)

Comments

Dr. David Alan Gilbert March 8, 2019, 5:39 p.m. UTC | #1
* Zhang Chen (chen.zhang@intel.com) wrote:
> From: Zhang Chen <chen.zhang@intel.com>
> 
> In this patch we add the processing state for COLOExitReason,
> because we have to identify COLO in the failover processing state or
> failover error state. In the way, we can handle all the failover state.
> We have improved the description of the COLOExitReason by the way.
> 
> Signed-off-by: Zhang Chen <chen.zhang@intel.com>

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

> ---
>  migration/colo.c    | 24 +++++++++++++-----------
>  qapi/migration.json | 15 +++++++++------
>  2 files changed, 22 insertions(+), 17 deletions(-)
> 
> diff --git a/migration/colo.c b/migration/colo.c
> index 89325952c7..dbe2b88807 100644
> --- a/migration/colo.c
> +++ b/migration/colo.c
> @@ -267,7 +267,11 @@ COLOStatus *qmp_query_colo_status(Error **errp)
>          s->reason = COLO_EXIT_REASON_REQUEST;
>          break;
>      default:
> -        s->reason = COLO_EXIT_REASON_ERROR;
> +        if (migration_in_colo_state()) {
> +            s->reason = COLO_EXIT_REASON_PROCESSING;
> +        } else {
> +            s->reason = COLO_EXIT_REASON_ERROR;
> +        }
>      }
>  
>      return s;
> @@ -579,16 +583,13 @@ out:
>       * or the user triggered failover.
>       */
>      switch (failover_get_state()) {
> -    case FAILOVER_STATUS_NONE:
> -        qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
> -                                  COLO_EXIT_REASON_ERROR);
> -        break;
>      case FAILOVER_STATUS_COMPLETED:
>          qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
>                                    COLO_EXIT_REASON_REQUEST);
>          break;
>      default:
> -        abort();
> +        qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
> +                                  COLO_EXIT_REASON_ERROR);
>      }
>  
>      /* Hope this not to be too long to wait here */
> @@ -850,17 +851,18 @@ out:
>          error_report_err(local_err);
>      }
>  
> +    /*
> +     * There are only two reasons we can get here, some error happened
> +     * or the user triggered failover.
> +     */
>      switch (failover_get_state()) {
> -    case FAILOVER_STATUS_NONE:
> -        qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
> -                                  COLO_EXIT_REASON_ERROR);
> -        break;
>      case FAILOVER_STATUS_COMPLETED:
>          qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
>                                    COLO_EXIT_REASON_REQUEST);
>          break;
>      default:
> -        abort();
> +        qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
> +                                  COLO_EXIT_REASON_ERROR);
>      }
>  
>      if (fb) {
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 7a795ecc16..089ed67807 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -983,19 +983,22 @@
>  ##
>  # @COLOExitReason:
>  #
> -# The reason for a COLO exit
> +# Describe the reason for COLO exit.
>  #
> -# @none: no failover has ever happened. This can't occur in the
> -# COLO_EXIT event, only in the result of query-colo-status.
> +# @none: failover has never happened. This state does not occur
> +# in the COLO_EXIT event, and is only visible in the result of
> +# query-colo-status.
>  #
> -# @request: COLO exit is due to an external request
> +# @request: COLO exit caused by an external request.
>  #
> -# @error: COLO exit is due to an internal error
> +# @error: COLO exit caused by an internal error.
> +#
> +# @processing: COLO is currently handling a failover (since 4.0).
>  #
>  # Since: 3.1
>  ##
>  { 'enum': 'COLOExitReason',
> -  'data': [ 'none', 'request', 'error' ] }
> +  'data': [ 'none', 'request', 'error' , 'processing' ] }
>  
>  ##
>  # @x-colo-lost-heartbeat:
> -- 
> 2.17.GIT
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Markus Armbruster March 9, 2019, 5:06 p.m. UTC | #2
Zhang Chen <chen.zhang@intel.com > writes:

> From: Zhang Chen <chen.zhang@intel.com>
>
> In this patch we add the processing state for COLOExitReason,
> because we have to identify COLO in the failover processing state or
> failover error state. In the way, we can handle all the failover state.
> We have improved the description of the COLOExitReason by the way.
>
> Signed-off-by: Zhang Chen <chen.zhang@intel.com>
> ---
>  migration/colo.c    | 24 +++++++++++++-----------
>  qapi/migration.json | 15 +++++++++------
>  2 files changed, 22 insertions(+), 17 deletions(-)
>
> diff --git a/migration/colo.c b/migration/colo.c
> index 89325952c7..dbe2b88807 100644
> --- a/migration/colo.c
> +++ b/migration/colo.c
> @@ -267,7 +267,11 @@ COLOStatus *qmp_query_colo_status(Error **errp)
       switch (failover_get_state()) {

failover_get_state() returns FailoverStatus, i.e. one of
FAILOVER_STATUS_NONE, _REQUIRE, _ACTIVE, _COMPLETED, _RELAUNCH.

       case FAILOVER_STATUS_NONE:
           s->reason = COLO_EXIT_REASON_NONE;
           break;
       case FAILOVER_STATUS_REQUIRE:
>          s->reason = COLO_EXIT_REASON_REQUEST;
>          break;
>      default:
> -        s->reason = COLO_EXIT_REASON_ERROR;
> +        if (migration_in_colo_state()) {
> +            s->reason = COLO_EXIT_REASON_PROCESSING;
> +        } else {
> +            s->reason = COLO_EXIT_REASON_ERROR;
> +        }
>      }
>  
>      return s;

In which FailoverStatus can migration_in_colo_state() return true?

> @@ -579,16 +583,13 @@ out:
       /*
        * There are only two reasons we can get here, some error happened
>       * or the user triggered failover.
>       */
>      switch (failover_get_state()) {
> -    case FAILOVER_STATUS_NONE:
> -        qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
> -                                  COLO_EXIT_REASON_ERROR);
> -        break;
>      case FAILOVER_STATUS_COMPLETED:
>          qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
>                                    COLO_EXIT_REASON_REQUEST);
>          break;
>      default:
> -        abort();
> +        qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
> +                                  COLO_EXIT_REASON_ERROR);
>      }
>  
>      /* Hope this not to be too long to wait here */

Change of behavior for FAILOVER_STATUS_REQUIRE, _ACTIVE, _COMPLETED,
_RELAUNCH: send a COLO_EXIT event instead of calling abort().  Why?

> @@ -850,17 +851,18 @@ out:
>          error_report_err(local_err);
>      }
>  
> +    /*
> +     * There are only two reasons we can get here, some error happened
> +     * or the user triggered failover.
> +     */
>      switch (failover_get_state()) {
> -    case FAILOVER_STATUS_NONE:
> -        qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
> -                                  COLO_EXIT_REASON_ERROR);
> -        break;
>      case FAILOVER_STATUS_COMPLETED:
>          qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
>                                    COLO_EXIT_REASON_REQUEST);
>          break;
>      default:
> -        abort();
> +        qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
> +                                  COLO_EXIT_REASON_ERROR);
>      }
>  
>      if (fb) {

Same question.

> diff --git a/qapi/migration.json b/qapi/migration.json
> index 7a795ecc16..089ed67807 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -983,19 +983,22 @@
>  ##
>  # @COLOExitReason:
>  #
> -# The reason for a COLO exit
> +# Describe the reason for COLO exit.

The old text was better.

>  #
> -# @none: no failover has ever happened. This can't occur in the
> -# COLO_EXIT event, only in the result of query-colo-status.
> +# @none: failover has never happened. This state does not occur
> +# in the COLO_EXIT event, and is only visible in the result of
> +# query-colo-status.

This might be a small improvment.

>  #
> -# @request: COLO exit is due to an external request
> +# @request: COLO exit caused by an external request.
>  #
> -# @error: COLO exit is due to an internal error
> +# @error: COLO exit caused by an internal error.

I like the old text better.

> +#
> +# @processing: COLO is currently handling a failover (since 4.0).
>  #
>  # Since: 3.1
>  ##
>  { 'enum': 'COLOExitReason',
> -  'data': [ 'none', 'request', 'error' ] }
> +  'data': [ 'none', 'request', 'error' , 'processing' ] }
>  
>  ##
>  # @x-colo-lost-heartbeat:

The patch conflates addition of a new member with doc improvements.
Tolerable since both are small.
Zhang, Chen March 11, 2019, 9:08 a.m. UTC | #3
-----Original Message-----
From: Markus Armbruster [mailto:armbru@redhat.com] 
Sent: Sunday, March 10, 2019 1:06 AM
To: Zhang, Chen <chen.zhang@intel.com>
Cc: Li Zhijian <lizhijian@cn.fujitsu.com>; Zhang Chen <zhangckid@gmail.com>; Dr. David Alan Gilbert <dgilbert@redhat.com>; Juan Quintela <quintela@redhat.com>; zhanghailiang <zhang.zhanghailiang@huawei.com>; Eric Blake <eblake@redhat.com>; qemu-dev <qemu-devel@nongnu.org>; Zhang, Chen <chen.zhang@intel.com>
Subject: Re: [Qemu-devel] [PATCH V3 4/7] Migration/colo.c: Add new COLOExitReason to handle all failover state

Zhang Chen <chen.zhang@intel.com > writes:

> From: Zhang Chen <chen.zhang@intel.com>
>
> In this patch we add the processing state for COLOExitReason, because 
> we have to identify COLO in the failover processing state or failover 
> error state. In the way, we can handle all the failover state.
> We have improved the description of the COLOExitReason by the way.
>
> Signed-off-by: Zhang Chen <chen.zhang@intel.com>
> ---
>  migration/colo.c    | 24 +++++++++++++-----------
>  qapi/migration.json | 15 +++++++++------
>  2 files changed, 22 insertions(+), 17 deletions(-)
>
> diff --git a/migration/colo.c b/migration/colo.c index 
> 89325952c7..dbe2b88807 100644
> --- a/migration/colo.c
> +++ b/migration/colo.c
> @@ -267,7 +267,11 @@ COLOStatus *qmp_query_colo_status(Error **errp)
       switch (failover_get_state()) {

failover_get_state() returns FailoverStatus, i.e. one of FAILOVER_STATUS_NONE, _REQUIRE, _ACTIVE, _COMPLETED, _RELAUNCH.

Yes, have any concern here?

       case FAILOVER_STATUS_NONE:
           s->reason = COLO_EXIT_REASON_NONE;
           break;
       case FAILOVER_STATUS_REQUIRE:
>          s->reason = COLO_EXIT_REASON_REQUEST;
>          break;
>      default:
> -        s->reason = COLO_EXIT_REASON_ERROR;
> +        if (migration_in_colo_state()) {
> +            s->reason = COLO_EXIT_REASON_PROCESSING;
> +        } else {
> +            s->reason = COLO_EXIT_REASON_ERROR;
> +        }
>      }
>  
>      return s;

In which FailoverStatus can migration_in_colo_state() return true?

It is no close connection about the FailoverStatus,  except the "FAILOVER_STATUS_REQUIRE", 
other status still have the possibility be triggered here , if migration_in_colo_state() return true means
we still in COLO failover process, otherwise means we have exit COLO with some failover error.

> @@ -579,16 +583,13 @@ out:
       /*
        * There are only two reasons we can get here, some error happened
>       * or the user triggered failover.
>       */
>      switch (failover_get_state()) {
> -    case FAILOVER_STATUS_NONE:
> -        qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
> -                                  COLO_EXIT_REASON_ERROR);
> -        break;
>      case FAILOVER_STATUS_COMPLETED:
>          qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
>                                    COLO_EXIT_REASON_REQUEST);
>          break;
>      default:
> -        abort();
> +        qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
> +                                  COLO_EXIT_REASON_ERROR);
>      }
>  
>      /* Hope this not to be too long to wait here */

Change of behavior for FAILOVER_STATUS_REQUIRE, _ACTIVE, _COMPLETED,
_RELAUNCH: send a COLO_EXIT event instead of calling abort().  Why?

Because if COLO occur failover error, we want to make the VM back to normal running status rather than just crash VM.


> @@ -850,17 +851,18 @@ out:
>          error_report_err(local_err);
>      }
>  
> +    /*
> +     * There are only two reasons we can get here, some error happened
> +     * or the user triggered failover.
> +     */
>      switch (failover_get_state()) {
> -    case FAILOVER_STATUS_NONE:
> -        qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
> -                                  COLO_EXIT_REASON_ERROR);
> -        break;
>      case FAILOVER_STATUS_COMPLETED:
>          qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
>                                    COLO_EXIT_REASON_REQUEST);
>          break;
>      default:
> -        abort();
> +        qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
> +                                  COLO_EXIT_REASON_ERROR);
>      }
>  
>      if (fb) {

Same question.

> diff --git a/qapi/migration.json b/qapi/migration.json index 
> 7a795ecc16..089ed67807 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -983,19 +983,22 @@
>  ##
>  # @COLOExitReason:
>  #
> -# The reason for a COLO exit
> +# Describe the reason for COLO exit.

The old text was better.

OK, I will remove it.

>  #
> -# @none: no failover has ever happened. This can't occur in the -# 
> COLO_EXIT event, only in the result of query-colo-status.
> +# @none: failover has never happened. This state does not occur # in 
> +the COLO_EXIT event, and is only visible in the result of # 
> +query-colo-status.

This might be a small improvment.

>  #
> -# @request: COLO exit is due to an external request
> +# @request: COLO exit caused by an external request.
>  #
> -# @error: COLO exit is due to an internal error
> +# @error: COLO exit caused by an internal error.

I like the old text better.

Sorry, I'm not a native speaker.
I will remove it in next version.

> +#
> +# @processing: COLO is currently handling a failover (since 4.0).
>  #
>  # Since: 3.1
>  ##
>  { 'enum': 'COLOExitReason',
> -  'data': [ 'none', 'request', 'error' ] }
> +  'data': [ 'none', 'request', 'error' , 'processing' ] }
>  
>  ##
>  # @x-colo-lost-heartbeat:

The patch conflates addition of a new member with doc improvements.
Tolerable since both are small.

Yes, Just tiny fix...

Thanks
Zhang Chen
diff mbox series

Patch

diff --git a/migration/colo.c b/migration/colo.c
index 89325952c7..dbe2b88807 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -267,7 +267,11 @@  COLOStatus *qmp_query_colo_status(Error **errp)
         s->reason = COLO_EXIT_REASON_REQUEST;
         break;
     default:
-        s->reason = COLO_EXIT_REASON_ERROR;
+        if (migration_in_colo_state()) {
+            s->reason = COLO_EXIT_REASON_PROCESSING;
+        } else {
+            s->reason = COLO_EXIT_REASON_ERROR;
+        }
     }
 
     return s;
@@ -579,16 +583,13 @@  out:
      * or the user triggered failover.
      */
     switch (failover_get_state()) {
-    case FAILOVER_STATUS_NONE:
-        qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
-                                  COLO_EXIT_REASON_ERROR);
-        break;
     case FAILOVER_STATUS_COMPLETED:
         qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
                                   COLO_EXIT_REASON_REQUEST);
         break;
     default:
-        abort();
+        qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
+                                  COLO_EXIT_REASON_ERROR);
     }
 
     /* Hope this not to be too long to wait here */
@@ -850,17 +851,18 @@  out:
         error_report_err(local_err);
     }
 
+    /*
+     * There are only two reasons we can get here, some error happened
+     * or the user triggered failover.
+     */
     switch (failover_get_state()) {
-    case FAILOVER_STATUS_NONE:
-        qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
-                                  COLO_EXIT_REASON_ERROR);
-        break;
     case FAILOVER_STATUS_COMPLETED:
         qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
                                   COLO_EXIT_REASON_REQUEST);
         break;
     default:
-        abort();
+        qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
+                                  COLO_EXIT_REASON_ERROR);
     }
 
     if (fb) {
diff --git a/qapi/migration.json b/qapi/migration.json
index 7a795ecc16..089ed67807 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -983,19 +983,22 @@ 
 ##
 # @COLOExitReason:
 #
-# The reason for a COLO exit
+# Describe the reason for COLO exit.
 #
-# @none: no failover has ever happened. This can't occur in the
-# COLO_EXIT event, only in the result of query-colo-status.
+# @none: failover has never happened. This state does not occur
+# in the COLO_EXIT event, and is only visible in the result of
+# query-colo-status.
 #
-# @request: COLO exit is due to an external request
+# @request: COLO exit caused by an external request.
 #
-# @error: COLO exit is due to an internal error
+# @error: COLO exit caused by an internal error.
+#
+# @processing: COLO is currently handling a failover (since 4.0).
 #
 # Since: 3.1
 ##
 { 'enum': 'COLOExitReason',
-  'data': [ 'none', 'request', 'error' ] }
+  'data': [ 'none', 'request', 'error' , 'processing' ] }
 
 ##
 # @x-colo-lost-heartbeat: