@@ -38,4 +38,8 @@ int get_colo_mode(void);
int create_and_init_ram_cache(void);
void colo_flush_ram_cache(void);
void release_ram_cache(void);
+
+/* failover */
+void colo_do_failover(MigrationState *s);
+
#endif
@@ -26,5 +26,6 @@ void failover_init_state(void);
int failover_set_state(int old_state, int new_state);
int failover_get_state(void);
void failover_request_active(Error **errp);
+bool failover_request_is_active(void);
#endif
@@ -22,9 +22,17 @@ static COLOFailoverStatus failover_state;
+/*
+ * Bottom half that carries out a requested failover.
+ *
+ * Deletes and clears failover_bh, moves the failover state from
+ * FAILOVER_STATUS_REQUEST to FAILOVER_STATUS_HANDLING and then calls
+ * colo_do_failover(). If the previous state was not REQUEST, an error
+ * is reported and no failover work is done. @opaque is not used.
+ */
static void colo_failover_bh(void *opaque)
{
+    int old_state;
+
qemu_bh_delete(failover_bh);
failover_bh = NULL;
- /*TODO: Do failover work */
+    /* failover_set_state() hands back the state it found */
+    old_state = failover_set_state(FAILOVER_STATUS_REQUEST,
+                                   FAILOVER_STATUS_HANDLING);
+    if (old_state != FAILOVER_STATUS_REQUEST) {
+        error_report("Unknown error for failover, old_state=%d", old_state);
+        return;
+    }
+    colo_do_failover(NULL);
}
void failover_request_active(Error **errp)
@@ -59,6 +67,11 @@ int failover_get_state(void)
return atomic_read(&failover_state);
}
+/*
+ * Tell whether a failover is pending or under way.
+ *
+ * Returns true when the current failover state is anything other than
+ * FAILOVER_STATUS_NONE, false otherwise.
+ */
+bool failover_request_is_active(void)
+{
+    int state = failover_get_state();
+
+    return state != FAILOVER_STATUS_NONE;
+}
+
void qmp_colo_lost_heartbeat(Error **errp)
{
if (get_colo_mode() == COLO_MODE_UNKNOWN) {
@@ -88,6 +88,94 @@ bool migration_incoming_in_colo_state(void)
return (mis && (mis->state == MIGRATION_STATUS_COLO));
}
+/*
+ * Tell whether the VM counts as stopped for COLO purposes: either the
+ * run state is RUN_STATE_COLO or the VM is not running.
+ */
+static bool colo_runstate_is_stopped(void)
+{
+    if (runstate_check(RUN_STATE_COLO)) {
+        return true;
+    }
+    return !runstate_is_running();
+}
+
+/*
+ * Let the Secondary VM take over.
+ *
+ * Marks the incoming migration as completed, forces autostart on,
+ * shuts down the incoming stream, releases the COLO buffer, moves the
+ * failover state from HANDLING to COMPLETED and finally enters the
+ * incoming-migration coroutine if there is one.
+ *
+ * There are two ways to enter this function:
+ * 1. From the COLO checkpoint incoming thread; in this case it
+ *    should be protected by the iothread lock.
+ * 2. From a user command; HMP/QMP commands are handled in the main
+ *    loop, so taking the iothread lock there would cause a deadlock.
+ */
+static void secondary_vm_do_failover(void)
+{
+    int old_state;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    migrate_set_state(&mis->state, MIGRATION_STATUS_COLO,
+                      MIGRATION_STATUS_COMPLETED);
+    if (!autostart) {
+        error_report("\"-S\" qemu option will be ignored in secondary side");
+        /* recover runstate to normal migration finish state */
+        autostart = true;
+    }
+    if (mis->file) { /* Make sure colo incoming thread not block in recv */
+        qemu_file_shutdown(mis->file);
+    }
+    if (mis->colo_buffer) {
+        qsb_free(mis->colo_buffer);
+        /* Do not leave a dangling pointer; mirrors the Primary side */
+        mis->colo_buffer = NULL;
+    }
+    /* failover_set_state() hands back the state it found */
+    old_state = failover_set_state(FAILOVER_STATUS_HANDLING,
+                                   FAILOVER_STATUS_COMPLETED);
+    if (old_state != FAILOVER_STATUS_HANDLING) {
+        error_report("Serious error while do failover for secondary VM, "
+                     "old_state: %d", old_state);
+        return;
+    }
+    /* For Secondary VM, jump to incoming co */
+    if (mis->migration_incoming_co) {
+        qemu_coroutine_enter(mis->migration_incoming_co, NULL);
+    }
+}
+
+/*
+ * Let the Primary VM take over.
+ *
+ * Marks the outgoing migration as completed (unless it has already
+ * failed), shuts down the migration stream, releases the COLO buffer,
+ * schedules the migration cleanup bottom half, restarts the VM and
+ * moves the failover state from HANDLING to COMPLETED.
+ */
+static void primary_vm_do_failover(void)
+{
+    MigrationState *s = migrate_get_current();
+    int old_state;
+
+    if (s->state != MIGRATION_STATUS_FAILED) {
+        migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
+                          MIGRATION_STATUS_COMPLETED);
+    }
+
+    if (s->file) { /* Make sure colo thread no block in recv */
+        qemu_file_shutdown(s->file);
+    }
+    if (s->colo_state.buffer) {
+        qsb_free(s->colo_state.buffer);
+        s->colo_state.buffer = NULL;
+    }
+    qemu_bh_schedule(s->cleanup_bh);
+
+    vm_start();
+
+    /*
+     * failover_set_state() hands back the state it found, so a
+     * successful transition is one whose old state was HANDLING.
+     */
+    old_state = failover_set_state(FAILOVER_STATUS_HANDLING,
+                                   FAILOVER_STATUS_COMPLETED);
+    if (old_state != FAILOVER_STATUS_HANDLING) {
+        error_report("Serious error while do failover for Primary VM, "
+                     "old_state: %d", old_state);
+        return;
+    }
+}
+
+/*
+ * Entry point for failover on either side of a COLO pair.
+ *
+ * Forces the VM into RUN_STATE_COLO unless it already counts as
+ * stopped, then dispatches on get_colo_mode(): the Secondary handler
+ * for COLO_MODE_SECONDARY, the Primary handler for every other mode.
+ * The @s argument is not used here.
+ */
+void colo_do_failover(MigrationState *s)
+{
+    bool on_secondary;
+
+    /* Failover must run with the guest stopped */
+    if (!colo_runstate_is_stopped()) {
+        vm_stop_force_state(RUN_STATE_COLO);
+    }
+
+    on_secondary = (get_colo_mode() == COLO_MODE_SECONDARY);
+    if (!on_secondary) {
+        primary_vm_do_failover();
+        return;
+    }
+    secondary_vm_do_failover();
+}
+
/* colo checkpoint control helper */
static int colo_ctl_put(QEMUFile *f, uint64_t request)
{
@@ -162,11 +250,23 @@ static int colo_do_checkpoint_transaction(MigrationState *s, QEMUFile *control)
goto out;
}
+ if (failover_request_is_active()) {
+ ret = -1;
+ goto out;
+ }
/* suspend and save vm state to colo buffer */
qemu_mutex_lock_iothread();
vm_stop_force_state(RUN_STATE_COLO);
qemu_mutex_unlock_iothread();
trace_colo_vm_state_change("run", "stop");
+ /*
+ * failover request bh could be called after
+ * vm_stop_force_state so we check failover_request_is_active() again.
+ */
+ if (failover_request_is_active()) {
+ ret = -1;
+ goto out;
+ }
/* Disable block migration */
s->params.blk = 0;
@@ -225,7 +325,7 @@ static void *colo_thread(void *opaque)
{
MigrationState *s = opaque;
QEMUFile *colo_control = NULL;
- int ret;
+ int i, ret;
failover_init_state();
@@ -256,6 +356,11 @@ static void *colo_thread(void *opaque)
trace_colo_vm_state_change("stop", "run");
while (s->state == MIGRATION_STATUS_COLO) {
+ if (failover_request_is_active()) {
+ error_report("failover request");
+ goto out;
+ }
+
/* start a colo checkpoint */
if (colo_do_checkpoint_transaction(s, colo_control)) {
goto out;
@@ -263,18 +368,24 @@ static void *colo_thread(void *opaque)
}
out:
- migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
- MIGRATION_STATUS_COMPLETED);
-
- qsb_free(s->colo_state.buffer);
- s->colo_state.buffer = NULL;
-
+ error_report("colo: some error happens in colo_thread");
if (colo_control) {
qemu_fclose(colo_control);
}
+ /* Give users time (2s) to get involved in this verdict */
+ for (i = 0; i < 10; i++) {
+ if (failover_request_is_active()) {
+ error_report("Primary VM will take over work");
+ break;
+ }
+ usleep(200 * 1000);
+ }
qemu_mutex_lock_iothread();
- qemu_bh_schedule(s->cleanup_bh);
+ if (!failover_request_is_active()) {
+ error_report("Primary VM will take over work in default");
+ failover_request_active(NULL);
+ }
qemu_mutex_unlock_iothread();
return NULL;
@@ -337,7 +448,7 @@ void *colo_process_incoming_checkpoints(void *opaque)
int fd = qemu_get_fd(f);
QEMUFile *ctl = NULL, *fb = NULL;
uint64_t total_size;
- int ret;
+ int i, ret;
migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_COLO);
@@ -383,6 +494,11 @@ void *colo_process_incoming_checkpoints(void *opaque)
}
}
+ if (failover_request_is_active()) {
+ error_report("failover request");
+ goto out;
+ }
+
/* suspend guest */
qemu_mutex_lock_iothread();
vm_stop_force_state(RUN_STATE_COLO);
@@ -449,6 +565,7 @@ void *colo_process_incoming_checkpoints(void *opaque)
}
out:
+ error_report("Detect some error or get a failover request");
if (fb) {
qemu_fclose(fb);
}
@@ -458,9 +575,24 @@ out:
qemu_fclose(ctl);
}
- qsb_free(mis->colo_buffer);
+ /* Give users time (2s) to get involved in this verdict */
+ for (i = 0; i < 10; i++) {
+ if (failover_request_is_active()) {
+ error_report("Secondary VM will take over work");
+ break;
+ }
+ usleep(200*1000);
+ }
+ /* check flag again*/
+ if (!failover_request_is_active()) {
+ /*
+ * We assume that Primary VM is still alive according to heartbeat,
+ * just kill Secondary VM
+ */
+ error_report("SVM is going to exit in default!");
+ exit(1);
+ }
migration_incoming_exit_colo();
-
return NULL;
}
@@ -1477,6 +1477,7 @@ colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'"
colo_ctl_put(const char *msg) "Send '%s'"
colo_ctl_get(const char *msg) "Receive '%s'"
colo_failover_set_state(int new_state) "new state %d"
+colo_rcv_pkt(int result) "Result of net packets comparing is different: %d"
# kvm-all.c
kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"