diff mbox series

[PULL,10/40] migration: Delay start of migration main routines

Message ID 20180515234017.2277-11-quintela@redhat.com
State New
Headers show
Series [PULL,01/40] migration: fix saving normal page even if it's been compressed | expand

Commit Message

Juan Quintela May 15, 2018, 11:39 p.m. UTC
We need to make sure that we have started all the multifd threads.

Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
---
 migration/migration.c | 4 ++--
 migration/migration.h | 1 +
 migration/ram.c       | 3 +++
 migration/socket.c    | 4 ++++
 4 files changed, 10 insertions(+), 2 deletions(-)

Comments

Kevin Wolf May 18, 2018, 8:59 a.m. UTC | #1
Am 16.05.2018 um 01:39 hat Juan Quintela geschrieben:
> We need to make sure that we have started all the multifd threads.
> 
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>

This commit makes qemu-iotests 091 hang for me. Either it breaks
backward compatibility intentionally and we need to update the test
case, or there is a bug somewhere.

Can you have a look, please?

Kevin

> ---
>  migration/migration.c | 4 ++--
>  migration/migration.h | 1 +
>  migration/ram.c       | 3 +++
>  migration/socket.c    | 4 ++++
>  4 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 8e5b421b97..61c4ee7850 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -430,7 +430,7 @@ static void migration_incoming_setup(QEMUFile *f)
>      qemu_file_set_blocking(f, false);
>  }
>  
> -static void migration_incoming_process(void)
> +void migration_incoming_process(void)
>  {
>      Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
>      qemu_coroutine_enter(co);
> @@ -448,7 +448,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc)
>  
>      if (!mis->from_src_file) {
>          QEMUFile *f = qemu_fopen_channel_input(ioc);
> -        migration_fd_process_incoming(f);
> +        migration_incoming_setup(f);
>          return;
>      }
>      multifd_recv_new_channel(ioc);
> diff --git a/migration/migration.h b/migration/migration.h
> index 7c69598c54..26e5951d56 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -200,6 +200,7 @@ void migrate_set_state(int *state, int old_state, int new_state);
>  
>  void migration_fd_process_incoming(QEMUFile *f);
>  void migration_ioc_process_incoming(QIOChannel *ioc);
> +void migration_incoming_process(void);
>  
>  bool  migration_has_all_channels(void);
>  
> diff --git a/migration/ram.c b/migration/ram.c
> index 87434d3fce..f7e8615e15 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -717,6 +717,9 @@ void multifd_recv_new_channel(QIOChannel *ioc)
>      qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
>                         QEMU_THREAD_JOINABLE);
>      atomic_inc(&multifd_recv_state->count);
> +    if (multifd_recv_state->count == migrate_multifd_channels()) {
> +        migration_incoming_process();
> +    }
>  }
>  
>  /**
> diff --git a/migration/socket.c b/migration/socket.c
> index e09fd1aae5..7a5eb562b8 100644
> --- a/migration/socket.c
> +++ b/migration/socket.c
> @@ -170,6 +170,10 @@ static void socket_accept_incoming_migration(QIONetListener *listener,
>          qio_net_listener_disconnect(listener);
>  
>          object_unref(OBJECT(listener));
> +
> +        if (!migrate_use_multifd()) {
> +            migration_incoming_process();
> +        }
>      }
>  }
>  
> -- 
> 2.17.0
> 
>
Dr. David Alan Gilbert May 18, 2018, 10:34 a.m. UTC | #2
* Kevin Wolf (kwolf@redhat.com) wrote:
> Am 16.05.2018 um 01:39 hat Juan Quintela geschrieben:
> > We need to make sure that we have started all the multifd threads.
> > 
> > Signed-off-by: Juan Quintela <quintela@redhat.com>
> > Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
> 
> This commit makes qemu-iotests 091 hang for me. Either it breaks
> backward compatibility intentionally and we need to update the test
> case, or there is a bug somewhere.

It's not an intentional break.
And the avocado tcp and exec migrations pass OK, so hmm.

Dave

> Can you have a look, please?
> 
> Kevin
> 
> > ---
> >  migration/migration.c | 4 ++--
> >  migration/migration.h | 1 +
> >  migration/ram.c       | 3 +++
> >  migration/socket.c    | 4 ++++
> >  4 files changed, 10 insertions(+), 2 deletions(-)
> > 
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 8e5b421b97..61c4ee7850 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -430,7 +430,7 @@ static void migration_incoming_setup(QEMUFile *f)
> >      qemu_file_set_blocking(f, false);
> >  }
> >  
> > -static void migration_incoming_process(void)
> > +void migration_incoming_process(void)
> >  {
> >      Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
> >      qemu_coroutine_enter(co);
> > @@ -448,7 +448,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc)
> >  
> >      if (!mis->from_src_file) {
> >          QEMUFile *f = qemu_fopen_channel_input(ioc);
> > -        migration_fd_process_incoming(f);
> > +        migration_incoming_setup(f);
> >          return;
> >      }
> >      multifd_recv_new_channel(ioc);
> > diff --git a/migration/migration.h b/migration/migration.h
> > index 7c69598c54..26e5951d56 100644
> > --- a/migration/migration.h
> > +++ b/migration/migration.h
> > @@ -200,6 +200,7 @@ void migrate_set_state(int *state, int old_state, int new_state);
> >  
> >  void migration_fd_process_incoming(QEMUFile *f);
> >  void migration_ioc_process_incoming(QIOChannel *ioc);
> > +void migration_incoming_process(void);
> >  
> >  bool  migration_has_all_channels(void);
> >  
> > diff --git a/migration/ram.c b/migration/ram.c
> > index 87434d3fce..f7e8615e15 100644
> > --- a/migration/ram.c
> > +++ b/migration/ram.c
> > @@ -717,6 +717,9 @@ void multifd_recv_new_channel(QIOChannel *ioc)
> >      qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
> >                         QEMU_THREAD_JOINABLE);
> >      atomic_inc(&multifd_recv_state->count);
> > +    if (multifd_recv_state->count == migrate_multifd_channels()) {
> > +        migration_incoming_process();
> > +    }
> >  }
> >  
> >  /**
> > diff --git a/migration/socket.c b/migration/socket.c
> > index e09fd1aae5..7a5eb562b8 100644
> > --- a/migration/socket.c
> > +++ b/migration/socket.c
> > @@ -170,6 +170,10 @@ static void socket_accept_incoming_migration(QIONetListener *listener,
> >          qio_net_listener_disconnect(listener);
> >  
> >          object_unref(OBJECT(listener));
> > +
> > +        if (!migrate_use_multifd()) {
> > +            migration_incoming_process();
> > +        }
> >      }
> >  }
> >  
> > -- 
> > 2.17.0
> > 
> > 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Kevin Wolf May 18, 2018, 12:14 p.m. UTC | #3
Am 18.05.2018 um 12:34 hat Dr. David Alan Gilbert geschrieben:
> * Kevin Wolf (kwolf@redhat.com) wrote:
> > Am 16.05.2018 um 01:39 hat Juan Quintela geschrieben:
> > > We need to make sure that we have started all the multifd threads.
> > > 
> > > Signed-off-by: Juan Quintela <quintela@redhat.com>
> > > Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
> > 
> > This commit makes qemu-iotests 091 hang for me. Either it breaks
> > backward compatibility intentionally and we need to update the test
> > case, or there is a bug somewhere.
> 
> It's not an intentional break.
> And the avocado tcp and exec migrations pass OK, so hmm.

In case it helps, 169 fails as well and I got a core dump of an aborting
QEMU process:

(gdb) bt
#0  0x00007ff079f779fb in raise () at /lib64/libc.so.6
#1  0x00007ff079f79800 in abort () at /lib64/libc.so.6
#2  0x00007ff079f700da in __assert_fail_base () at /lib64/libc.so.6
#3  0x00007ff079f70152 in  () at /lib64/libc.so.6
#4  0x000055c2126f067b in bdrv_close_all () at block.c:3375
#5  0x000055c2123c54a6 in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4682

If I understand correctly, that assertion failure means that someone is
still holding a reference to a block device after all user-owned
references have been closed. I suppose this was the source qemu and
the migration hasn't been completed properly, though I haven't looked at
the code yet and this idea might be completely wrong.

Anyway, 091 is certainly the simpler test case to play with, but maybe
this gives you another hint.

Kevin
Kevin Wolf May 22, 2018, 4:20 p.m. UTC | #4
Am 18.05.2018 um 14:14 hat Kevin Wolf geschrieben:
> Am 18.05.2018 um 12:34 hat Dr. David Alan Gilbert geschrieben:
> > * Kevin Wolf (kwolf@redhat.com) wrote:
> > > Am 16.05.2018 um 01:39 hat Juan Quintela geschrieben:
> > > > We need to make sure that we have started all the multifd threads.
> > > > 
> > > > Signed-off-by: Juan Quintela <quintela@redhat.com>
> > > > Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
> > > 
> > > This commit makes qemu-iotests 091 hang for me. Either it breaks
> > > backward compatibility intentionally and we need to update the test
> > > case, or there is a bug somewhere.
> > 
> > It's not an intentional break.
> > And the avocado tcp and exec migrations pass OK, so hmm.
> 
> In case it helps, 169 fails as well and I got a core dump of an aborting
> QEMU process:
> 
> (gdb) bt
> #0  0x00007ff079f779fb in raise () at /lib64/libc.so.6
> #1  0x00007ff079f79800 in abort () at /lib64/libc.so.6
> #2  0x00007ff079f700da in __assert_fail_base () at /lib64/libc.so.6
> #3  0x00007ff079f70152 in  () at /lib64/libc.so.6
> #4  0x000055c2126f067b in bdrv_close_all () at block.c:3375
> #5  0x000055c2123c54a6 in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4682
> 
> If I understand correctly, that assertion failure means that someone is
> still holding a reference to a block device after all user-owned
> references have been closed. I suppose this was the source qemu and
> the migration hasn't been completed properly, though I haven't looked at
> the code yet and this idea might be completely wrong.
> 
> Anyway, 091 is certainly the simpler test case to play with, but maybe
> this gives you another hint.

Any news on this? This is starting to become really annoying as a
hanging test suite impacts my ability to properly test block layer
patches.

If there is no hope of quickly getting a proper fix for this, we may
have to revert something for now to fight the symptoms at least.

Kevin
Juan Quintela May 23, 2018, 6:29 a.m. UTC | #5
Kevin Wolf <kwolf@redhat.com> wrote:
> Am 18.05.2018 um 14:14 hat Kevin Wolf geschrieben:
>> Am 18.05.2018 um 12:34 hat Dr. David Alan Gilbert geschrieben:
>> > * Kevin Wolf (kwolf@redhat.com) wrote:
>> > > Am 16.05.2018 um 01:39 hat Juan Quintela geschrieben:
>> > > > We need to make sure that we have started all the multifd threads.
>> > > > 
>> > > > Signed-off-by: Juan Quintela <quintela@redhat.com>
>> > > > Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
>> > > 
>> > > This commit makes qemu-iotests 091 hang for me. Either it breaks
>> > > backward compatibility intentionally and we need to update the test
>> > > case, or there is a bug somewhere.
>> > 
>> > It's not an intentional break.
>> > And the avocado tcp and exec migrations pass OK, so hmm.
>> 
>> In case it helps, 169 fails as well and I got a core dump of an aborting
>> QEMU process:
>> 
>> (gdb) bt
>> #0  0x00007ff079f779fb in raise () at /lib64/libc.so.6
>> #1  0x00007ff079f79800 in abort () at /lib64/libc.so.6
>> #2  0x00007ff079f700da in __assert_fail_base () at /lib64/libc.so.6
>> #3  0x00007ff079f70152 in  () at /lib64/libc.so.6
>> #4  0x000055c2126f067b in bdrv_close_all () at block.c:3375
>> #5 0x000055c2123c54a6 in main (argc=<optimized out>, argv=<optimized
>> out>, envp=<optimized out>) at vl.c:4682
>> 
>> If I understand correctly, that assertion failure means that someone is
>> still holding a reference to a block device after all user-owned
>> references have been closed. I suppose this was the source qemu and
>> the migration hasn't been completed properly, though I haven't looked at
>> the code yet and this idea might be completely wrong.
>> 
>> Anyway, 091 is certainly the simpler test case to play with, but maybe
>> this gives you another hint.
>
> Any news on this? This is starting to become really annoying as a
> hanging test suite impacts my ability to properly test block layer
> patches.
>
> If there is no hope of quickly getting a proper fix for this, we may
> have to revert something for now to fight the symptoms at least.

Hi

I am looking into this.  I have been on vacation the end of last week,
and to make things nicer, body decided that taking a stomach flu before
returning was a great idea.

Coping with email/tests.

Later, Juan.
diff mbox series

Patch

diff --git a/migration/migration.c b/migration/migration.c
index 8e5b421b97..61c4ee7850 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -430,7 +430,7 @@  static void migration_incoming_setup(QEMUFile *f)
     qemu_file_set_blocking(f, false);
 }
 
-static void migration_incoming_process(void)
+void migration_incoming_process(void)
 {
     Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
     qemu_coroutine_enter(co);
@@ -448,7 +448,7 @@  void migration_ioc_process_incoming(QIOChannel *ioc)
 
     if (!mis->from_src_file) {
         QEMUFile *f = qemu_fopen_channel_input(ioc);
-        migration_fd_process_incoming(f);
+        migration_incoming_setup(f);
         return;
     }
     multifd_recv_new_channel(ioc);
diff --git a/migration/migration.h b/migration/migration.h
index 7c69598c54..26e5951d56 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -200,6 +200,7 @@  void migrate_set_state(int *state, int old_state, int new_state);
 
 void migration_fd_process_incoming(QEMUFile *f);
 void migration_ioc_process_incoming(QIOChannel *ioc);
+void migration_incoming_process(void);
 
 bool  migration_has_all_channels(void);
 
diff --git a/migration/ram.c b/migration/ram.c
index 87434d3fce..f7e8615e15 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -717,6 +717,9 @@  void multifd_recv_new_channel(QIOChannel *ioc)
     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                        QEMU_THREAD_JOINABLE);
     atomic_inc(&multifd_recv_state->count);
+    if (multifd_recv_state->count == migrate_multifd_channels()) {
+        migration_incoming_process();
+    }
 }
 
 /**
diff --git a/migration/socket.c b/migration/socket.c
index e09fd1aae5..7a5eb562b8 100644
--- a/migration/socket.c
+++ b/migration/socket.c
@@ -170,6 +170,10 @@  static void socket_accept_incoming_migration(QIONetListener *listener,
         qio_net_listener_disconnect(listener);
 
         object_unref(OBJECT(listener));
+
+        if (!migrate_use_multifd()) {
+            migration_incoming_process();
+        }
     }
 }