Message ID | 20180515234017.2277-11-quintela@redhat.com |
---|---|
State | New |
Headers | show |
Series | [PULL,01/40] migration: fix saving normal page even if it's been compressed | expand |
Am 16.05.2018 um 01:39 hat Juan Quintela geschrieben: > We need to make sure that we have started all the multifd threads. > > Signed-off-by: Juan Quintela <quintela@redhat.com> > Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> This commit makes qemu-iotests 091 hang for me. Either it breaks backward compatibility intentionally and we need to update the test case, or there is a bug somewhere. Can you have a look, please? Kevin > --- > migration/migration.c | 4 ++-- > migration/migration.h | 1 + > migration/ram.c | 3 +++ > migration/socket.c | 4 ++++ > 4 files changed, 10 insertions(+), 2 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index 8e5b421b97..61c4ee7850 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -430,7 +430,7 @@ static void migration_incoming_setup(QEMUFile *f) > qemu_file_set_blocking(f, false); > } > > -static void migration_incoming_process(void) > +void migration_incoming_process(void) > { > Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL); > qemu_coroutine_enter(co); > @@ -448,7 +448,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc) > > if (!mis->from_src_file) { > QEMUFile *f = qemu_fopen_channel_input(ioc); > - migration_fd_process_incoming(f); > + migration_incoming_setup(f); > return; > } > multifd_recv_new_channel(ioc); > diff --git a/migration/migration.h b/migration/migration.h > index 7c69598c54..26e5951d56 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -200,6 +200,7 @@ void migrate_set_state(int *state, int old_state, int new_state); > > void migration_fd_process_incoming(QEMUFile *f); > void migration_ioc_process_incoming(QIOChannel *ioc); > +void migration_incoming_process(void); > > bool migration_has_all_channels(void); > > diff --git a/migration/ram.c b/migration/ram.c > index 87434d3fce..f7e8615e15 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -717,6 +717,9 @@ void multifd_recv_new_channel(QIOChannel *ioc) > qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, > QEMU_THREAD_JOINABLE); > atomic_inc(&multifd_recv_state->count); > + if (multifd_recv_state->count == migrate_multifd_channels()) { > + migration_incoming_process(); > + } > } > > /** > diff --git a/migration/socket.c b/migration/socket.c > index e09fd1aae5..7a5eb562b8 100644 > --- a/migration/socket.c > +++ b/migration/socket.c > @@ -170,6 +170,10 @@ static void socket_accept_incoming_migration(QIONetListener *listener, > qio_net_listener_disconnect(listener); > > object_unref(OBJECT(listener)); > + > + if (!migrate_use_multifd()) { > + migration_incoming_process(); > + } > } > } > > -- > 2.17.0 > >
* Kevin Wolf (kwolf@redhat.com) wrote: > Am 16.05.2018 um 01:39 hat Juan Quintela geschrieben: > > We need to make sure that we have started all the multifd threads. > > > > Signed-off-by: Juan Quintela <quintela@redhat.com> > > Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> > > This commit makes qemu-iotests 091 hang for me. Either it breaks > backward compatibility intentionally and we need to update the test > case, or there is a bug somewhere. It's not an intentional break. And the avocado tcp and exec migrations pass OK, so hmm. Dave > Can you have a look, please? > > Kevin > > > --- > > migration/migration.c | 4 ++-- > > migration/migration.h | 1 + > > migration/ram.c | 3 +++ > > migration/socket.c | 4 ++++ > > 4 files changed, 10 insertions(+), 2 deletions(-) > > > > diff --git a/migration/migration.c b/migration/migration.c > > index 8e5b421b97..61c4ee7850 100644 > > --- a/migration/migration.c > > +++ b/migration/migration.c > > @@ -430,7 +430,7 @@ static void migration_incoming_setup(QEMUFile *f) > > qemu_file_set_blocking(f, false); > > } > > > > -static void migration_incoming_process(void) > > +void migration_incoming_process(void) > > { > > Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL); > > qemu_coroutine_enter(co); > > @@ -448,7 +448,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc) > > > > if (!mis->from_src_file) { > > QEMUFile *f = qemu_fopen_channel_input(ioc); > > - migration_fd_process_incoming(f); > > + migration_incoming_setup(f); > > return; > > } > > multifd_recv_new_channel(ioc); > > diff --git a/migration/migration.h b/migration/migration.h > > index 7c69598c54..26e5951d56 100644 > > --- a/migration/migration.h > > +++ b/migration/migration.h > > @@ -200,6 +200,7 @@ void migrate_set_state(int *state, int old_state, int new_state); > > > > void migration_fd_process_incoming(QEMUFile *f); > > void migration_ioc_process_incoming(QIOChannel *ioc); > > +void migration_incoming_process(void); > > > > bool migration_has_all_channels(void); > > > > diff --git a/migration/ram.c b/migration/ram.c > > index 87434d3fce..f7e8615e15 100644 > > --- a/migration/ram.c > > +++ b/migration/ram.c > > @@ -717,6 +717,9 @@ void multifd_recv_new_channel(QIOChannel *ioc) > > qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, > > QEMU_THREAD_JOINABLE); > > atomic_inc(&multifd_recv_state->count); > > + if (multifd_recv_state->count == migrate_multifd_channels()) { > > + migration_incoming_process(); > > + } > > } > > > > /** > > diff --git a/migration/socket.c b/migration/socket.c > > index e09fd1aae5..7a5eb562b8 100644 > > --- a/migration/socket.c > > +++ b/migration/socket.c > > @@ -170,6 +170,10 @@ static void socket_accept_incoming_migration(QIONetListener *listener, > > qio_net_listener_disconnect(listener); > > > > object_unref(OBJECT(listener)); > > + > > + if (!migrate_use_multifd()) { > > + migration_incoming_process(); > > + } > > } > > } > > > > -- > > 2.17.0 > > > > -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Am 18.05.2018 um 12:34 hat Dr. David Alan Gilbert geschrieben: > * Kevin Wolf (kwolf@redhat.com) wrote: > > Am 16.05.2018 um 01:39 hat Juan Quintela geschrieben: > > > We need to make sure that we have started all the multifd threads. > > > > > > Signed-off-by: Juan Quintela <quintela@redhat.com> > > > Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> > > > > This commit makes qemu-iotests 091 hang for me. Either it breaks > > backward compatibility intentionally and we need to update the test > > case, or there is a bug somewhere. > > It's not an intentional break. > And the avocado tcp and exec migrations pass OK, so hmm. In case it helps, 169 fails as well and I got a core dump of an aborting QEMU process: (gdb) bt #0 0x00007ff079f779fb in raise () at /lib64/libc.so.6 #1 0x00007ff079f79800 in abort () at /lib64/libc.so.6 #2 0x00007ff079f700da in __assert_fail_base () at /lib64/libc.so.6 #3 0x00007ff079f70152 in () at /lib64/libc.so.6 #4 0x000055c2126f067b in bdrv_close_all () at block.c:3375 #5 0x000055c2123c54a6 in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4682 If I understand correctly, that assertion failure means that someone is still holding a reference to a block device after all user-owned references have been closed. I suppose this was the source qemu and the migration hasn't been completed properly, though I haven't looked at the code yet and this idea might be completely wrong. Anyway, 091 is certainly the simpler test case to play with, but maybe this gives you another hint. Kevin
Am 18.05.2018 um 14:14 hat Kevin Wolf geschrieben: > Am 18.05.2018 um 12:34 hat Dr. David Alan Gilbert geschrieben: > > * Kevin Wolf (kwolf@redhat.com) wrote: > > > Am 16.05.2018 um 01:39 hat Juan Quintela geschrieben: > > > > We need to make sure that we have started all the multifd threads. > > > > > > > > Signed-off-by: Juan Quintela <quintela@redhat.com> > > > > Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> > > > > > > This commit makes qemu-iotests 091 hang for me. Either it breaks > > > backward compatibility intentionally and we need to update the test > > > case, or there is a bug somewhere. > > > > It's not an intentional break. > > And the avocado tcp and exec migrations pass OK, so hmm. > > In case it helps, 169 fails as well and I got a core dump of an aborting > QEMU process: > > (gdb) bt > #0 0x00007ff079f779fb in raise () at /lib64/libc.so.6 > #1 0x00007ff079f79800 in abort () at /lib64/libc.so.6 > #2 0x00007ff079f700da in __assert_fail_base () at /lib64/libc.so.6 > #3 0x00007ff079f70152 in () at /lib64/libc.so.6 > #4 0x000055c2126f067b in bdrv_close_all () at block.c:3375 > #5 0x000055c2123c54a6 in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4682 > > If I understand correctly, that assertion failure means that someone is > still holding a reference to a block device after all user-owned > references have been closed. I suppose this was the source qemu and > the migration hasn't been completed properly, though I haven't looked at > the code yet and this idea might be completely wrong. > > Anyway, 091 is certainly the simpler test case to play with, but maybe > this gives you another hint. Any news on this? This is starting to become really annoying as a hanging test suite impacts my ability to properly test block layer patches. If there is no hope of quickly getting a proper fix for this, we may have to revert something for now to fight the symptoms at least. Kevin
Kevin Wolf <kwolf@redhat.com> wrote: > Am 18.05.2018 um 14:14 hat Kevin Wolf geschrieben: >> Am 18.05.2018 um 12:34 hat Dr. David Alan Gilbert geschrieben: >> > * Kevin Wolf (kwolf@redhat.com) wrote: >> > > Am 16.05.2018 um 01:39 hat Juan Quintela geschrieben: >> > > > We need to make sure that we have started all the multifd threads. >> > > > >> > > > Signed-off-by: Juan Quintela <quintela@redhat.com> >> > > > Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> >> > > >> > > This commit makes qemu-iotests 091 hang for me. Either it breaks >> > > backward compatibility intentionally and we need to update the test >> > > case, or there is a bug somewhere. >> > >> > It's not an intentional break. >> > And the avocado tcp and exec migrations pass OK, so hmm. >> >> In case it helps, 169 fails as well and I got a core dump of an aborting >> QEMU process: >> >> (gdb) bt >> #0 0x00007ff079f779fb in raise () at /lib64/libc.so.6 >> #1 0x00007ff079f79800 in abort () at /lib64/libc.so.6 >> #2 0x00007ff079f700da in __assert_fail_base () at /lib64/libc.so.6 >> #3 0x00007ff079f70152 in () at /lib64/libc.so.6 >> #4 0x000055c2126f067b in bdrv_close_all () at block.c:3375 >> #5 0x000055c2123c54a6 in main (argc=<optimized out>, argv=<optimized >> out>, envp=<optimized out>) at vl.c:4682 >> >> If I understand correctly, that assertion failure means that someone is >> still holding a reference to a block device after all user-owned >> references have been closed. I suppose this was the source qemu and >> the migration hasn't been completed properly, though I haven't looked at >> the code yet and this idea might be completely wrong. >> >> Anyway, 091 is certainly the simpler test case to play with, but maybe >> this gives you another hint. > > Any news on this? This is starting to become really annoying as a > hanging test suite impacts my ability to properly test block layer > patches. > > If there is no hope of quickly getting a proper fix for this, we may > have to revert something for now to fight the symptoms at least. Hi I am looking into this. I have been on vacation the end of last week, and to make things nicer, body decided that taking a stomach flu before returning was a great idea. Coping with email/tests. Later, Juan.
diff --git a/migration/migration.c b/migration/migration.c index 8e5b421b97..61c4ee7850 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -430,7 +430,7 @@ static void migration_incoming_setup(QEMUFile *f) qemu_file_set_blocking(f, false); } -static void migration_incoming_process(void) +void migration_incoming_process(void) { Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL); qemu_coroutine_enter(co); @@ -448,7 +448,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc) if (!mis->from_src_file) { QEMUFile *f = qemu_fopen_channel_input(ioc); - migration_fd_process_incoming(f); + migration_incoming_setup(f); return; } multifd_recv_new_channel(ioc); diff --git a/migration/migration.h b/migration/migration.h index 7c69598c54..26e5951d56 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -200,6 +200,7 @@ void migrate_set_state(int *state, int old_state, int new_state); void migration_fd_process_incoming(QEMUFile *f); void migration_ioc_process_incoming(QIOChannel *ioc); +void migration_incoming_process(void); bool migration_has_all_channels(void); diff --git a/migration/ram.c b/migration/ram.c index 87434d3fce..f7e8615e15 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -717,6 +717,9 @@ void multifd_recv_new_channel(QIOChannel *ioc) qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, QEMU_THREAD_JOINABLE); atomic_inc(&multifd_recv_state->count); + if (multifd_recv_state->count == migrate_multifd_channels()) { + migration_incoming_process(); + } } /** diff --git a/migration/socket.c b/migration/socket.c index e09fd1aae5..7a5eb562b8 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -170,6 +170,10 @@ static void socket_accept_incoming_migration(QIONetListener *listener, qio_net_listener_disconnect(listener); object_unref(OBJECT(listener)); + + if (!migrate_use_multifd()) { + migration_incoming_process(); + } } }