
Aborts in iotest 169

Message ID 3da8da19-72dc-8f5e-f2e1-820999994cb2@redhat.com
State New

Commit Message

Max Reitz Jan. 23, 2019, 3:48 p.m. UTC
Hi,

When running 169 in parallel (e.g. like so:

$ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
$ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
$ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
$ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done

in four different shells), I get aborts:

(Often I get segfaults, but that's because of
http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
feel free to apply the attached patch to make them go away)


WARNING:qemu:qemu received signal 6:
build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
-chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
-mon chardev=mon,mode=control -display none -vga none -qtest
unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
-nodefaults -machine accel=qtest -drive
if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
.................E..
======================================================================
ERROR:
test_do_test_migration_resume_source_not_persistent__not_migbitmap
(__main__.TestDirtyBitmapMigration)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "169", line 206, in <lambda>
    setattr(klass, 'test_' + method + name, lambda self: mc(self))
  File "169", line 113, in do_test_migration_resume_source
    self.check_bitmap(self.vm_a, sha256)
  File "169", line 72, in check_bitmap
    node='drive0', name='bitmap0')
  File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
    return self._qmp.cmd(cmd, args=qmp_args)
  File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
    return self.cmd_obj(qmp_cmd)
  File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
    resp = self.__json_read()
  File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
__json_read
    data = self.__sockfile.readline()
  File "/usr/lib64/python2.7/socket.py", line 451, in readline
    data = self._sock.recv(self._rbufsize)
error: [Errno 104] Connection reset by peer

----------------------------------------------------------------------
Ran 20 tests

FAILED (errors=1)


Or:

WARNING:qemu:qemu received signal 6:
build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
-chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
-mon chardev=mon,mode=control -display none -vga none -qtest
unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
-machine accel=qtest -drive
if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
WARNING:qemu:qemu received signal 6:
build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
-chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
-mon chardev=mon,mode=control -display none -vga none -qtest
unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
-machine accel=qtest -drive
if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback

...................F
======================================================================
FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
(__main__.TestDirtyBitmapMigration)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "169", line 206, in <lambda>
    setattr(klass, 'test_' + method + name, lambda self: mc(self))
  File "169", line 125, in do_test_migration_resume_source
    self.assertEqual(log, '')
AssertionError: "qemu-system-x86_64: invalid runstate transition:
'running' -> 'postmigrate'\n" != ''

----------------------------------------------------------------------
Ran 20 tests

FAILED (failures=1)


The backtrace always goes like this:

(gdb) bt
#0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
#1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
#2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
at vl.c:742
#3  0x000055a46ebbb1a6 in runstate_set
(new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
#4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
at migration/migration.c:2972
#5  0x000055a46ed39129 in migration_thread
(opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
#6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
util/qemu-thread-posix.c:502


#7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
#8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
(gdb) frame 2
#2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
at vl.c:742
742             abort();
(gdb) print current_run_state
$1 = RUN_STATE_RUNNING


Neither migration nor runstates are my strong suit, so I thought I'd
report it before diving into it.

Max

Comments

Luiz Capitulino Jan. 23, 2019, 4:04 p.m. UTC | #1
On Wed, 23 Jan 2019 16:48:49 +0100
Max Reitz <mreitz@redhat.com> wrote:

> Hi,
> 
> When running 169 in parallel (e.g. like so:
> 
> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> 
> in four different shells), I get aborts:

OK, is this part of a test suite that's also running migration
tests in parallel or in sequence? In other words, what do the
iotests have to do with migration? (Sorry if this is a stupid
question, but it's been years since I last worked on qemu.)

More below.

[...]

> The backtrace always goes like this:
> 
> (gdb) bt
> #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
> #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> at vl.c:742
> #3  0x000055a46ebbb1a6 in runstate_set
> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
> #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
> at migration/migration.c:2972
> #5  0x000055a46ed39129 in migration_thread
> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
> #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
> util/qemu-thread-posix.c:502
> 
> 
> #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
> #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
> (gdb) frame 2
> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> at vl.c:742
> 742             abort();
> (gdb) print current_run_state
> $1 = RUN_STATE_RUNNING
> 
> 
> Neither of migration or runstates are my strong suite, so I thought I'd
> report it before diving into it.

So, the problem seems to be that qemu is going from the running state
to the postmigrate state. IIRC, this is an invalid state transition.
The valid states to transition to depend on whether this guest is the
source or the target of the migration.

When this happened in the past it meant some QEMU code skipped a
transition, but I can't tell what this has to do with iotests.
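
For reference, the abort in Max's backtrace comes from exactly such a
check: vl.c keeps a whitelist of allowed (from, to) runstate pairs and
abort()s on anything that is not listed. A minimal standalone model of
that mechanism (illustrative only; the three-state subset, names and
table below are made up, not the real QEMU enum or table) looks
roughly like this:

/* Toy model of the runstate transition check in vl.c. */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

typedef enum {
    STATE_RUNNING,
    STATE_FINISH_MIGRATE,
    STATE_POSTMIGRATE,
    STATE_MAX
} RunState;

static const char *const state_name[STATE_MAX] = {
    "running", "finish-migrate", "postmigrate",
};

/* Whitelist of allowed transitions; everything else is fatal. */
static const bool valid_transition[STATE_MAX][STATE_MAX] = {
    [STATE_RUNNING][STATE_FINISH_MIGRATE]     = true,
    [STATE_FINISH_MIGRATE][STATE_POSTMIGRATE] = true,
    [STATE_POSTMIGRATE][STATE_RUNNING]        = true,
};

static RunState current_run_state = STATE_RUNNING;

static void runstate_set(RunState new_state)
{
    if (!valid_transition[current_run_state][new_state]) {
        fprintf(stderr, "invalid runstate transition: '%s' -> '%s'\n",
                state_name[current_run_state], state_name[new_state]);
        abort();            /* the abort() at vl.c:742 in the backtrace */
    }
    current_run_state = new_state;
}

int main(void)
{
    runstate_set(STATE_POSTMIGRATE);  /* 'running' -> 'postmigrate' aborts */
    return 0;
}
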
Max Reitz Jan. 23, 2019, 4:12 p.m. UTC | #2
On 23.01.19 17:04, Luiz Capitulino wrote:
> On Wed, 23 Jan 2019 16:48:49 +0100
> Max Reitz <mreitz@redhat.com> wrote:
> 
>> Hi,
>>
>> When running 169 in parallel (e.g. like so:
>>
>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
>>
>> in four different shells), I get aborts:
> 
> OK, is this part of a test-suite that's also running migration
> tests in parallel or in sequence? In other words, what does
> iotests have to do with migration (sorry if this is stupid
> question, but it's been years I don't do qemu).

They run migration tests in sequence, but if you run multiple test
instances in parallel, well, then they will be run in parallel.

The only reason I CC'd you was because you were so prominent in git
blame. O:-)

> More below.
> 
> [...]
> 
>> The backtrace always goes like this:
>>
>> (gdb) bt
>> #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
>> #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
>> at vl.c:742
>> #3  0x000055a46ebbb1a6 in runstate_set
>> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
>> #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
>> at migration/migration.c:2972
>> #5  0x000055a46ed39129 in migration_thread
>> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
>> #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
>> util/qemu-thread-posix.c:502
>>
>>
>> #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
>> #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
>> (gdb) frame 2
>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
>> at vl.c:742
>> 742             abort();
>> (gdb) print current_run_state
>> $1 = RUN_STATE_RUNNING
>>
>>
>> Neither of migration or runstates are my strong suite, so I thought I'd
>> report it before diving into it.
> 
> So, the problem seems to be that qemu is going from running state to
> postmigrate state. IIRC, this is is an invalid state transition. The
> valid states to transition to depends if this guest is the source or
> target for migration.
> 
> When this happened in the past it meant some QEMU code skipped a
> transition, but I can't tell what this has to do with iotests.

Well, this iotest (which tests a migration configuration) sometimes
apparently results in this invalid transition.  But that can't be just
the test's fault, as qemu should handle that gracefully.

It's probably an issue in the migration code and not so much in vl.c, yes...

Max
Luiz Capitulino Jan. 23, 2019, 4:24 p.m. UTC | #3
On Wed, 23 Jan 2019 17:12:35 +0100
Max Reitz <mreitz@redhat.com> wrote:

> On 23.01.19 17:04, Luiz Capitulino wrote:
> > On Wed, 23 Jan 2019 16:48:49 +0100
> > Max Reitz <mreitz@redhat.com> wrote:
> >   
> >> Hi,
> >>
> >> When running 169 in parallel (e.g. like so:
> >>
> >> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> >> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> >> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> >> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> >>
> >> in four different shells), I get aborts:  
> > 
> > OK, is this part of a test-suite that's also running migration
> > tests in parallel or in sequence? In other words, what does
> > iotests have to do with migration (sorry if this is stupid
> > question, but it's been years I don't do qemu).  
> 
> They run migration tests in sequence, but if you run multiple test
> instances in parallel, well, then they will be run in parallel.
> 
> The only reason I CC'd you was because you were so prominent in git
> blame. O:-)

Yeah, that's often the case with me :-)

> > When this happened in the past it meant some QEMU code skipped a
> > transition, but I can't tell what this has to do with iotests.  
> 
> Well, this iotest (which tests a migration configuration) sometimes
> apparently results in this invalid transition.  But that can't be just
> the test's fault, as qemu should handle that gracefully.

Does the iotest run a guest, or does it only execute parts of qemu
code? If it's the latter, then I'd guess the test code is missing a
call to the qemu code that sets the appropriate state between the
running and postmigrate states.

> It's probably an issue in the migration code and not so much in vl.c, yes...

Yeah, I'll let the migration people jump in.
Dr. David Alan Gilbert Jan. 23, 2019, 4:35 p.m. UTC | #4
* Luiz Capitulino (lcapitulino@redhat.com) wrote:
> On Wed, 23 Jan 2019 17:12:35 +0100
> Max Reitz <mreitz@redhat.com> wrote:
> 
> > On 23.01.19 17:04, Luiz Capitulino wrote:
> > > On Wed, 23 Jan 2019 16:48:49 +0100
> > > Max Reitz <mreitz@redhat.com> wrote:
> > >   
> > >> Hi,
> > >>
> > >> When running 169 in parallel (e.g. like so:
> > >>
> > >> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> > >> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> > >> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> > >> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> > >>
> > >> in four different shells), I get aborts:  
> > > 
> > > OK, is this part of a test-suite that's also running migration
> > > tests in parallel or in sequence? In other words, what does
> > > iotests have to do with migration (sorry if this is stupid
> > > question, but it's been years I don't do qemu).  
> > 
> > They run migration tests in sequence, but if you run multiple test
> > instances in parallel, well, then they will be run in parallel.
> > 
> > The only reason I CC'd you was because you were so prominent in git
> > blame. O:-)
> 
> Yeah, that's often the case with me :-)
> 
> > > When this happened in the past it meant some QEMU code skipped a
> > > transition, but I can't tell what this has to do with iotests.  
> > 
> > Well, this iotest (which tests a migration configuration) sometimes
> > apparently results in this invalid transition.  But that can't be just
> > the test's fault, as qemu should handle that gracefully.
> 
> Does iotest run a guest or does it only executes parts of qemu
> code? If it's the latter, then I'd guess the test code is missing
> calling qemu code that sets the appropriate state between
> running and postmigrate states.
> 
> > It's probably an issue in the migration code and not so much in vl.c, yes...
> 
> Yeah, I'll let the migration people jump in.

Can we get a log of the qmp commands when it fails?
A 'running->postmigrate' transition is a bit weird; you could get a
something->postmigrate if you migrate a VM twice, and the source is
already in postmigrate, so it tells the destination to go into
postmigrate.   That's odd in itself, but not *that* odd.
But then why is it going running->postmigrate?  That suggests
the source is currently running when it receives an incoming migration,
and that really shouldn't happen.

Dave
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Max Reitz Jan. 23, 2019, 5:16 p.m. UTC | #5
On 23.01.19 17:35, Dr. David Alan Gilbert wrote:
> * Luiz Capitulino (lcapitulino@redhat.com) wrote:
>> On Wed, 23 Jan 2019 17:12:35 +0100
>> Max Reitz <mreitz@redhat.com> wrote:
>>
>>> On 23.01.19 17:04, Luiz Capitulino wrote:
>>>> On Wed, 23 Jan 2019 16:48:49 +0100
>>>> Max Reitz <mreitz@redhat.com> wrote:
>>>>   
>>>>> Hi,
>>>>>
>>>>> When running 169 in parallel (e.g. like so:
>>>>>
>>>>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
>>>>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
>>>>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
>>>>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
>>>>>
>>>>> in four different shells), I get aborts:  
>>>>
>>>> OK, is this part of a test-suite that's also running migration
>>>> tests in parallel or in sequence? In other words, what does
>>>> iotests have to do with migration (sorry if this is stupid
>>>> question, but it's been years I don't do qemu).  
>>>
>>> They run migration tests in sequence, but if you run multiple test
>>> instances in parallel, well, then they will be run in parallel.
>>>
>>> The only reason I CC'd you was because you were so prominent in git
>>> blame. O:-)
>>
>> Yeah, that's often the case with me :-)
>>
>>>> When this happened in the past it meant some QEMU code skipped a
>>>> transition, but I can't tell what this has to do with iotests.  
>>>
>>> Well, this iotest (which tests a migration configuration) sometimes
>>> apparently results in this invalid transition.  But that can't be just
>>> the test's fault, as qemu should handle that gracefully.
>>
>> Does iotest run a guest or does it only executes parts of qemu
>> code? If it's the latter, then I'd guess the test code is missing
>> calling qemu code that sets the appropriate state between
>> running and postmigrate states.
>>
>>> It's probably an issue in the migration code and not so much in vl.c, yes...
>>
>> Yeah, I'll let the migration people jump in.
> 
> Can we get a log of the qmp commands when it fails?
> A 'running->postmigrate' transition is a bit weird; you could get a
> something->postmigrate if you migrate a VM twice, and the source is
> already in postmigrate, so it tells the destination to go into
> postmigrate.   That's odd in itself, but not *that* odd.
> But then why is it going running->postmigrate?  That suggests
> the source is currently running when it receives an incoming migration,
> and that really shouldn't happen.

This is what I get with -d:

> 
> test_do_test_migration_resume_source_persistent__not_migbitmap (__main__.TestDirtyBitmapMigration) ... DEBUG:QMP:>>> {'execute': 'qmp_capabilities'}
> DEBUG:QMP:<<< {u'return': {}}
> DEBUG:QMP:>>> {'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [{'capability': 'events', 'state': True}]}}
> DEBUG:QMP:<<< {u'return': {}}
> DEBUG:QMP:>>> {'execute': 'block-dirty-bitmap-add', 'arguments': {'node': 'drive0', 'persistent': True, 'name': 'bitmap0', 'granularity': 512}}
> DEBUG:QMP:<<< {u'return': {}}
> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 0 65536"'}}
> DEBUG:QMP:<<< {u'return': u''}
> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 983040 65536"'}}
> DEBUG:QMP:<<< {u'return': u''}
> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 655873 4096"'}}
> DEBUG:QMP:<<< {u'return': u''}
> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
> DEBUG:QMP:>>> {'execute': 'migrate', 'arguments': {'uri': 'exec: cat > /tmp/t0/test-qcow2-file/mig_file'}}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 301029}, u'data': {u'status': u'setup'}, u'event': u'MIGRATION'}
> DEBUG:QMP:<<< {u'return': {}}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 302836}, u'data': {u'pass': 1}, u'event': u'MIGRATION_PASS'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 302899}, u'data': {u'status': u'active'}, u'event': u'MIGRATION'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 312321}, u'event': u'STOP'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 312413}, u'data': {u'pass': 2}, u'event': u'MIGRATION_PASS'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 314779}, u'data': {u'status': u'completed'}, u'event': u'MIGRATION'}
> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> DEBUG:QMP:<<< {u'error': {u'class': u'GenericError', u'desc': u"Dirty bitmap 'bitmap0' not found"}}
> DEBUG:QMP:>>> {'execute': 'cont'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 316624}, u'event': u'RESUME'}
> DEBUG:QMP:<<< {u'return': {}}
> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> WARNING:qemu:qemu received signal 6: build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/tmp/t0/test-qcow2-file/tmp1d4GTg/qemua-28430-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/tmp/t0/test-qcow2-file/qemua-28430-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/tmp/t0/test-qcow2-file/disk_a,format=qcow2,cache=writethrough
> ERROR
> 
> ======================================================================
> ERROR: test_do_test_migration_resume_source_persistent__not_migbitmap (__main__.TestDirtyBitmapMigration)
> ----------------------------------------------------------------------
> Traceback (most recent call last):
>   File "169", line 206, in <lambda>
>     setattr(klass, 'test_' + method + name, lambda self: mc(self))
>   File "169", line 113, in do_test_migration_resume_source
>     self.check_bitmap(self.vm_a, sha256)
>   File "169", line 72, in check_bitmap
>     node='drive0', name='bitmap0')
>   File "tests/qemu-iotests/../../scripts/qemu.py", line 368, in qmp
>     return self._qmp.cmd(cmd, args=qmp_args)
>   File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
>     return self.cmd_obj(qmp_cmd)
>   File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
>     resp = self.__json_read()
>   File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in __json_read                                                                                                                                                                
>     data = self.__sockfile.readline()
>   File "/usr/lib64/python2.7/socket.py", line 451, in readline
>     data = self._sock.recv(self._rbufsize)
> error: [Errno 104] Connection reset by peer
> 
> ----------------------------------------------------------------------


I'm not sure whether the problem is really from
x-debug-block-dirty-bitmap-sha256, or from the self.vm_a.shutdown() on
line 116 in iotest 169...  Or maybe from something else entirely?


A slightly different failure pattern is this:

> test_do_test_migration_resume_source_not_persistent__migbitmap (__main__.TestDirtyBitmapMigration) ... DEBUG:QMP:>>> {'execute': 'qmp_capabilities'}
> DEBUG:QMP:<<< {u'return': {}}
> DEBUG:QMP:>>> {'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [{'capability': 'events', 'state': True}, {'capability': 'dirty-bitmaps', 'state': True}]}}
> DEBUG:QMP:<<< {u'return': {}}
> DEBUG:QMP:>>> {'execute': 'block-dirty-bitmap-add', 'arguments': {'node': 'drive0', 'name': 'bitmap0', 'granularity': 512}}
> DEBUG:QMP:<<< {u'return': {}}
> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 0 65536"'}}
> DEBUG:QMP:<<< {u'return': u''}
> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 983040 65536"'}}
> DEBUG:QMP:<<< {u'return': u''}
> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 655873 4096"'}}
> DEBUG:QMP:<<< {u'return': u''}
> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
> DEBUG:QMP:>>> {'execute': 'migrate', 'arguments': {'uri': 'exec: cat > /tmp/t0/test-qcow2-file/mig_file'}}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 466831}, u'data': {u'status': u'setup'}, u'event': u'MIGRATION'}
> DEBUG:QMP:<<< {u'return': {}}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 472171}, u'data': {u'pass': 1}, u'event': u'MIGRATION_PASS'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 472233}, u'data': {u'status': u'active'}, u'event': u'MIGRATION'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 571246}, u'data': {u'pass': 2}, u'event': u'MIGRATION_PASS'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 571665}, u'event': u'STOP'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 571934}, u'data': {u'pass': 3}, u'event': u'MIGRATION_PASS'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 574302}, u'data': {u'status': u'completed'}, u'event': u'MIGRATION'}
> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
> DEBUG:QMP:>>> {'execute': 'cont'}
> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 576305}, u'event': u'RESUME'}
> DEBUG:QMP:<<< {u'return': {}}
> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
> DEBUG:QMP:>>> {'execute': 'quit'}
> WARNING:qemu:qemu received signal 6: build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/tmp/t0/test-qcow2-file/tmpG2ZjZB/qemua-4834-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/tmp/t0/test-qcow2-file/qemua-4834-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/tmp/t0/test-qcow2-file/disk_a,format=qcow2,cache=writethrough
> WARNING:qemu:qemu received signal 6: build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/tmp/t0/test-qcow2-file/tmpG2ZjZB/qemua-4834-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/tmp/t0/test-qcow2-file/qemua-4834-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/tmp/t0/test-qcow2-file/disk_a,format=qcow2,cache=writethrough
> 
> ======================================================================
> FAIL: test_do_test_migration_resume_source_not_persistent__migbitmap (__main__.TestDirtyBitmapMigration)
> ----------------------------------------------------------------------
> Traceback (most recent call last):
>   File "169", line 206, in <lambda>
>     setattr(klass, 'test_' + method + name, lambda self: mc(self))
>   File "169", line 125, in do_test_migration_resume_source
>     self.assertEqual(log, '')
> AssertionError: "qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'\n" != ''

So here it looks more like the shutdown is the problem.

Max
Dr. David Alan Gilbert Jan. 23, 2019, 6:08 p.m. UTC | #6
* Max Reitz (mreitz@redhat.com) wrote:
> On 23.01.19 17:35, Dr. David Alan Gilbert wrote:
> > * Luiz Capitulino (lcapitulino@redhat.com) wrote:
> >> On Wed, 23 Jan 2019 17:12:35 +0100
> >> Max Reitz <mreitz@redhat.com> wrote:
> >>
> >>> On 23.01.19 17:04, Luiz Capitulino wrote:
> >>>> On Wed, 23 Jan 2019 16:48:49 +0100
> >>>> Max Reitz <mreitz@redhat.com> wrote:
> >>>>   
> >>>>> Hi,
> >>>>>
> >>>>> When running 169 in parallel (e.g. like so:
> >>>>>
> >>>>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> >>>>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> >>>>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> >>>>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> >>>>>
> >>>>> in four different shells), I get aborts:  
> >>>>
> >>>> OK, is this part of a test-suite that's also running migration
> >>>> tests in parallel or in sequence? In other words, what does
> >>>> iotests have to do with migration (sorry if this is stupid
> >>>> question, but it's been years I don't do qemu).  
> >>>
> >>> They run migration tests in sequence, but if you run multiple test
> >>> instances in parallel, well, then they will be run in parallel.
> >>>
> >>> The only reason I CC'd you was because you were so prominent in git
> >>> blame. O:-)
> >>
> >> Yeah, that's often the case with me :-)
> >>
> >>>> When this happened in the past it meant some QEMU code skipped a
> >>>> transition, but I can't tell what this has to do with iotests.  
> >>>
> >>> Well, this iotest (which tests a migration configuration) sometimes
> >>> apparently results in this invalid transition.  But that can't be just
> >>> the test's fault, as qemu should handle that gracefully.
> >>
> >> Does iotest run a guest or does it only executes parts of qemu
> >> code? If it's the latter, then I'd guess the test code is missing
> >> calling qemu code that sets the appropriate state between
> >> running and postmigrate states.
> >>
> >>> It's probably an issue in the migration code and not so much in vl.c, yes...
> >>
> >> Yeah, I'll let the migration people jump in.
> > 
> > Can we get a log of the qmp commands when it fails?
> > A 'running->postmigrate' transition is a bit weird; you could get a
> > something->postmigrate if you migrate a VM twice, and the source is
> > already in postmigrate, so it tells the destination to go into
> > postmigrate.   That's odd in itself, but not *that* odd.
> > But then why is it going running->postmigrate?  That suggests
> > the source is currently running when it receives an incoming migration,
> > and that really shouldn't happen.
> 
> This is what I get with -d:
> 
> > 
> > test_do_test_migration_resume_source_persistent__not_migbitmap (__main__.TestDirtyBitmapMigration) ... DEBUG:QMP:>>> {'execute': 'qmp_capabilities'}
> > DEBUG:QMP:<<< {u'return': {}}
> > DEBUG:QMP:>>> {'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [{'capability': 'events', 'state': True}]}}
> > DEBUG:QMP:<<< {u'return': {}}
> > DEBUG:QMP:>>> {'execute': 'block-dirty-bitmap-add', 'arguments': {'node': 'drive0', 'persistent': True, 'name': 'bitmap0', 'granularity': 512}}
> > DEBUG:QMP:<<< {u'return': {}}
> > DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 0 65536"'}}
> > DEBUG:QMP:<<< {u'return': u''}
> > DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 983040 65536"'}}
> > DEBUG:QMP:<<< {u'return': u''}
> > DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 655873 4096"'}}
> > DEBUG:QMP:<<< {u'return': u''}
> > DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> > DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
> > DEBUG:QMP:>>> {'execute': 'migrate', 'arguments': {'uri': 'exec: cat > /tmp/t0/test-qcow2-file/mig_file'}}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 301029}, u'data': {u'status': u'setup'}, u'event': u'MIGRATION'}
> > DEBUG:QMP:<<< {u'return': {}}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 302836}, u'data': {u'pass': 1}, u'event': u'MIGRATION_PASS'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 302899}, u'data': {u'status': u'active'}, u'event': u'MIGRATION'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 312321}, u'event': u'STOP'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 312413}, u'data': {u'pass': 2}, u'event': u'MIGRATION_PASS'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 314779}, u'data': {u'status': u'completed'}, u'event': u'MIGRATION'}
> > DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> > DEBUG:QMP:<<< {u'error': {u'class': u'GenericError', u'desc': u"Dirty bitmap 'bitmap0' not found"}}
> > DEBUG:QMP:>>> {'execute': 'cont'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 316624}, u'event': u'RESUME'}
> > DEBUG:QMP:<<< {u'return': {}}
> > DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> > WARNING:qemu:qemu received signal 6: build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/tmp/t0/test-qcow2-file/tmp1d4GTg/qemua-28430-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/tmp/t0/test-qcow2-file/qemua-28430-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/tmp/t0/test-qcow2-file/disk_a,format=qcow2,cache=writethrough
> > ERROR
> > 
> > ======================================================================
> > ERROR: test_do_test_migration_resume_source_persistent__not_migbitmap (__main__.TestDirtyBitmapMigration)
> > ----------------------------------------------------------------------
> > Traceback (most recent call last):
> >   File "169", line 206, in <lambda>
> >     setattr(klass, 'test_' + method + name, lambda self: mc(self))
> >   File "169", line 113, in do_test_migration_resume_source
> >     self.check_bitmap(self.vm_a, sha256)
> >   File "169", line 72, in check_bitmap
> >     node='drive0', name='bitmap0')
> >   File "tests/qemu-iotests/../../scripts/qemu.py", line 368, in qmp
> >     return self._qmp.cmd(cmd, args=qmp_args)
> >   File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
> >     return self.cmd_obj(qmp_cmd)
> >   File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
> >     resp = self.__json_read()
> >   File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in __json_read                                                                                                                                                                
> >     data = self.__sockfile.readline()
> >   File "/usr/lib64/python2.7/socket.py", line 451, in readline
> >     data = self._sock.recv(self._rbufsize)
> > error: [Errno 104] Connection reset by peer
> > 
> > ----------------------------------------------------------------------
> 
> 
> I'm not sure whether the problem is really from
> x-debug-block-dirty-bitmap-sha256, or from the self.vm_a.shutdown() on
> line 116 in iotest 169...  Or maybe from something else entirely?
> 
> 
> A slightly different failure pattern is this:
> 
> > test_do_test_migration_resume_source_not_persistent__migbitmap (__main__.TestDirtyBitmapMigration) ... DEBUG:QMP:>>> {'execute': 'qmp_capabilities'}
> > DEBUG:QMP:<<< {u'return': {}}
> > DEBUG:QMP:>>> {'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [{'capability': 'events', 'state': True}, {'capability': 'dirty-bitmaps', 'state': True}]}}
> > DEBUG:QMP:<<< {u'return': {}}
> > DEBUG:QMP:>>> {'execute': 'block-dirty-bitmap-add', 'arguments': {'node': 'drive0', 'name': 'bitmap0', 'granularity': 512}}
> > DEBUG:QMP:<<< {u'return': {}}
> > DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 0 65536"'}}
> > DEBUG:QMP:<<< {u'return': u''}
> > DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 983040 65536"'}}
> > DEBUG:QMP:<<< {u'return': u''}
> > DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 655873 4096"'}}
> > DEBUG:QMP:<<< {u'return': u''}
> > DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> > DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
> > DEBUG:QMP:>>> {'execute': 'migrate', 'arguments': {'uri': 'exec: cat > /tmp/t0/test-qcow2-file/mig_file'}}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 466831}, u'data': {u'status': u'setup'}, u'event': u'MIGRATION'}
> > DEBUG:QMP:<<< {u'return': {}}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 472171}, u'data': {u'pass': 1}, u'event': u'MIGRATION_PASS'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 472233}, u'data': {u'status': u'active'}, u'event': u'MIGRATION'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 571246}, u'data': {u'pass': 2}, u'event': u'MIGRATION_PASS'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 571665}, u'event': u'STOP'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 571934}, u'data': {u'pass': 3}, u'event': u'MIGRATION_PASS'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 574302}, u'data': {u'status': u'completed'}, u'event': u'MIGRATION'}
> > DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> > DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
> > DEBUG:QMP:>>> {'execute': 'cont'}
> > DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 576305}, u'event': u'RESUME'}
> > DEBUG:QMP:<<< {u'return': {}}
> > DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
> > DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
> > DEBUG:QMP:>>> {'execute': 'quit'}
> > WARNING:qemu:qemu received signal 6: build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/tmp/t0/test-qcow2-file/tmpG2ZjZB/qemua-4834-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/tmp/t0/test-qcow2-file/qemua-4834-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/tmp/t0/test-qcow2-file/disk_a,format=qcow2,cache=writethrough
> > WARNING:qemu:qemu received signal 6: build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/tmp/t0/test-qcow2-file/tmpG2ZjZB/qemua-4834-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/tmp/t0/test-qcow2-file/qemua-4834-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/tmp/t0/test-qcow2-file/disk_a,format=qcow2,cache=writethrough
> > 
> > ======================================================================
> > FAIL: test_do_test_migration_resume_source_not_persistent__migbitmap (__main__.TestDirtyBitmapMigration)
> > ----------------------------------------------------------------------
> > Traceback (most recent call last):
> >   File "169", line 206, in <lambda>
> >     setattr(klass, 'test_' + method + name, lambda self: mc(self))
> >   File "169", line 125, in do_test_migration_resume_source
> >     self.assertEqual(log, '')
> > AssertionError: "qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'\n" != ''
> 
> So here looks more like the shutdown is the problem.

It doesn't reproduce for me here after a few minutes of testing; can
you get me a full all-threads backtrace for those asserts?

Dave

> Max
> 



--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Vladimir Sementsov-Ogievskiy Jan. 24, 2019, 8:08 a.m. UTC | #7
23.01.2019 21:08, Dr. David Alan Gilbert wrote:
> * Max Reitz (mreitz@redhat.com) wrote:
>> On 23.01.19 17:35, Dr. David Alan Gilbert wrote:
>>> * Luiz Capitulino (lcapitulino@redhat.com) wrote:
>>>> On Wed, 23 Jan 2019 17:12:35 +0100
>>>> Max Reitz <mreitz@redhat.com> wrote:
>>>>
>>>>> On 23.01.19 17:04, Luiz Capitulino wrote:
>>>>>> On Wed, 23 Jan 2019 16:48:49 +0100
>>>>>> Max Reitz <mreitz@redhat.com> wrote:
>>>>>>    
>>>>>>> Hi,
>>>>>>>
>>>>>>> When running 169 in parallel (e.g. like so:
>>>>>>>
>>>>>>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
>>>>>>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
>>>>>>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
>>>>>>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
>>>>>>>
>>>>>>> in four different shells), I get aborts:
>>>>>>
>>>>>> OK, is this part of a test-suite that's also running migration
>>>>>> tests in parallel or in sequence? In other words, what does
>>>>>> iotests have to do with migration (sorry if this is stupid
>>>>>> question, but it's been years I don't do qemu).
>>>>>
>>>>> They run migration tests in sequence, but if you run multiple test
>>>>> instances in parallel, well, then they will be run in parallel.
>>>>>
>>>>> The only reason I CC'd you was because you were so prominent in git
>>>>> blame. O:-)
>>>>
>>>> Yeah, that's often the case with me :-)
>>>>
>>>>>> When this happened in the past it meant some QEMU code skipped a
>>>>>> transition, but I can't tell what this has to do with iotests.
>>>>>
>>>>> Well, this iotest (which tests a migration configuration) sometimes
>>>>> apparently results in this invalid transition.  But that can't be just
>>>>> the test's fault, as qemu should handle that gracefully.
>>>>
>>>> Does iotest run a guest or does it only executes parts of qemu
>>>> code? If it's the latter, then I'd guess the test code is missing
>>>> calling qemu code that sets the appropriate state between
>>>> running and postmigrate states.
>>>>
>>>>> It's probably an issue in the migration code and not so much in vl.c, yes...
>>>>
>>>> Yeah, I'll let the migration people jump in.
>>>
>>> Can we get a log of the qmp commands when it fails?
>>> A 'running->postmigrate' transition is a bit weird; you could get a
>>> something->postmigrate if you migrate a VM twice, and the source is
>>> already in postmigrate, so it tells the destination to go into
>>> postmigrate.   That's odd in itself, but not *that* odd.
>>> But then why is it going running->postmigrate?  That suggests
>>> the source is currently running when it receives an incoming migration,
>>> and that really shouldn't happen.
>>
>> This is what I get with -d:
>>
>>>
>>> test_do_test_migration_resume_source_persistent__not_migbitmap (__main__.TestDirtyBitmapMigration) ... DEBUG:QMP:>>> {'execute': 'qmp_capabilities'}
>>> DEBUG:QMP:<<< {u'return': {}}
>>> DEBUG:QMP:>>> {'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [{'capability': 'events', 'state': True}]}}
>>> DEBUG:QMP:<<< {u'return': {}}
>>> DEBUG:QMP:>>> {'execute': 'block-dirty-bitmap-add', 'arguments': {'node': 'drive0', 'persistent': True, 'name': 'bitmap0', 'granularity': 512}}
>>> DEBUG:QMP:<<< {u'return': {}}
>>> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 0 65536"'}}
>>> DEBUG:QMP:<<< {u'return': u''}
>>> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 983040 65536"'}}
>>> DEBUG:QMP:<<< {u'return': u''}
>>> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 655873 4096"'}}
>>> DEBUG:QMP:<<< {u'return': u''}
>>> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
>>> DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
>>> DEBUG:QMP:>>> {'execute': 'migrate', 'arguments': {'uri': 'exec: cat > /tmp/t0/test-qcow2-file/mig_file'}}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 301029}, u'data': {u'status': u'setup'}, u'event': u'MIGRATION'}
>>> DEBUG:QMP:<<< {u'return': {}}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 302836}, u'data': {u'pass': 1}, u'event': u'MIGRATION_PASS'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 302899}, u'data': {u'status': u'active'}, u'event': u'MIGRATION'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 312321}, u'event': u'STOP'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 312413}, u'data': {u'pass': 2}, u'event': u'MIGRATION_PASS'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 314779}, u'data': {u'status': u'completed'}, u'event': u'MIGRATION'}
>>> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
>>> DEBUG:QMP:<<< {u'error': {u'class': u'GenericError', u'desc': u"Dirty bitmap 'bitmap0' not found"}}
>>> DEBUG:QMP:>>> {'execute': 'cont'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263213, u'microseconds': 316624}, u'event': u'RESUME'}
>>> DEBUG:QMP:<<< {u'return': {}}
>>> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
>>> WARNING:qemu:qemu received signal 6: build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/tmp/t0/test-qcow2-file/tmp1d4GTg/qemua-28430-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/tmp/t0/test-qcow2-file/qemua-28430-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/tmp/t0/test-qcow2-file/disk_a,format=qcow2,cache=writethrough
>>> ERROR
>>>
>>> ======================================================================
>>> ERROR: test_do_test_migration_resume_source_persistent__not_migbitmap (__main__.TestDirtyBitmapMigration)
>>> ----------------------------------------------------------------------
>>> Traceback (most recent call last):
>>>    File "169", line 206, in <lambda>
>>>      setattr(klass, 'test_' + method + name, lambda self: mc(self))
>>>    File "169", line 113, in do_test_migration_resume_source
>>>      self.check_bitmap(self.vm_a, sha256)
>>>    File "169", line 72, in check_bitmap
>>>      node='drive0', name='bitmap0')
>>>    File "tests/qemu-iotests/../../scripts/qemu.py", line 368, in qmp
>>>      return self._qmp.cmd(cmd, args=qmp_args)
>>>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
>>>      return self.cmd_obj(qmp_cmd)
>>>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
>>>      resp = self.__json_read()
>>>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in __json_read
>>>      data = self.__sockfile.readline()
>>>    File "/usr/lib64/python2.7/socket.py", line 451, in readline
>>>      data = self._sock.recv(self._rbufsize)
>>> error: [Errno 104] Connection reset by peer
>>>
>>> ----------------------------------------------------------------------
>>
>>
>> I'm not sure whether the problem is really from
>> x-debug-block-dirty-bitmap-sha256, or from the self.vm_a.shutdown() on
>> line 116 in iotest 169...  Or maybe from something else entirely?
>>
>>
>> A slightly different failure pattern is this:
>>
>>> test_do_test_migration_resume_source_not_persistent__migbitmap (__main__.TestDirtyBitmapMigration) ... DEBUG:QMP:>>> {'execute': 'qmp_capabilities'}
>>> DEBUG:QMP:<<< {u'return': {}}
>>> DEBUG:QMP:>>> {'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [{'capability': 'events', 'state': True}, {'capability': 'dirty-bitmaps', 'state': True}]}}
>>> DEBUG:QMP:<<< {u'return': {}}
>>> DEBUG:QMP:>>> {'execute': 'block-dirty-bitmap-add', 'arguments': {'node': 'drive0', 'name': 'bitmap0', 'granularity': 512}}
>>> DEBUG:QMP:<<< {u'return': {}}
>>> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 0 65536"'}}
>>> DEBUG:QMP:<<< {u'return': u''}
>>> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 983040 65536"'}}
>>> DEBUG:QMP:<<< {u'return': u''}
>>> DEBUG:QMP:>>> {'execute': 'human-monitor-command', 'arguments': {'command-line': 'qemu-io drive0 "write 655873 4096"'}}
>>> DEBUG:QMP:<<< {u'return': u''}
>>> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
>>> DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
>>> DEBUG:QMP:>>> {'execute': 'migrate', 'arguments': {'uri': 'exec: cat > /tmp/t0/test-qcow2-file/mig_file'}}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 466831}, u'data': {u'status': u'setup'}, u'event': u'MIGRATION'}
>>> DEBUG:QMP:<<< {u'return': {}}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 472171}, u'data': {u'pass': 1}, u'event': u'MIGRATION_PASS'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 472233}, u'data': {u'status': u'active'}, u'event': u'MIGRATION'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 571246}, u'data': {u'pass': 2}, u'event': u'MIGRATION_PASS'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 571665}, u'event': u'STOP'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 571934}, u'data': {u'pass': 3}, u'event': u'MIGRATION_PASS'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 574302}, u'data': {u'status': u'completed'}, u'event': u'MIGRATION'}
>>> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
>>> DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
>>> DEBUG:QMP:>>> {'execute': 'cont'}
>>> DEBUG:QMP:<<< {u'timestamp': {u'seconds': 1548263493, u'microseconds': 576305}, u'event': u'RESUME'}
>>> DEBUG:QMP:<<< {u'return': {}}
>>> DEBUG:QMP:>>> {'execute': 'x-debug-block-dirty-bitmap-sha256', 'arguments': {'node': 'drive0', 'name': 'bitmap0'}}
>>> DEBUG:QMP:<<< {u'return': {u'sha256': u'01d2ebedcb8f549a2547dbf8e231c410e3e747a9479e98909fc936e0035cf8b1'}}
>>> DEBUG:QMP:>>> {'execute': 'quit'}
>>> WARNING:qemu:qemu received signal 6: build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/tmp/t0/test-qcow2-file/tmpG2ZjZB/qemua-4834-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/tmp/t0/test-qcow2-file/qemua-4834-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/tmp/t0/test-qcow2-file/disk_a,format=qcow2,cache=writethrough
>>> WARNING:qemu:qemu received signal 6: build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/tmp/t0/test-qcow2-file/tmpG2ZjZB/qemua-4834-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/tmp/t0/test-qcow2-file/qemua-4834-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/tmp/t0/test-qcow2-file/disk_a,format=qcow2,cache=writethrough
>>>
>>> ======================================================================
>>> FAIL: test_do_test_migration_resume_source_not_persistent__migbitmap (__main__.TestDirtyBitmapMigration)
>>> ----------------------------------------------------------------------
>>> Traceback (most recent call last):
>>>    File "169", line 206, in <lambda>
>>>      setattr(klass, 'test_' + method + name, lambda self: mc(self))
>>>    File "169", line 125, in do_test_migration_resume_source
>>>      self.assertEqual(log, '')
>>> AssertionError: "qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'\n" != ''
>>
>> So here looks more like the shutdown is the problem.
> 
> It doesn't repeat for me here in a  few minutes of testing;  can you get
> me a full all-threads backtrace for those asserts?
> 

It reproduced for me after about 1-3 minutes on tmpfs; did you try that?
Vladimir Sementsov-Ogievskiy Jan. 24, 2019, 9:29 a.m. UTC | #8
23.01.2019 18:48, Max Reitz wrote:
> Hi,
> 
> When running 169 in parallel (e.g. like so:
> 
> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> 
> in four different shells), I get aborts:
> 
> (Often I get segfaults, but that's because of
> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
> feel free to apply the attached patch to make them go away)
> 
> 
> WARNING:qemu:qemu received signal 6:
> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
> -mon chardev=mon,mode=control -display none -vga none -qtest
> unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
> -nodefaults -machine accel=qtest -drive
> if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
> .................E..
> ======================================================================
> ERROR:
> test_do_test_migration_resume_source_not_persistent__not_migbitmap
> (__main__.TestDirtyBitmapMigration)
> ----------------------------------------------------------------------
> Traceback (most recent call last):
>    File "169", line 206, in <lambda>
>      setattr(klass, 'test_' + method + name, lambda self: mc(self))
>    File "169", line 113, in do_test_migration_resume_source
>      self.check_bitmap(self.vm_a, sha256)
>    File "169", line 72, in check_bitmap
>      node='drive0', name='bitmap0')
>    File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
>      return self._qmp.cmd(cmd, args=qmp_args)
>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
>      return self.cmd_obj(qmp_cmd)
>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
>      resp = self.__json_read()
>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
> __json_read
>      data = self.__sockfile.readline()
>    File "/usr/lib64/python2.7/socket.py", line 451, in readline
>      data = self._sock.recv(self._rbufsize)
> error: [Errno 104] Connection reset by peer
> 
> ----------------------------------------------------------------------
> Ran 20 tests
> 
> FAILED (errors=1)
> 
> 
> Or:
> 
> WARNING:qemu:qemu received signal 6:
> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> -mon chardev=mon,mode=control -display none -vga none -qtest
> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> -machine accel=qtest -drive
> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> WARNING:qemu:qemu received signal 6:
> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> -mon chardev=mon,mode=control -display none -vga none -qtest
> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> -machine accel=qtest -drive
> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> 
> ...................F
> ======================================================================
> FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
> (__main__.TestDirtyBitmapMigration)
> ----------------------------------------------------------------------
> Traceback (most recent call last):
>    File "169", line 206, in <lambda>
>      setattr(klass, 'test_' + method + name, lambda self: mc(self))
>    File "169", line 125, in do_test_migration_resume_source
>      self.assertEqual(log, '')
> AssertionError: "qemu-system-x86_64: invalid runstate transition:
> 'running' -> 'postmigrate'\n" != ''
> 
> ----------------------------------------------------------------------
> Ran 20 tests
> 
> FAILED (failures=1)
> 
> 
> The backtrace always goes like this:
> 
> (gdb) bt
> #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
> #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> at vl.c:742
> #3  0x000055a46ebbb1a6 in runstate_set
> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
> #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
> at migration/migration.c:2972
> #5  0x000055a46ed39129 in migration_thread
> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
> #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
> util/qemu-thread-posix.c:502
> 
> 
> #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
> #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
> (gdb) frame 2
> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> at vl.c:742
> 742             abort();
> (gdb) print current_run_state
> $1 = RUN_STATE_RUNNING
> 
> 
> Neither of migration or runstates are my strong suite, so I thought I'd
> report it before diving into it.
> 
> Max
> 

[...]
450556@1548321072.888979:migrate_set_state new state active
450556@1548321072.889022:migration_thread_setup_complete
450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
450556@1548321072.988283:migration_bitmap_sync_start
450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
450556@1548321072.988495:migration_thread_low_pending 2048
migration_completion
450556@1548321072.988541:runstate_set current_state 9 new_state 7
450556@1548321072.988780:migration_bitmap_sync_start
450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
450556@1548321072.993112:migrate_global_state_pre_save saved state: running
[1] 450556@1548321072.993237:migrate_set_state new state completed
450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
450556@1548321072.993697:migration_thread_after_loop
[2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
450476@1548321072.994461:runstate_set current_state 7 new_state 9
qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'


Aha. So, in the test we wait for the MIGRATION 'completed' event [1], then run qmp_cont [2] and go to RUNNING.
Then, in the migration thread, we reach migration_iteration_finish() and fail to go to POSTMIGRATE.
(Note that this test case is about resuming the source after migration.)

So a fix for the test may be to additionally wait for POSTMIGRATE, not only for MIGRATION completion.

But how do we fix QEMU so that it does not crash? Maybe forbid some transitions (FINISH_MIGRATE -> RUNNING),
  or at least make qmp_cont error out if the runstate is FINISH_MIGRATE?
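
A minimal sketch of the first option, assuming the transition table in vl.c
(runstate_transitions_def, which is what the abort in runstate_set() checks
against) keeps its current shape; the neighbouring entries shown here are
from memory and may not match the tree exactly:

/* Sketch only: stop allowing FINISH_MIGRATE -> RUNNING, so that 'cont'
 * during finish-migrate fails cleanly instead of racing the migration
 * thread.  Whether any legitimate caller relies on this transition would
 * have to be checked first. */
static const RunStateTransition runstate_transitions_def[] = {
    /* ... other transitions unchanged ... */

    /* { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },  <- drop this entry */
    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },

    /* ... */
};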
Vladimir Sementsov-Ogievskiy Jan. 24, 2019, 9:49 a.m. UTC | #9
24.01.2019 12:29, Vladimir Sementsov-Ogievskiy wrote:
> 23.01.2019 18:48, Max Reitz wrote:
>> Hi,
>>
>> When running 169 in parallel (e.g. like so:
>>
>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
>>
>> in four different shells), I get aborts:
>>
>> (Often I get segfaults, but that's because of
>> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
>> feel free to apply the attached patch to make them go away)
>>
>>
>> WARNING:qemu:qemu received signal 6:
>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>> -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
>> -mon chardev=mon,mode=control -display none -vga none -qtest
>> unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
>> -nodefaults -machine accel=qtest -drive
>> if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
>> .................E..
>> ======================================================================
>> ERROR:
>> test_do_test_migration_resume_source_not_persistent__not_migbitmap
>> (__main__.TestDirtyBitmapMigration)
>> ----------------------------------------------------------------------
>> Traceback (most recent call last):
>>    File "169", line 206, in <lambda>
>>      setattr(klass, 'test_' + method + name, lambda self: mc(self))
>>    File "169", line 113, in do_test_migration_resume_source
>>      self.check_bitmap(self.vm_a, sha256)
>>    File "169", line 72, in check_bitmap
>>      node='drive0', name='bitmap0')
>>    File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
>>      return self._qmp.cmd(cmd, args=qmp_args)
>>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
>>      return self.cmd_obj(qmp_cmd)
>>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
>>      resp = self.__json_read()
>>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
>> __json_read
>>      data = self.__sockfile.readline()
>>    File "/usr/lib64/python2.7/socket.py", line 451, in readline
>>      data = self._sock.recv(self._rbufsize)
>> error: [Errno 104] Connection reset by peer
>>
>> ----------------------------------------------------------------------
>> Ran 20 tests
>>
>> FAILED (errors=1)
>>
>>
>> Or:
>>
>> WARNING:qemu:qemu received signal 6:
>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
>> -mon chardev=mon,mode=control -display none -vga none -qtest
>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
>> -machine accel=qtest -drive
>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
>> WARNING:qemu:qemu received signal 6:
>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
>> -mon chardev=mon,mode=control -display none -vga none -qtest
>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
>> -machine accel=qtest -drive
>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
>>
>> ...................F
>> ======================================================================
>> FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
>> (__main__.TestDirtyBitmapMigration)
>> ----------------------------------------------------------------------
>> Traceback (most recent call last):
>>    File "169", line 206, in <lambda>
>>      setattr(klass, 'test_' + method + name, lambda self: mc(self))
>>    File "169", line 125, in do_test_migration_resume_source
>>      self.assertEqual(log, '')
>> AssertionError: "qemu-system-x86_64: invalid runstate transition:
>> 'running' -> 'postmigrate'\n" != ''
>>
>> ----------------------------------------------------------------------
>> Ran 20 tests
>>
>> FAILED (failures=1)
>>
>>
>> The backtrace always goes like this:
>>
>> (gdb) bt
>> #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
>> #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
>> at vl.c:742
>> #3  0x000055a46ebbb1a6 in runstate_set
>> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
>> #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
>> at migration/migration.c:2972
>> #5  0x000055a46ed39129 in migration_thread
>> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
>> #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
>> util/qemu-thread-posix.c:502
>>
>>
>> #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
>> #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
>> (gdb) frame 2
>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
>> at vl.c:742
>> 742             abort();
>> (gdb) print current_run_state
>> $1 = RUN_STATE_RUNNING
>>
>>
>> Neither of migration or runstates are my strong suite, so I thought I'd
>> report it before diving into it.
>>
>> Max
>>
> 
> [...]
> 450556@1548321072.888979:migrate_set_state new state active
> 450556@1548321072.889022:migration_thread_setup_complete
> 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
> 450556@1548321072.988283:migration_bitmap_sync_start
> 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
> 450556@1548321072.988495:migration_thread_low_pending 2048
> migration_completion
> 450556@1548321072.988541:runstate_set current_state 9 new_state 7
> 450556@1548321072.988780:migration_bitmap_sync_start
> 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
> 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
> [1] 450556@1548321072.993237:migrate_set_state new state completed
> 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
> 450556@1548321072.993697:migration_thread_after_loop
> [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
> 450476@1548321072.994461:runstate_set current_state 7 new_state 9
> qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
> 
> 
> Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] and go to RUNNING.
> then in migration thread we go to migration_iteration_finish() and fail to go to POSTMIGRATE.
> (note, that this testcase is about resuming source after migration)
> 
> So, fix for test may be additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETION.
> 
> But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
>   or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?
> 
> 
> 


OK, this seems to fix the test for me:

diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169
index 527aebd0cb..dd98168c8d 100755
--- a/tests/qemu-iotests/169
+++ b/tests/qemu-iotests/169
@@ -102,6 +102,10 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
              event = self.vm_a.event_wait('MIGRATION')
              if event['data']['status'] == 'completed':
                  break
+        while True:
+            result = self.vm_a.qmp('query-status')
+            if (result['return']['status'] == 'postmigrate'):
+                break

          # test that bitmap is still here
          removed = (not migrate_bitmaps) and persistent

-


And this is the full diff of the test I used, which skips checking the log (as I enabled traces) and skips all other test cases:

(hmm, we definitely need a simple way to run a single test case from a test, without editing it)

diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169
index 527aebd0cb..dd98168c8d 100755
--- a/tests/qemu-iotests/169
+++ b/tests/qemu-iotests/169
@@ -102,6 +102,10 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
              event = self.vm_a.event_wait('MIGRATION')
              if event['data']['status'] == 'completed':
                  break
+        while True:
+            result = self.vm_a.qmp('query-status')
+            if (result['return']['status'] == 'postmigrate'):
+                break

          # test that bitmap is still here
          removed = (not migrate_bitmaps) and persistent
@@ -117,12 +121,12 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):

          # catch 'Could not reopen qcow2 layer: Bitmap already exists'
          # possible error
-        log = self.vm_a.get_log()
-        log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log)
-        log = re.sub(r'^(wrote .* bytes at offset .*\n.*KiB.*ops.*sec.*\n){3}',
-                     '', log)
-        log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log)
-        self.assertEqual(log, '')
+        #log = self.vm_a.get_log()
+        #log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log)
+        #log = re.sub(r'^(wrote .* bytes at offset .*\n.*KiB.*ops.*sec.*\n){3}',
+                     #'', log)
+        #log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log)
+        #self.assertEqual(log, '')

          # test that bitmap is still persistent
          self.vm_a.launch()
@@ -211,14 +215,16 @@ for cmb in list(itertools.product((True, False), repeat=4)):
      name += '_online' if cmb[2] else '_offline'
      name += '_shared' if cmb[3] else '_nonshared'

-    inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration',
+    if False:
+        inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration',
                       *list(cmb))

  for cmb in list(itertools.product((True, False), repeat=2)):
      name = ('_' if cmb[0] else '_not_') + 'persistent_'
      name += ('_' if cmb[1] else '_not_') + 'migbitmap'

-    inject_test_case(TestDirtyBitmapMigration, name,
+    if name == '_persistent__migbitmap':
+        inject_test_case(TestDirtyBitmapMigration, name,
                       'do_test_migration_resume_source', *list(cmb))

  if __name__ == '__main__':
diff --git a/tests/qemu-iotests/169.out b/tests/qemu-iotests/169.out
index 3a89159833..ae1213e6f8 100644
--- a/tests/qemu-iotests/169.out
+++ b/tests/qemu-iotests/169.out
@@ -1,5 +1,5 @@
-....................
+.
  ----------------------------------------------------------------------
-Ran 20 tests
+Ran 1 tests

  OK
Dr. David Alan Gilbert Jan. 24, 2019, 10:10 a.m. UTC | #10
* Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> 24.01.2019 12:29, Vladimir Sementsov-Ogievskiy wrote:
> > 23.01.2019 18:48, Max Reitz wrote:
> >> Hi,
> >>
> >> When running 169 in parallel (e.g. like so:
> >>
> >> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> >> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> >> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> >> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> >>
> >> in four different shells), I get aborts:
> >>
> >> (Often I get segfaults, but that's because of
> >> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
> >> feel free to apply the attached patch to make them go away)
> >>
> >>
> >> WARNING:qemu:qemu received signal 6:
> >> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> >> -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
> >> -mon chardev=mon,mode=control -display none -vga none -qtest
> >> unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
> >> -nodefaults -machine accel=qtest -drive
> >> if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
> >> .................E..
> >> ======================================================================
> >> ERROR:
> >> test_do_test_migration_resume_source_not_persistent__not_migbitmap
> >> (__main__.TestDirtyBitmapMigration)
> >> ----------------------------------------------------------------------
> >> Traceback (most recent call last):
> >>    File "169", line 206, in <lambda>
> >>      setattr(klass, 'test_' + method + name, lambda self: mc(self))
> >>    File "169", line 113, in do_test_migration_resume_source
> >>      self.check_bitmap(self.vm_a, sha256)
> >>    File "169", line 72, in check_bitmap
> >>      node='drive0', name='bitmap0')
> >>    File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
> >>      return self._qmp.cmd(cmd, args=qmp_args)
> >>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
> >>      return self.cmd_obj(qmp_cmd)
> >>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
> >>      resp = self.__json_read()
> >>    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
> >> __json_read
> >>      data = self.__sockfile.readline()
> >>    File "/usr/lib64/python2.7/socket.py", line 451, in readline
> >>      data = self._sock.recv(self._rbufsize)
> >> error: [Errno 104] Connection reset by peer
> >>
> >> ----------------------------------------------------------------------
> >> Ran 20 tests
> >>
> >> FAILED (errors=1)
> >>
> >>
> >> Or:
> >>
> >> WARNING:qemu:qemu received signal 6:
> >> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> >> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> >> -mon chardev=mon,mode=control -display none -vga none -qtest
> >> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> >> -machine accel=qtest -drive
> >> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> >> WARNING:qemu:qemu received signal 6:
> >> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> >> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> >> -mon chardev=mon,mode=control -display none -vga none -qtest
> >> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> >> -machine accel=qtest -drive
> >> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> >>
> >> ...................F
> >> ======================================================================
> >> FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
> >> (__main__.TestDirtyBitmapMigration)
> >> ----------------------------------------------------------------------
> >> Traceback (most recent call last):
> >>    File "169", line 206, in <lambda>
> >>      setattr(klass, 'test_' + method + name, lambda self: mc(self))
> >>    File "169", line 125, in do_test_migration_resume_source
> >>      self.assertEqual(log, '')
> >> AssertionError: "qemu-system-x86_64: invalid runstate transition:
> >> 'running' -> 'postmigrate'\n" != ''
> >>
> >> ----------------------------------------------------------------------
> >> Ran 20 tests
> >>
> >> FAILED (failures=1)
> >>
> >>
> >> The backtrace always goes like this:
> >>
> >> (gdb) bt
> >> #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
> >> #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
> >> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> >> at vl.c:742
> >> #3  0x000055a46ebbb1a6 in runstate_set
> >> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
> >> #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
> >> at migration/migration.c:2972
> >> #5  0x000055a46ed39129 in migration_thread
> >> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
> >> #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
> >> util/qemu-thread-posix.c:502
> >>
> >>
> >> #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
> >> #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
> >> (gdb) frame 2
> >> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> >> at vl.c:742
> >> 742             abort();
> >> (gdb) print current_run_state
> >> $1 = RUN_STATE_RUNNING
> >>
> >>
> >> Neither of migration or runstates are my strong suite, so I thought I'd
> >> report it before diving into it.
> >>
> >> Max
> >>
> > 
> > [...]
> > 450556@1548321072.888979:migrate_set_state new state active
> > 450556@1548321072.889022:migration_thread_setup_complete
> > 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
> > 450556@1548321072.988283:migration_bitmap_sync_start
> > 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
> > 450556@1548321072.988495:migration_thread_low_pending 2048
> > migration_completion
> > 450556@1548321072.988541:runstate_set current_state 9 new_state 7
> > 450556@1548321072.988780:migration_bitmap_sync_start
> > 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
> > 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
> > [1] 450556@1548321072.993237:migrate_set_state new state completed
> > 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
> > 450556@1548321072.993697:migration_thread_after_loop
> > [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
> > 450476@1548321072.994461:runstate_set current_state 7 new_state 9
> > qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
> > 
> > 
> > Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] and go to RUNNING.
> > then in migration thread we go to migration_iteration_finish() and fail to go to POSTMIGRATE.
> > (note, that this testcase is about resuming source after migration)
> > 
> > So, fix for test may be additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETION.
> > 
> > But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
> >   or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?
> > 
> > 
> > 
> 
> 
> Ok, this seems to fix test for me:
> 
> diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169
> index 527aebd0cb..dd98168c8d 100755
> --- a/tests/qemu-iotests/169
> +++ b/tests/qemu-iotests/169
> @@ -102,6 +102,10 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
>               event = self.vm_a.event_wait('MIGRATION')
>               if event['data']['status'] == 'completed':
>                   break
> +        while True:
> +            result = self.vm_a.qmp('query-status')
> +            if (result['return']['status'] == 'postmigrate'):
> +                break

Hmm; I can see that it might help; although I still can't quite see
how the bad transition that was reported can have happened.

Dave

>           # test that bitmap is still here
>           removed = (not migrate_bitmaps) and persistent
> 
> -
> 
> 
> And this is the full diff of the test I used, to skip checking log (as I enabled traces) and to skip all other test-cases:
> 
> (hmm, we definitely need simple way to run one test case from test, without editing it)
> 
> diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169
> index 527aebd0cb..dd98168c8d 100755
> --- a/tests/qemu-iotests/169
> +++ b/tests/qemu-iotests/169
> @@ -102,6 +102,10 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
>               event = self.vm_a.event_wait('MIGRATION')
>               if event['data']['status'] == 'completed':
>                   break
> +        while True:
> +            result = self.vm_a.qmp('query-status')
> +            if (result['return']['status'] == 'postmigrate'):
> +                break
> 
>           # test that bitmap is still here
>           removed = (not migrate_bitmaps) and persistent
> @@ -117,12 +121,12 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
> 
>           # catch 'Could not reopen qcow2 layer: Bitmap already exists'
>           # possible error
> -        log = self.vm_a.get_log()
> -        log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log)
> -        log = re.sub(r'^(wrote .* bytes at offset .*\n.*KiB.*ops.*sec.*\n){3}',
> -                     '', log)
> -        log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log)
> -        self.assertEqual(log, '')
> +        #log = self.vm_a.get_log()
> +        #log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log)
> +        #log = re.sub(r'^(wrote .* bytes at offset .*\n.*KiB.*ops.*sec.*\n){3}',
> +                     #'', log)
> +        #log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log)
> +        #self.assertEqual(log, '')
> 
>           # test that bitmap is still persistent
>           self.vm_a.launch()
> @@ -211,14 +215,16 @@ for cmb in list(itertools.product((True, False), repeat=4)):
>       name += '_online' if cmb[2] else '_offline'
>       name += '_shared' if cmb[3] else '_nonshared'
> 
> -    inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration',
> +    if False:
> +        inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration',
>                        *list(cmb))
> 
>   for cmb in list(itertools.product((True, False), repeat=2)):
>       name = ('_' if cmb[0] else '_not_') + 'persistent_'
>       name += ('_' if cmb[1] else '_not_') + 'migbitmap'
> 
> -    inject_test_case(TestDirtyBitmapMigration, name,
> +    if name == '_persistent__migbitmap':
> +        inject_test_case(TestDirtyBitmapMigration, name,
>                        'do_test_migration_resume_source', *list(cmb))
> 
>   if __name__ == '__main__':
> diff --git a/tests/qemu-iotests/169.out b/tests/qemu-iotests/169.out
> index 3a89159833..ae1213e6f8 100644
> --- a/tests/qemu-iotests/169.out
> +++ b/tests/qemu-iotests/169.out
> @@ -1,5 +1,5 @@
> -....................
> +.
>   ----------------------------------------------------------------------
> -Ran 20 tests
> +Ran 1 tests
> 
>   OK
> 
> 
> 
> -- 
> Best regards,
> Vladimir
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Kevin Wolf Jan. 24, 2019, 10:15 a.m. UTC | #11
Am 24.01.2019 um 10:29 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 23.01.2019 18:48, Max Reitz wrote:
> > Hi,
> > 
> > When running 169 in parallel (e.g. like so:
> > 
> > $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> > $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> > $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> > $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> > 
> > in four different shells), I get aborts:
> > 
> > (Often I get segfaults, but that's because of
> > http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
> > feel free to apply the attached patch to make them go away)
> > 
> > 
> > WARNING:qemu:qemu received signal 6:
> > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
> > -mon chardev=mon,mode=control -display none -vga none -qtest
> > unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
> > -nodefaults -machine accel=qtest -drive
> > if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
> > .................E..
> > ======================================================================
> > ERROR:
> > test_do_test_migration_resume_source_not_persistent__not_migbitmap
> > (__main__.TestDirtyBitmapMigration)
> > ----------------------------------------------------------------------
> > Traceback (most recent call last):
> >    File "169", line 206, in <lambda>
> >      setattr(klass, 'test_' + method + name, lambda self: mc(self))
> >    File "169", line 113, in do_test_migration_resume_source
> >      self.check_bitmap(self.vm_a, sha256)
> >    File "169", line 72, in check_bitmap
> >      node='drive0', name='bitmap0')
> >    File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
> >      return self._qmp.cmd(cmd, args=qmp_args)
> >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
> >      return self.cmd_obj(qmp_cmd)
> >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
> >      resp = self.__json_read()
> >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
> > __json_read
> >      data = self.__sockfile.readline()
> >    File "/usr/lib64/python2.7/socket.py", line 451, in readline
> >      data = self._sock.recv(self._rbufsize)
> > error: [Errno 104] Connection reset by peer
> > 
> > ----------------------------------------------------------------------
> > Ran 20 tests
> > 
> > FAILED (errors=1)
> > 
> > 
> > Or:
> > 
> > WARNING:qemu:qemu received signal 6:
> > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> > -mon chardev=mon,mode=control -display none -vga none -qtest
> > unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> > -machine accel=qtest -drive
> > if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> > WARNING:qemu:qemu received signal 6:
> > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> > -mon chardev=mon,mode=control -display none -vga none -qtest
> > unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> > -machine accel=qtest -drive
> > if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> > 
> > ...................F
> > ======================================================================
> > FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
> > (__main__.TestDirtyBitmapMigration)
> > ----------------------------------------------------------------------
> > Traceback (most recent call last):
> >    File "169", line 206, in <lambda>
> >      setattr(klass, 'test_' + method + name, lambda self: mc(self))
> >    File "169", line 125, in do_test_migration_resume_source
> >      self.assertEqual(log, '')
> > AssertionError: "qemu-system-x86_64: invalid runstate transition:
> > 'running' -> 'postmigrate'\n" != ''
> > 
> > ----------------------------------------------------------------------
> > Ran 20 tests
> > 
> > FAILED (failures=1)
> > 
> > 
> > The backtrace always goes like this:
> > 
> > (gdb) bt
> > #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
> > #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
> > #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> > at vl.c:742
> > #3  0x000055a46ebbb1a6 in runstate_set
> > (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
> > #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
> > at migration/migration.c:2972
> > #5  0x000055a46ed39129 in migration_thread
> > (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
> > #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
> > util/qemu-thread-posix.c:502
> > 
> > 
> > #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
> > #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
> > (gdb) frame 2
> > #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> > at vl.c:742
> > 742             abort();
> > (gdb) print current_run_state
> > $1 = RUN_STATE_RUNNING
> > 
> > 
> > Neither of migration or runstates are my strong suite, so I thought I'd
> > report it before diving into it.
> > 
> > Max
> > 
> 
> [...]
> 450556@1548321072.888979:migrate_set_state new state active
> 450556@1548321072.889022:migration_thread_setup_complete
> 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
> 450556@1548321072.988283:migration_bitmap_sync_start
> 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
> 450556@1548321072.988495:migration_thread_low_pending 2048
> migration_completion
> 450556@1548321072.988541:runstate_set current_state 9 new_state 7
> 450556@1548321072.988780:migration_bitmap_sync_start
> 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
> 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
> [1] 450556@1548321072.993237:migrate_set_state new state completed
> 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
> 450556@1548321072.993697:migration_thread_after_loop
> [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
> 450476@1548321072.994461:runstate_set current_state 7 new_state 9
> qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
> 
> 
> Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] and go to RUNNING.
> then in migration thread we go to migration_iteration_finish() and fail to go to POSTMIGRATE.
> (note, that this testcase is about resuming source after migration)
> 
> So, fix for test may be additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETION.
> 
> But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
>   or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?

qmp_cont currently checks for RUN_STATE_INMIGRATE and gives it special
treatment. As a quick hack, doing the same for RUN_STATE_FINISH_MIGRATE
might fix the problem. A better simple fix would probably be a whitelist
of states in which 'cont' may be used, so that the user cannot mess with
other internal states.
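
A minimal sketch of that quick hack, assuming qmp_cont() keeps its current
shape (the surrounding checks are elided and the error message is made up):

void qmp_cont(Error **errp)
{
    /* Sketch only: refuse 'cont' while the migration thread is still
     * finalizing migration, instead of letting the monitor race it. */
    if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
        error_setg(errp, "Migration is still finalizing; wait for the "
                   "postmigrate state before resuming the VM");
        return;
    }

    /* ... existing RUN_STATE_INMIGRATE special case, vm_start(), etc. ... */
}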

However, like the problem that Dave reported a few days ago (migrating
twice leads to a crash because we try to inactivate already inactive
block devices), I think this is a symptom of a larger problem. We're
not taking the state machine seriously enough. One-off checks here and
there are unlikely to ever fully solve the problem of running a command
in a runstate in which it was never supposed to run.

I wonder whether the QAPI schema should have a field 'run-states' for
commands, and by default we would only include states where the VM has
ownership of its resources (e.g. images are activated) and which are not
temporary states that are automatically left, like finish-migrate.

Then the default for commands is to be rejected in "unusual" runstates
where we're not expecting user intervention, and we must explicitly
allow them in the states where they are in fact okay.

Instead of listing every obscure runstate, maybe we should really use
categories of runstates:

1. Running
2. Paused, owns all resources (like disk images)
3. Paused, doesn't own some resources (source VM after migration
   completes, destination before migration completes)
4. Paused temporarily for internal reasons (e.g. finish-migrate,
   restore-vm, save-vm)

Most commands should be okay with 1 and 2, but possibly not 3, and
almost never 4.
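
As a purely illustrative sketch (the RunStateCategory enum and helper below
do not exist in QEMU, and the assignment of individual run states to the
categories is my assumption), the categorization could look roughly like
this:

/* Hypothetical mapping of run states onto the four categories above.  A
 * command's QAPI definition would then declare which categories it
 * tolerates, and the dispatcher would reject everything else by default. */
typedef enum RunStateCategory {
    RUN_CAT_RUNNING,          /* 1. guest is executing */
    RUN_CAT_PAUSED_OWNED,     /* 2. paused, owns all resources */
    RUN_CAT_PAUSED_UNOWNED,   /* 3. paused, some resources handed over */
    RUN_CAT_PAUSED_INTERNAL,  /* 4. transient internal pause */
} RunStateCategory;

static RunStateCategory runstate_category(RunState state)
{
    switch (state) {
    case RUN_STATE_RUNNING:
        return RUN_CAT_RUNNING;
    case RUN_STATE_INMIGRATE:
    case RUN_STATE_POSTMIGRATE:
        return RUN_CAT_PAUSED_UNOWNED;
    case RUN_STATE_FINISH_MIGRATE:
    case RUN_STATE_SAVE_VM:
    case RUN_STATE_RESTORE_VM:
        return RUN_CAT_PAUSED_INTERNAL;
    default:
        return RUN_CAT_PAUSED_OWNED;
    }
}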

Kevin
Vladimir Sementsov-Ogievskiy Jan. 24, 2019, 10:23 a.m. UTC | #12
24.01.2019 13:10, Dr. David Alan Gilbert wrote:
> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>> 24.01.2019 12:29, Vladimir Sementsov-Ogievskiy wrote:
>>> 23.01.2019 18:48, Max Reitz wrote:
>>>> Hi,
>>>>
>>>> When running 169 in parallel (e.g. like so:
>>>>
>>>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
>>>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
>>>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
>>>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
>>>>
>>>> in four different shells), I get aborts:
>>>>
>>>> (Often I get segfaults, but that's because of
>>>> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
>>>> feel free to apply the attached patch to make them go away)
>>>>
>>>>
>>>> WARNING:qemu:qemu received signal 6:
>>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>>> -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
>>>> -mon chardev=mon,mode=control -display none -vga none -qtest
>>>> unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
>>>> -nodefaults -machine accel=qtest -drive
>>>> if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
>>>> .................E..
>>>> ======================================================================
>>>> ERROR:
>>>> test_do_test_migration_resume_source_not_persistent__not_migbitmap
>>>> (__main__.TestDirtyBitmapMigration)
>>>> ----------------------------------------------------------------------
>>>> Traceback (most recent call last):
>>>>     File "169", line 206, in <lambda>
>>>>       setattr(klass, 'test_' + method + name, lambda self: mc(self))
>>>>     File "169", line 113, in do_test_migration_resume_source
>>>>       self.check_bitmap(self.vm_a, sha256)
>>>>     File "169", line 72, in check_bitmap
>>>>       node='drive0', name='bitmap0')
>>>>     File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
>>>>       return self._qmp.cmd(cmd, args=qmp_args)
>>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
>>>>       return self.cmd_obj(qmp_cmd)
>>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
>>>>       resp = self.__json_read()
>>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
>>>> __json_read
>>>>       data = self.__sockfile.readline()
>>>>     File "/usr/lib64/python2.7/socket.py", line 451, in readline
>>>>       data = self._sock.recv(self._rbufsize)
>>>> error: [Errno 104] Connection reset by peer
>>>>
>>>> ----------------------------------------------------------------------
>>>> Ran 20 tests
>>>>
>>>> FAILED (errors=1)
>>>>
>>>>
>>>> Or:
>>>>
>>>> WARNING:qemu:qemu received signal 6:
>>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
>>>> -mon chardev=mon,mode=control -display none -vga none -qtest
>>>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
>>>> -machine accel=qtest -drive
>>>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
>>>> WARNING:qemu:qemu received signal 6:
>>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
>>>> -mon chardev=mon,mode=control -display none -vga none -qtest
>>>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
>>>> -machine accel=qtest -drive
>>>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
>>>>
>>>> ...................F
>>>> ======================================================================
>>>> FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
>>>> (__main__.TestDirtyBitmapMigration)
>>>> ----------------------------------------------------------------------
>>>> Traceback (most recent call last):
>>>>     File "169", line 206, in <lambda>
>>>>       setattr(klass, 'test_' + method + name, lambda self: mc(self))
>>>>     File "169", line 125, in do_test_migration_resume_source
>>>>       self.assertEqual(log, '')
>>>> AssertionError: "qemu-system-x86_64: invalid runstate transition:
>>>> 'running' -> 'postmigrate'\n" != ''
>>>>
>>>> ----------------------------------------------------------------------
>>>> Ran 20 tests
>>>>
>>>> FAILED (failures=1)
>>>>
>>>>
>>>> The backtrace always goes like this:
>>>>
>>>> (gdb) bt
>>>> #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
>>>> #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
>>>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
>>>> at vl.c:742
>>>> #3  0x000055a46ebbb1a6 in runstate_set
>>>> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
>>>> #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
>>>> at migration/migration.c:2972
>>>> #5  0x000055a46ed39129 in migration_thread
>>>> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
>>>> #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
>>>> util/qemu-thread-posix.c:502
>>>>
>>>>
>>>> #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
>>>> #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
>>>> (gdb) frame 2
>>>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
>>>> at vl.c:742
>>>> 742             abort();
>>>> (gdb) print current_run_state
>>>> $1 = RUN_STATE_RUNNING
>>>>
>>>>
>>>> Neither of migration or runstates are my strong suite, so I thought I'd
>>>> report it before diving into it.
>>>>
>>>> Max
>>>>
>>>
>>> [...]
>>> 450556@1548321072.888979:migrate_set_state new state active
>>> 450556@1548321072.889022:migration_thread_setup_complete
>>> 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
>>> 450556@1548321072.988283:migration_bitmap_sync_start
>>> 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
>>> 450556@1548321072.988495:migration_thread_low_pending 2048
>>> migration_completion
>>> 450556@1548321072.988541:runstate_set current_state 9 new_state 7
>>> 450556@1548321072.988780:migration_bitmap_sync_start
>>> 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
>>> 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
>>> [1] 450556@1548321072.993237:migrate_set_state new state completed
>>> 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
>>> 450556@1548321072.993697:migration_thread_after_loop
>>> [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
>>> 450476@1548321072.994461:runstate_set current_state 7 new_state 9
>>> qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
>>>
>>>
>>> Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] and go to RUNNING.
>>> then in migration thread we go to migration_iteration_finish() and fail to go to POSTMIGRATE.
>>> (note, that this testcase is about resuming source after migration)
>>>
>>> So, fix for test may be additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETION.
>>>
>>> But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
>>>    or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?
>>>
>>>
>>>
>>
>>
>> Ok, this seems to fix test for me:
>>
>> diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169
>> index 527aebd0cb..dd98168c8d 100755
>> --- a/tests/qemu-iotests/169
>> +++ b/tests/qemu-iotests/169
>> @@ -102,6 +102,10 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
>>                event = self.vm_a.event_wait('MIGRATION')
>>                if event['data']['status'] == 'completed':
>>                    break
>> +        while True:
>> +            result = self.vm_a.qmp('query-status')
>> +            if (result['return']['status'] == 'postmigrate'):
>> +                break
> 
> Hmm; I can see that it might help; although I still can't quite see
> how the bad transition that was reported can have happened.
> 

It can be seen from my trace above:

1. The migration thread runs migration_completion() and goes to RUN_STATE_FINISH_MIGRATE through vm_stop_force_state(RUN_STATE_FINISH_MIGRATE).
2. migration_completion() does
         migrate_set_state(&s->state, current_active_state,
                             MIGRATION_STATUS_COMPLETED);
    so the MIGRATION 'completed' event is reported.
3. Iotest 169 gets the event, assumes that migration is completely done, and calls qmp_cont.
4. qmp_cont does the transition from RUN_STATE_FINISH_MIGRATE to RUN_STATE_RUNNING.
5. The migration thread then reaches migration_iteration_finish() and attempts the now-invalid transition to POSTMIGRATE.


> 
>>            # test that bitmap is still here
>>            removed = (not migrate_bitmaps) and persistent
>>
>> -
>>
>>
>> And this is the full diff of the test I used, to skip checking log (as I enabled traces) and to skip all other test-cases:
>>
>> (hmm, we definitely need simple way to run one test case from test, without editing it)
>>
>> diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169
>> index 527aebd0cb..dd98168c8d 100755
>> --- a/tests/qemu-iotests/169
>> +++ b/tests/qemu-iotests/169
>> @@ -102,6 +102,10 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
>>                event = self.vm_a.event_wait('MIGRATION')
>>                if event['data']['status'] == 'completed':
>>                    break
>> +        while True:
>> +            result = self.vm_a.qmp('query-status')
>> +            if (result['return']['status'] == 'postmigrate'):
>> +                break
>>
>>            # test that bitmap is still here
>>            removed = (not migrate_bitmaps) and persistent
>> @@ -117,12 +121,12 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
>>
>>            # catch 'Could not reopen qcow2 layer: Bitmap already exists'
>>            # possible error
>> -        log = self.vm_a.get_log()
>> -        log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log)
>> -        log = re.sub(r'^(wrote .* bytes at offset .*\n.*KiB.*ops.*sec.*\n){3}',
>> -                     '', log)
>> -        log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log)
>> -        self.assertEqual(log, '')
>> +        #log = self.vm_a.get_log()
>> +        #log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log)
>> +        #log = re.sub(r'^(wrote .* bytes at offset .*\n.*KiB.*ops.*sec.*\n){3}',
>> +                     #'', log)
>> +        #log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log)
>> +        #self.assertEqual(log, '')
>>
>>            # test that bitmap is still persistent
>>            self.vm_a.launch()
>> @@ -211,14 +215,16 @@ for cmb in list(itertools.product((True, False), repeat=4)):
>>        name += '_online' if cmb[2] else '_offline'
>>        name += '_shared' if cmb[3] else '_nonshared'
>>
>> -    inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration',
>> +    if False:
>> +        inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration',
>>                         *list(cmb))
>>
>>    for cmb in list(itertools.product((True, False), repeat=2)):
>>        name = ('_' if cmb[0] else '_not_') + 'persistent_'
>>        name += ('_' if cmb[1] else '_not_') + 'migbitmap'
>>
>> -    inject_test_case(TestDirtyBitmapMigration, name,
>> +    if name == '_persistent__migbitmap':
>> +        inject_test_case(TestDirtyBitmapMigration, name,
>>                         'do_test_migration_resume_source', *list(cmb))
>>
>>    if __name__ == '__main__':
>> diff --git a/tests/qemu-iotests/169.out b/tests/qemu-iotests/169.out
>> index 3a89159833..ae1213e6f8 100644
>> --- a/tests/qemu-iotests/169.out
>> +++ b/tests/qemu-iotests/169.out
>> @@ -1,5 +1,5 @@
>> -....................
>> +.
>>    ----------------------------------------------------------------------
>> -Ran 20 tests
>> +Ran 1 tests
>>
>>    OK
>>
>>
>>
>> -- 
>> Best regards,
>> Vladimir
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Vladimir Sementsov-Ogievskiy Jan. 24, 2019, 10:32 a.m. UTC | #13
24.01.2019 13:15, Kevin Wolf wrote:
> Am 24.01.2019 um 10:29 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 23.01.2019 18:48, Max Reitz wrote:
>>> Hi,
>>>
>>> When running 169 in parallel (e.g. like so:
>>>
>>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
>>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
>>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
>>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
>>>
>>> in four different shells), I get aborts:
>>>
>>> (Often I get segfaults, but that's because of
>>> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
>>> feel free to apply the attached patch to make them go away)
>>>
>>>
>>> WARNING:qemu:qemu received signal 6:
>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>> -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
>>> -mon chardev=mon,mode=control -display none -vga none -qtest
>>> unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
>>> -nodefaults -machine accel=qtest -drive
>>> if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
>>> .................E..
>>> ======================================================================
>>> ERROR:
>>> test_do_test_migration_resume_source_not_persistent__not_migbitmap
>>> (__main__.TestDirtyBitmapMigration)
>>> ----------------------------------------------------------------------
>>> Traceback (most recent call last):
>>>     File "169", line 206, in <lambda>
>>>       setattr(klass, 'test_' + method + name, lambda self: mc(self))
>>>     File "169", line 113, in do_test_migration_resume_source
>>>       self.check_bitmap(self.vm_a, sha256)
>>>     File "169", line 72, in check_bitmap
>>>       node='drive0', name='bitmap0')
>>>     File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
>>>       return self._qmp.cmd(cmd, args=qmp_args)
>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
>>>       return self.cmd_obj(qmp_cmd)
>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
>>>       resp = self.__json_read()
>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
>>> __json_read
>>>       data = self.__sockfile.readline()
>>>     File "/usr/lib64/python2.7/socket.py", line 451, in readline
>>>       data = self._sock.recv(self._rbufsize)
>>> error: [Errno 104] Connection reset by peer
>>>
>>> ----------------------------------------------------------------------
>>> Ran 20 tests
>>>
>>> FAILED (errors=1)
>>>
>>>
>>> Or:
>>>
>>> WARNING:qemu:qemu received signal 6:
>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
>>> -mon chardev=mon,mode=control -display none -vga none -qtest
>>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
>>> -machine accel=qtest -drive
>>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
>>> WARNING:qemu:qemu received signal 6:
>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
>>> -mon chardev=mon,mode=control -display none -vga none -qtest
>>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
>>> -machine accel=qtest -drive
>>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
>>>
>>> ...................F
>>> ======================================================================
>>> FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
>>> (__main__.TestDirtyBitmapMigration)
>>> ----------------------------------------------------------------------
>>> Traceback (most recent call last):
>>>     File "169", line 206, in <lambda>
>>>       setattr(klass, 'test_' + method + name, lambda self: mc(self))
>>>     File "169", line 125, in do_test_migration_resume_source
>>>       self.assertEqual(log, '')
>>> AssertionError: "qemu-system-x86_64: invalid runstate transition:
>>> 'running' -> 'postmigrate'\n" != ''
>>>
>>> ----------------------------------------------------------------------
>>> Ran 20 tests
>>>
>>> FAILED (failures=1)
>>>
>>>
>>> The backtrace always goes like this:
>>>
>>> (gdb) bt
>>> #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
>>> #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
>>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
>>> at vl.c:742
>>> #3  0x000055a46ebbb1a6 in runstate_set
>>> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
>>> #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
>>> at migration/migration.c:2972
>>> #5  0x000055a46ed39129 in migration_thread
>>> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
>>> #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
>>> util/qemu-thread-posix.c:502
>>>
>>>
>>> #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
>>> #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
>>> (gdb) frame 2
>>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
>>> at vl.c:742
>>> 742             abort();
>>> (gdb) print current_run_state
>>> $1 = RUN_STATE_RUNNING
>>>
>>>
>>> Neither of migration or runstates are my strong suite, so I thought I'd
>>> report it before diving into it.
>>>
>>> Max
>>>
>>
>> [...]
>> 450556@1548321072.888979:migrate_set_state new state active
>> 450556@1548321072.889022:migration_thread_setup_complete
>> 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
>> 450556@1548321072.988283:migration_bitmap_sync_start
>> 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
>> 450556@1548321072.988495:migration_thread_low_pending 2048
>> migration_completion
>> 450556@1548321072.988541:runstate_set current_state 9 new_state 7
>> 450556@1548321072.988780:migration_bitmap_sync_start
>> 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
>> 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
>> [1] 450556@1548321072.993237:migrate_set_state new state completed
>> 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
>> 450556@1548321072.993697:migration_thread_after_loop
>> [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
>> 450476@1548321072.994461:runstate_set current_state 7 new_state 9
>> qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
>>
>>
>> Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] and go to RUNNING.
>> then in migration thread we go to migration_iteration_finish() and fail to go to POSTMIGRATE.
>> (note, that this testcase is about resuming source after migration)
>>
>> So, fix for test may be additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETION.
>>
>> But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
>>    or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?
> 
> qmp_cont currently checks for RUN_STATE_INMIGRATE and gives it special
> treatment. As a quick hack, doing the same for RUN_STATE_FINISH_MIGRATE
> might fix the problem.
> A better simple fix would possibly include a
> whitelist of states where you can use 'cont' to avoid that the user
> messes with other internal states.
> 
> However, like the problem that Dave reported a few days ago (migrating
> twice leads to a crash because we try to inactivate already inactive
> block devices), I think this is a symptom of a larger problem. We're
> not taking the state machine serious enough. One-off checks here and
> there are unlikely to ever fully solve the problem of running a command
> in a runstate in which it was never supposed to run.
> 
> I wonder whether the QAPI schema should have a field 'run-states' for
> commands, and by default we would only include states where the VM has
> ownership of its resources (e.g. images are activated) and which are not
> temporary states that are automatically left, like finish-migrate.
> 
> Then the default for commands is to be rejected in "unusual" runstates
> where we're not expecting user intervention, and we must explicitly
> allow them if they are okay, in fact.
> 
> Instead of listing every obscure runstate, maybe we should really use
> categories of runstates instead:
> 
> 1. Running
> 2. Paused, owns all resources (like disk images)
> 3. Paused, doesn't own some resources (source VM after migration
>     completes, destination before migration completes)
> 4. Paused temporarily for internal reasons (e.g. finish-migrate,
>     restore-vm, save-vm)
> 
> Most commands should be okay with 1 and 2, but possibly not 3, and
> almost never 4.

And then we need a mechanism to block state transitions while operations
that don't support the new state are ongoing.
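
A very rough, purely hypothetical sketch of what such a blocker mechanism
could look like (none of these names exist in QEMU, and synchronization
between the monitor and the migration thread is deliberately left out):

/* Operations that cannot cope with a runstate change would hold a blocker;
 * runstate_set() (or at least qmp_cont) would refuse or defer while one is
 * held.  A bare counter only; real code would need proper locking. */
static unsigned runstate_transition_blockers;

void runstate_transition_block(void)
{
    runstate_transition_blockers++;
}

void runstate_transition_unblock(void)
{
    assert(runstate_transition_blockers > 0);
    runstate_transition_blockers--;
}

bool runstate_transition_blocked(Error **errp)
{
    if (runstate_transition_blockers > 0) {
        error_setg(errp, "an internal operation is in progress; "
                   "the run state cannot be changed right now");
        return true;
    }
    return false;
}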

> 
> Kevin
>
Kevin Wolf Jan. 24, 2019, 10:45 a.m. UTC | #14
Am 24.01.2019 um 11:32 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 24.01.2019 13:15, Kevin Wolf wrote:
> > Am 24.01.2019 um 10:29 hat Vladimir Sementsov-Ogievskiy geschrieben:
> >> 23.01.2019 18:48, Max Reitz wrote:
> >>> Hi,
> >>>
> >>> When running 169 in parallel (e.g. like so:
> >>>
> >>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> >>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> >>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> >>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> >>>
> >>> in four different shells), I get aborts:
> >>>
> >>> (Often I get segfaults, but that's because of
> >>> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
> >>> feel free to apply the attached patch to make them go away)
> >>>
> >>>
> >>> WARNING:qemu:qemu received signal 6:
> >>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> >>> -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
> >>> -mon chardev=mon,mode=control -display none -vga none -qtest
> >>> unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
> >>> -nodefaults -machine accel=qtest -drive
> >>> if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
> >>> .................E..
> >>> ======================================================================
> >>> ERROR:
> >>> test_do_test_migration_resume_source_not_persistent__not_migbitmap
> >>> (__main__.TestDirtyBitmapMigration)
> >>> ----------------------------------------------------------------------
> >>> Traceback (most recent call last):
> >>>     File "169", line 206, in <lambda>
> >>>       setattr(klass, 'test_' + method + name, lambda self: mc(self))
> >>>     File "169", line 113, in do_test_migration_resume_source
> >>>       self.check_bitmap(self.vm_a, sha256)
> >>>     File "169", line 72, in check_bitmap
> >>>       node='drive0', name='bitmap0')
> >>>     File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
> >>>       return self._qmp.cmd(cmd, args=qmp_args)
> >>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
> >>>       return self.cmd_obj(qmp_cmd)
> >>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
> >>>       resp = self.__json_read()
> >>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
> >>> __json_read
> >>>       data = self.__sockfile.readline()
> >>>     File "/usr/lib64/python2.7/socket.py", line 451, in readline
> >>>       data = self._sock.recv(self._rbufsize)
> >>> error: [Errno 104] Connection reset by peer
> >>>
> >>> ----------------------------------------------------------------------
> >>> Ran 20 tests
> >>>
> >>> FAILED (errors=1)
> >>>
> >>>
> >>> Or:
> >>>
> >>> WARNING:qemu:qemu received signal 6:
> >>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> >>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> >>> -mon chardev=mon,mode=control -display none -vga none -qtest
> >>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> >>> -machine accel=qtest -drive
> >>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> >>> WARNING:qemu:qemu received signal 6:
> >>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> >>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> >>> -mon chardev=mon,mode=control -display none -vga none -qtest
> >>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> >>> -machine accel=qtest -drive
> >>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> >>>
> >>> ...................F
> >>> ======================================================================
> >>> FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
> >>> (__main__.TestDirtyBitmapMigration)
> >>> ----------------------------------------------------------------------
> >>> Traceback (most recent call last):
> >>>     File "169", line 206, in <lambda>
> >>>       setattr(klass, 'test_' + method + name, lambda self: mc(self))
> >>>     File "169", line 125, in do_test_migration_resume_source
> >>>       self.assertEqual(log, '')
> >>> AssertionError: "qemu-system-x86_64: invalid runstate transition:
> >>> 'running' -> 'postmigrate'\n" != ''
> >>>
> >>> ----------------------------------------------------------------------
> >>> Ran 20 tests
> >>>
> >>> FAILED (failures=1)
> >>>
> >>>
> >>> The backtrace always goes like this:
> >>>
> >>> (gdb) bt
> >>> #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
> >>> #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
> >>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> >>> at vl.c:742
> >>> #3  0x000055a46ebbb1a6 in runstate_set
> >>> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
> >>> #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
> >>> at migration/migration.c:2972
> >>> #5  0x000055a46ed39129 in migration_thread
> >>> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
> >>> #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
> >>> util/qemu-thread-posix.c:502
> >>>
> >>>
> >>> #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
> >>> #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
> >>> (gdb) frame 2
> >>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> >>> at vl.c:742
> >>> 742             abort();
> >>> (gdb) print current_run_state
> >>> $1 = RUN_STATE_RUNNING
> >>>
> >>>
> >>> Neither of migration or runstates are my strong suite, so I thought I'd
> >>> report it before diving into it.
> >>>
> >>> Max
> >>>
> >>
> >> [...]
> >> 450556@1548321072.888979:migrate_set_state new state active
> >> 450556@1548321072.889022:migration_thread_setup_complete
> >> 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
> >> 450556@1548321072.988283:migration_bitmap_sync_start
> >> 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
> >> 450556@1548321072.988495:migration_thread_low_pending 2048
> >> migration_completion
> >> 450556@1548321072.988541:runstate_set current_state 9 new_state 7
> >> 450556@1548321072.988780:migration_bitmap_sync_start
> >> 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
> >> 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
> >> [1] 450556@1548321072.993237:migrate_set_state new state completed
> >> 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
> >> 450556@1548321072.993697:migration_thread_after_loop
> >> [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
> >> 450476@1548321072.994461:runstate_set current_state 7 new_state 9
> >> qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
> >>
> >>
> >> Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] and go to RUNNING.
> >> then in migration thread we go to migration_iteration_finish() and fail to go to POSTMIGRATE.
> >> (note, that this testcase is about resuming source after migration)
> >>
> >> So, fix for test may be additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETION.
> >>
> >> But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
> >>    or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?
> > 
> > qmp_cont currently checks for RUN_STATE_INMIGRATE and gives it special
> > treatment. As a quick hack, doing the same for RUN_STATE_FINISH_MIGRATE
> > might fix the problem.
> > A better simple fix would possibly include a
> > whitelist of states where you can use 'cont' to avoid that the user
> > messes with other internal states.
> > 
> > However, like the problem that Dave reported a few days ago (migrating
> > twice leads to a crash because we try to inactivate already inactive
> > block devices), I think this is a symptom of a larger problem. We're
> > not taking the state machine serious enough. One-off checks here and
> > there are unlikely to ever fully solve the problem of running a command
> > in a runstate in which it was never supposed to run.
> > 
> > I wonder whether the QAPI schema should have a field 'run-states' for
> > commands, and by default we would only include states where the VM has
> > ownership of its resources (e.g. images are activated) and which are not
> > temporary states that are automatically left, like finish-migrate.
> > 
> > Then the default for commands is to be rejected in "unusual" runstates
> > where we're not expecting user intervention, and we must explicitly
> > allow them if they are okay, in fact.
> > 
> > Instead of listing every obscure runstate, maybe we should really use
> > categories of runstates instead:
> > 
> > 1. Running
> > 2. Paused, owns all resources (like disk images)
> > 3. Paused, doesn't own some resources (source VM after migration
> >     completes, destination before migration completes)
> > 4. Paused temporarily for internal reasons (e.g. finish-migrate,
> >     restore-vm, save-vm)
> > 
> > Most commands should be okay with 1 and 2, but possibly not 3, and
> > almost never 4.
> 
> And then we need a mechanism to block state transition while some
> operations which don't support new state are ongoing.

This essentially means not allowing those QMP commands that could cause
a state transition (primarily cont/stop, but probably some more).

State transitions that happen automatically should in theory always be
correct; everything else is a bug, and abort() is the right way to stop
them (like we already do).
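
For illustration only, here is a minimal sketch of what such a whitelist
gate for 'cont' could look like; the set of allowed states below is a
guess made up for the example, not a reviewed list:

    /* Sketch: run states in which 'cont' is assumed to be safe
     * (illustrative only, not a reviewed set). */
    static const RunState cont_ok_states[] = {
        RUN_STATE_DEBUG,
        RUN_STATE_PAUSED,
        RUN_STATE_SUSPENDED,
        RUN_STATE_POSTMIGRATE,
    };

    static bool cont_allowed(void)
    {
        int i;

        for (i = 0; i < ARRAY_SIZE(cont_ok_states); i++) {
            if (runstate_check(cont_ok_states[i])) {
                return true;
            }
        }
        return false;
    }

    /* In qmp_cont(), alongside the existing special cases: */
    if (!cont_allowed()) {
        error_setg(errp, "'cont' is not allowed in the current run state");
        return;
    }

With something like this, finish-migrate would be rejected simply
because it is not on the list, instead of needing a dedicated check.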

Kevin
Dr. David Alan Gilbert Jan. 24, 2019, 10:49 a.m. UTC | #15
* Kevin Wolf (kwolf@redhat.com) wrote:
> Am 24.01.2019 um 10:29 hat Vladimir Sementsov-Ogievskiy geschrieben:
> > 23.01.2019 18:48, Max Reitz wrote:
> > > Hi,
> > > 
> > > When running 169 in parallel (e.g. like so:
> > > 
> > > $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> > > $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> > > $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> > > $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> > > 
> > > in four different shells), I get aborts:
> > > 
> > > (Often I get segfaults, but that's because of
> > > http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
> > > feel free to apply the attached patch to make them go away)
> > > 
> > > 
> > > WARNING:qemu:qemu received signal 6:
> > > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > > -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
> > > -mon chardev=mon,mode=control -display none -vga none -qtest
> > > unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
> > > -nodefaults -machine accel=qtest -drive
> > > if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
> > > .................E..
> > > ======================================================================
> > > ERROR:
> > > test_do_test_migration_resume_source_not_persistent__not_migbitmap
> > > (__main__.TestDirtyBitmapMigration)
> > > ----------------------------------------------------------------------
> > > Traceback (most recent call last):
> > >    File "169", line 206, in <lambda>
> > >      setattr(klass, 'test_' + method + name, lambda self: mc(self))
> > >    File "169", line 113, in do_test_migration_resume_source
> > >      self.check_bitmap(self.vm_a, sha256)
> > >    File "169", line 72, in check_bitmap
> > >      node='drive0', name='bitmap0')
> > >    File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
> > >      return self._qmp.cmd(cmd, args=qmp_args)
> > >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
> > >      return self.cmd_obj(qmp_cmd)
> > >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
> > >      resp = self.__json_read()
> > >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
> > > __json_read
> > >      data = self.__sockfile.readline()
> > >    File "/usr/lib64/python2.7/socket.py", line 451, in readline
> > >      data = self._sock.recv(self._rbufsize)
> > > error: [Errno 104] Connection reset by peer
> > > 
> > > ----------------------------------------------------------------------
> > > Ran 20 tests
> > > 
> > > FAILED (errors=1)
> > > 
> > > 
> > > Or:
> > > 
> > > WARNING:qemu:qemu received signal 6:
> > > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > > -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> > > -mon chardev=mon,mode=control -display none -vga none -qtest
> > > unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> > > -machine accel=qtest -drive
> > > if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> > > WARNING:qemu:qemu received signal 6:
> > > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > > -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> > > -mon chardev=mon,mode=control -display none -vga none -qtest
> > > unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> > > -machine accel=qtest -drive
> > > if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> > > 
> > > ...................F
> > > ======================================================================
> > > FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
> > > (__main__.TestDirtyBitmapMigration)
> > > ----------------------------------------------------------------------
> > > Traceback (most recent call last):
> > >    File "169", line 206, in <lambda>
> > >      setattr(klass, 'test_' + method + name, lambda self: mc(self))
> > >    File "169", line 125, in do_test_migration_resume_source
> > >      self.assertEqual(log, '')
> > > AssertionError: "qemu-system-x86_64: invalid runstate transition:
> > > 'running' -> 'postmigrate'\n" != ''
> > > 
> > > ----------------------------------------------------------------------
> > > Ran 20 tests
> > > 
> > > FAILED (failures=1)
> > > 
> > > 
> > > The backtrace always goes like this:
> > > 
> > > (gdb) bt
> > > #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
> > > #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
> > > #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> > > at vl.c:742
> > > #3  0x000055a46ebbb1a6 in runstate_set
> > > (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
> > > #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
> > > at migration/migration.c:2972
> > > #5  0x000055a46ed39129 in migration_thread
> > > (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
> > > #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
> > > util/qemu-thread-posix.c:502
> > > 
> > > 
> > > #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
> > > #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
> > > (gdb) frame 2
> > > #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> > > at vl.c:742
> > > 742             abort();
> > > (gdb) print current_run_state
> > > $1 = RUN_STATE_RUNNING
> > > 
> > > 
> > > Neither of migration or runstates are my strong suite, so I thought I'd
> > > report it before diving into it.
> > > 
> > > Max
> > > 
> > 
> > [...]
> > 450556@1548321072.888979:migrate_set_state new state active
> > 450556@1548321072.889022:migration_thread_setup_complete
> > 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
> > 450556@1548321072.988283:migration_bitmap_sync_start
> > 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
> > 450556@1548321072.988495:migration_thread_low_pending 2048
> > migration_completion
> > 450556@1548321072.988541:runstate_set current_state 9 new_state 7
> > 450556@1548321072.988780:migration_bitmap_sync_start
> > 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
> > 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
> > [1] 450556@1548321072.993237:migrate_set_state new state completed
> > 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
> > 450556@1548321072.993697:migration_thread_after_loop
> > [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
> > 450476@1548321072.994461:runstate_set current_state 7 new_state 9
> > qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
> > 
> > 
> > Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] and go to RUNNING.
> > then in migration thread we go to migration_iteration_finish() and fail to go to POSTMIGRATE.
> > (note, that this testcase is about resuming source after migration)

Ah that makes sense as to why we failed.

> > So, fix for test may be additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETION.
> > 
> > But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
> >   or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?
> 
> qmp_cont currently checks for RUN_STATE_INMIGRATE and gives it special
> treatment. As a quick hack, doing the same for RUN_STATE_FINISH_MIGRATE
> might fix the problem. A better simple fix would possibly include a
> whitelist of states where you can use 'cont' to avoid that the user
> messes with other internal states.
> 
> However, like the problem that Dave reported a few days ago (migrating
> twice leads to a crash because we try to inactivate already inactive
> block devices), I think this is a symptom of a larger problem. We're
> not taking the state machine serious enough. One-off checks here and
> there are unlikely to ever fully solve the problem of running a command
> in a runstate in which it was never supposed to run.
> 
> I wonder whether the QAPI schema should have a field 'run-states' for
> commands, and by default we would only include states where the VM has
> ownership of its resources (e.g. images are activated) and which are not
> temporary states that are automatically left, like finish-migrate.
> 
> Then the default for commands is to be rejected in "unusual" runstates
> where we're not expecting user intervention, and we must explicitly
> allow them if they are okay, in fact.
> 
> Instead of listing every obscure runstate, maybe we should really use
> categories of runstates instead:
> 
> 1. Running
> 2. Paused, owns all resources (like disk images)
> 3. Paused, doesn't own some resources (source VM after migration
>    completes, destination before migration completes)
> 4. Paused temporarily for internal reasons (e.g. finish-migrate,
>    restore-vm, save-vm)
> 
> Most commands should be okay with 1 and 2, but possibly not 3, and
> almost never 4.

The problem is it's tricky to get those groupings right and to figure
out how they all interact.  For example it's perfectly fine for most
query type commands to run in (3) and (4), and you need a bunch of
commands to let you recover from failures that might leave you in (3)
and cont is one of those that you might use after a particular form
of failed migration.

Dave

> Kevin
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert Jan. 24, 2019, 10:52 a.m. UTC | #16
* Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> 24.01.2019 13:10, Dr. David Alan Gilbert wrote:
> > * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> >> 24.01.2019 12:29, Vladimir Sementsov-Ogievskiy wrote:
> >>> 23.01.2019 18:48, Max Reitz wrote:
> >>>> Hi,
> >>>>
> >>>> When running 169 in parallel (e.g. like so:
> >>>>
> >>>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> >>>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> >>>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> >>>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> >>>>
> >>>> in four different shells), I get aborts:
> >>>>
> >>>> (Often I get segfaults, but that's because of
> >>>> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
> >>>> feel free to apply the attached patch to make them go away)
> >>>>
> >>>>
> >>>> WARNING:qemu:qemu received signal 6:
> >>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> >>>> -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
> >>>> -mon chardev=mon,mode=control -display none -vga none -qtest
> >>>> unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
> >>>> -nodefaults -machine accel=qtest -drive
> >>>> if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
> >>>> .................E..
> >>>> ======================================================================
> >>>> ERROR:
> >>>> test_do_test_migration_resume_source_not_persistent__not_migbitmap
> >>>> (__main__.TestDirtyBitmapMigration)
> >>>> ----------------------------------------------------------------------
> >>>> Traceback (most recent call last):
> >>>>     File "169", line 206, in <lambda>
> >>>>       setattr(klass, 'test_' + method + name, lambda self: mc(self))
> >>>>     File "169", line 113, in do_test_migration_resume_source
> >>>>       self.check_bitmap(self.vm_a, sha256)
> >>>>     File "169", line 72, in check_bitmap
> >>>>       node='drive0', name='bitmap0')
> >>>>     File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
> >>>>       return self._qmp.cmd(cmd, args=qmp_args)
> >>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
> >>>>       return self.cmd_obj(qmp_cmd)
> >>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
> >>>>       resp = self.__json_read()
> >>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
> >>>> __json_read
> >>>>       data = self.__sockfile.readline()
> >>>>     File "/usr/lib64/python2.7/socket.py", line 451, in readline
> >>>>       data = self._sock.recv(self._rbufsize)
> >>>> error: [Errno 104] Connection reset by peer
> >>>>
> >>>> ----------------------------------------------------------------------
> >>>> Ran 20 tests
> >>>>
> >>>> FAILED (errors=1)
> >>>>
> >>>>
> >>>> Or:
> >>>>
> >>>> WARNING:qemu:qemu received signal 6:
> >>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> >>>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> >>>> -mon chardev=mon,mode=control -display none -vga none -qtest
> >>>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> >>>> -machine accel=qtest -drive
> >>>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> >>>> WARNING:qemu:qemu received signal 6:
> >>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> >>>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> >>>> -mon chardev=mon,mode=control -display none -vga none -qtest
> >>>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> >>>> -machine accel=qtest -drive
> >>>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> >>>>
> >>>> ...................F
> >>>> ======================================================================
> >>>> FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
> >>>> (__main__.TestDirtyBitmapMigration)
> >>>> ----------------------------------------------------------------------
> >>>> Traceback (most recent call last):
> >>>>     File "169", line 206, in <lambda>
> >>>>       setattr(klass, 'test_' + method + name, lambda self: mc(self))
> >>>>     File "169", line 125, in do_test_migration_resume_source
> >>>>       self.assertEqual(log, '')
> >>>> AssertionError: "qemu-system-x86_64: invalid runstate transition:
> >>>> 'running' -> 'postmigrate'\n" != ''
> >>>>
> >>>> ----------------------------------------------------------------------
> >>>> Ran 20 tests
> >>>>
> >>>> FAILED (failures=1)
> >>>>
> >>>>
> >>>> The backtrace always goes like this:
> >>>>
> >>>> (gdb) bt
> >>>> #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
> >>>> #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
> >>>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> >>>> at vl.c:742
> >>>> #3  0x000055a46ebbb1a6 in runstate_set
> >>>> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
> >>>> #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
> >>>> at migration/migration.c:2972
> >>>> #5  0x000055a46ed39129 in migration_thread
> >>>> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
> >>>> #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
> >>>> util/qemu-thread-posix.c:502
> >>>>
> >>>>
> >>>> #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
> >>>> #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
> >>>> (gdb) frame 2
> >>>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> >>>> at vl.c:742
> >>>> 742             abort();
> >>>> (gdb) print current_run_state
> >>>> $1 = RUN_STATE_RUNNING
> >>>>
> >>>>
> >>>> Neither of migration or runstates are my strong suite, so I thought I'd
> >>>> report it before diving into it.
> >>>>
> >>>> Max
> >>>>
> >>>
> >>> [...]
> >>> 450556@1548321072.888979:migrate_set_state new state active
> >>> 450556@1548321072.889022:migration_thread_setup_complete
> >>> 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
> >>> 450556@1548321072.988283:migration_bitmap_sync_start
> >>> 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
> >>> 450556@1548321072.988495:migration_thread_low_pending 2048
> >>> migration_completion
> >>> 450556@1548321072.988541:runstate_set current_state 9 new_state 7
> >>> 450556@1548321072.988780:migration_bitmap_sync_start
> >>> 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
> >>> 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
> >>> [1] 450556@1548321072.993237:migrate_set_state new state completed
> >>> 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
> >>> 450556@1548321072.993697:migration_thread_after_loop
> >>> [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
> >>> 450476@1548321072.994461:runstate_set current_state 7 new_state 9
> >>> qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
> >>>
> >>>
> >>> Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] and go to RUNNING.
> >>> then in migration thread we go to migration_iteration_finish() and fail to go to POSTMIGRATE.
> >>> (note, that this testcase is about resuming source after migration)
> >>>
> >>> So, fix for test may be additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETION.
> >>>
> >>> But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
> >>>    or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?
> >>>
> >>>
> >>>
> >>
> >>
> >> Ok, this seems to fix test for me:
> >>
> >> diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169
> >> index 527aebd0cb..dd98168c8d 100755
> >> --- a/tests/qemu-iotests/169
> >> +++ b/tests/qemu-iotests/169
> >> @@ -102,6 +102,10 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
> >>                event = self.vm_a.event_wait('MIGRATION')
> >>                if event['data']['status'] == 'completed':
> >>                    break
> >> +        while True:
> >> +            result = self.vm_a.qmp('query-status')
> >> +            if (result['return']['status'] == 'postmigrate'):
> >> +                break
> > 
> > Hmm; I can see that it might help; although I still can't quite see
> > how the bad transition that was reported can have happened.
> > 
> 
> It can be assumed from my trace above:
> 
> 1. migration thread do migratioN_completion and goes to RUN_STATE_FINISH_MIGRATE through vm_stop_force_state(RUN_STATE_FINISH_MIGRATE)
> 2. migration_completion do
>          migrate_set_state(&s->state, current_active_state,
>                              MIGRATION_STATUS_COMPLETED);
>     and MIGRATION COMPLETED event is reported, so
> 3. 169 iotest gets event, and think that all is done about migration. And it call qmp_cont
> 4. qmp_cont do transition from RUN_STATE_FINISH_MIGRATE to RUN_STATE_RUNNING
> 5. migration thread goes to migration_iteration_finish and do invalid transition

Yes, agreed.

Dave

> 
> > 
> >>            # test that bitmap is still here
> >>            removed = (not migrate_bitmaps) and persistent
> >>
> >> -
> >>
> >>
> >> And this is the full diff of the test I used, to skip checking log (as I enabled traces) and to skip all other test-cases:
> >>
> >> (hmm, we definitely need simple way to run one test case from test, without editing it)
> >>
> >> diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169
> >> index 527aebd0cb..dd98168c8d 100755
> >> --- a/tests/qemu-iotests/169
> >> +++ b/tests/qemu-iotests/169
> >> @@ -102,6 +102,10 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
> >>                event = self.vm_a.event_wait('MIGRATION')
> >>                if event['data']['status'] == 'completed':
> >>                    break
> >> +        while True:
> >> +            result = self.vm_a.qmp('query-status')
> >> +            if (result['return']['status'] == 'postmigrate'):
> >> +                break
> >>
> >>            # test that bitmap is still here
> >>            removed = (not migrate_bitmaps) and persistent
> >> @@ -117,12 +121,12 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase):
> >>
> >>            # catch 'Could not reopen qcow2 layer: Bitmap already exists'
> >>            # possible error
> >> -        log = self.vm_a.get_log()
> >> -        log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log)
> >> -        log = re.sub(r'^(wrote .* bytes at offset .*\n.*KiB.*ops.*sec.*\n){3}',
> >> -                     '', log)
> >> -        log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log)
> >> -        self.assertEqual(log, '')
> >> +        #log = self.vm_a.get_log()
> >> +        #log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log)
> >> +        #log = re.sub(r'^(wrote .* bytes at offset .*\n.*KiB.*ops.*sec.*\n){3}',
> >> +                     #'', log)
> >> +        #log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log)
> >> +        #self.assertEqual(log, '')
> >>
> >>            # test that bitmap is still persistent
> >>            self.vm_a.launch()
> >> @@ -211,14 +215,16 @@ for cmb in list(itertools.product((True, False), repeat=4)):
> >>        name += '_online' if cmb[2] else '_offline'
> >>        name += '_shared' if cmb[3] else '_nonshared'
> >>
> >> -    inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration',
> >> +    if False:
> >> +        inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration',
> >>                         *list(cmb))
> >>
> >>    for cmb in list(itertools.product((True, False), repeat=2)):
> >>        name = ('_' if cmb[0] else '_not_') + 'persistent_'
> >>        name += ('_' if cmb[1] else '_not_') + 'migbitmap'
> >>
> >> -    inject_test_case(TestDirtyBitmapMigration, name,
> >> +    if name == '_persistent__migbitmap':
> >> +        inject_test_case(TestDirtyBitmapMigration, name,
> >>                         'do_test_migration_resume_source', *list(cmb))
> >>
> >>    if __name__ == '__main__':
> >> diff --git a/tests/qemu-iotests/169.out b/tests/qemu-iotests/169.out
> >> index 3a89159833..ae1213e6f8 100644
> >> --- a/tests/qemu-iotests/169.out
> >> +++ b/tests/qemu-iotests/169.out
> >> @@ -1,5 +1,5 @@
> >> -....................
> >> +.
> >>    ----------------------------------------------------------------------
> >> -Ran 20 tests
> >> +Ran 1 tests
> >>
> >>    OK
> >>
> >>
> >>
> >> -- 
> >> Best regards,
> >> Vladimir
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > 
> 
> 
> -- 
> Best regards,
> Vladimir
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Vladimir Sementsov-Ogievskiy Jan. 24, 2019, 11:53 a.m. UTC | #17
24.01.2019 13:15, Kevin Wolf wrote:
> Am 24.01.2019 um 10:29 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 23.01.2019 18:48, Max Reitz wrote:
>>> Hi,
>>>
>>> When running 169 in parallel (e.g. like so:
>>>
>>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
>>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
>>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
>>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
>>>
>>> in four different shells), I get aborts:
>>>
>>> (Often I get segfaults, but that's because of
>>> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
>>> feel free to apply the attached patch to make them go away)
>>>
>>>
>>> WARNING:qemu:qemu received signal 6:
>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>> -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
>>> -mon chardev=mon,mode=control -display none -vga none -qtest
>>> unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
>>> -nodefaults -machine accel=qtest -drive
>>> if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
>>> .................E..
>>> ======================================================================
>>> ERROR:
>>> test_do_test_migration_resume_source_not_persistent__not_migbitmap
>>> (__main__.TestDirtyBitmapMigration)
>>> ----------------------------------------------------------------------
>>> Traceback (most recent call last):
>>>     File "169", line 206, in <lambda>
>>>       setattr(klass, 'test_' + method + name, lambda self: mc(self))
>>>     File "169", line 113, in do_test_migration_resume_source
>>>       self.check_bitmap(self.vm_a, sha256)
>>>     File "169", line 72, in check_bitmap
>>>       node='drive0', name='bitmap0')
>>>     File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
>>>       return self._qmp.cmd(cmd, args=qmp_args)
>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
>>>       return self.cmd_obj(qmp_cmd)
>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
>>>       resp = self.__json_read()
>>>     File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
>>> __json_read
>>>       data = self.__sockfile.readline()
>>>     File "/usr/lib64/python2.7/socket.py", line 451, in readline
>>>       data = self._sock.recv(self._rbufsize)
>>> error: [Errno 104] Connection reset by peer
>>>
>>> ----------------------------------------------------------------------
>>> Ran 20 tests
>>>
>>> FAILED (errors=1)
>>>
>>>
>>> Or:
>>>
>>> WARNING:qemu:qemu received signal 6:
>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
>>> -mon chardev=mon,mode=control -display none -vga none -qtest
>>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
>>> -machine accel=qtest -drive
>>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
>>> WARNING:qemu:qemu received signal 6:
>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
>>> -mon chardev=mon,mode=control -display none -vga none -qtest
>>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
>>> -machine accel=qtest -drive
>>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
>>>
>>> ...................F
>>> ======================================================================
>>> FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
>>> (__main__.TestDirtyBitmapMigration)
>>> ----------------------------------------------------------------------
>>> Traceback (most recent call last):
>>>     File "169", line 206, in <lambda>
>>>       setattr(klass, 'test_' + method + name, lambda self: mc(self))
>>>     File "169", line 125, in do_test_migration_resume_source
>>>       self.assertEqual(log, '')
>>> AssertionError: "qemu-system-x86_64: invalid runstate transition:
>>> 'running' -> 'postmigrate'\n" != ''
>>>
>>> ----------------------------------------------------------------------
>>> Ran 20 tests
>>>
>>> FAILED (failures=1)
>>>
>>>
>>> The backtrace always goes like this:
>>>
>>> (gdb) bt
>>> #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
>>> #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
>>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
>>> at vl.c:742
>>> #3  0x000055a46ebbb1a6 in runstate_set
>>> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
>>> #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
>>> at migration/migration.c:2972
>>> #5  0x000055a46ed39129 in migration_thread
>>> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
>>> #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
>>> util/qemu-thread-posix.c:502
>>>
>>>
>>> #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
>>> #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
>>> (gdb) frame 2
>>> #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
>>> at vl.c:742
>>> 742             abort();
>>> (gdb) print current_run_state
>>> $1 = RUN_STATE_RUNNING
>>>
>>>
>>> Neither of migration or runstates are my strong suite, so I thought I'd
>>> report it before diving into it.
>>>
>>> Max
>>>
>>
>> [...]
>> 450556@1548321072.888979:migrate_set_state new state active
>> 450556@1548321072.889022:migration_thread_setup_complete
>> 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
>> 450556@1548321072.988283:migration_bitmap_sync_start
>> 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
>> 450556@1548321072.988495:migration_thread_low_pending 2048
>> migration_completion
>> 450556@1548321072.988541:runstate_set current_state 9 new_state 7
>> 450556@1548321072.988780:migration_bitmap_sync_start
>> 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
>> 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
>> [1] 450556@1548321072.993237:migrate_set_state new state completed
>> 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
>> 450556@1548321072.993697:migration_thread_after_loop
>> [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
>> 450476@1548321072.994461:runstate_set current_state 7 new_state 9
>> qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
>>
>>
>> Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] and go to RUNNING.
>> then in migration thread we go to migration_iteration_finish() and fail to go to POSTMIGRATE.
>> (note, that this testcase is about resuming source after migration)
>>
>> So, fix for test may be additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETION.
>>
>> But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
>>    or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?
> 
> qmp_cont currently checks for RUN_STATE_INMIGRATE and gives it special
> treatment. As a quick hack, doing the same for RUN_STATE_FINISH_MIGRATE
> might fix the problem.

So, I think it's a good thing to do right now, and I will do it.
Hmm, done.
Interesting, after applying
--- a/qmp.c
+++ b/qmp.c
@@ -156,6 +156,9 @@ void qmp_cont(Error **errp)
          return;
      } else if (runstate_check(RUN_STATE_SUSPENDED)) {
          return;
+    } else if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
+        error_setg(errp, "Migration is not finalized yet");
+        return;
      }

      for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {


I got a new failure:

@@ -1,5 +1,22 @@
-....................
+WARNING:qemu:qemu received signal 6: /work/src/qemu/reitz-migration-bug/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/ramdisk/t3/tmplN68au/qemua-917541-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/ramdisk/t3/qemua-917541-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/ramdisk/t3/disk_a,format=qcow2,cache=writeback
+E...................
+======================================================================
+ERROR: test_do_test_migration_not_persistent__migbitmap__offline_nonshared (__main__.TestDirtyBitmapMigration)
+----------------------------------------------------------------------
+Traceback (most recent call last):
+  File "169", line 207, in <lambda>
+    setattr(klass, 'test_' + method + name, lambda self: mc(self))
+  File "169", line 167, in do_test_migration
+    event = self.vm_a.event_wait('MIGRATION')
+  File "/work/src/qemu/reitz-migration-bug/tests/qemu-iotests/../../scripts/qemu.py", line 436, in eventwait
+    event = self._qmp.pull_event(wait=timeout)
+  File "/work/src/qemu/reitz-migration-bug/tests/qemu-iotests/../../scripts/qmp/qmp.py", line 216, in pul_event
+    self.__get_events(wait)
+  File "/work/src/qemu/reitz-migration-bug/tests/qemu-iotests/../../scripts/qmp/qmp.py", line 124, in __et_events
+    raise QMPTimeoutError("Timeout waiting for event")
+QMPTimeoutError: Timeout waiting for event


#0  0x00007f95f4cde207 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
#1  0x00007f95f4cdf8f8 in __GI_abort () at abort.c:90
#2  0x00007f95f4cd7026 in __assert_fail_base
     (fmt=0x7f95f4e32520 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=assertion@entry=0x55b63154d370 "((&all_bdrv_states)->tqh_first == ((void *)0))", file=file@entry=0x55b63154c2b0 "block.c", line=line@entry=3534, function=function@entry=0x55b63154e03d <__PRETTY_FUNCTION__.30345> "bdrv_close_all")
     at assert.c:92
#3  0x00007f95f4cd70d2 in __GI___assert_fail
     (assertion=0x55b63154d370 "((&all_bdrv_states)->tqh_first == ((void *)0))", file=0x55b63154c2b0 "block.c", line=3534, function=0x55b63154e03d <__PRETTY_FUNCTION__.30345> "bdrv_close_all") at assert.c:101
#4  0x000055b6311e082e in bdrv_close_all () at block.c:3534
#5  0x000055b630f3ae5c in main (argc=18, argv=0x7ffda6f14428, envp=0x7ffda6f144c0) at vl.c:4670

Will dig into it a bit...
Kevin Wolf Jan. 24, 2019, 11:58 a.m. UTC | #18
Am 24.01.2019 um 11:49 hat Dr. David Alan Gilbert geschrieben:
> * Kevin Wolf (kwolf@redhat.com) wrote:
> > Am 24.01.2019 um 10:29 hat Vladimir Sementsov-Ogievskiy geschrieben:
> > > 23.01.2019 18:48, Max Reitz wrote:
> > > > Hi,
> > > > 
> > > > When running 169 in parallel (e.g. like so:
> > > > 
> > > > $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> > > > $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> > > > $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> > > > $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> > > > 
> > > > in four different shells), I get aborts:
> > > > 
> > > > (Often I get segfaults, but that's because of
> > > > http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
> > > > feel free to apply the attached patch to make them go away)
> > > > 
> > > > 
> > > > WARNING:qemu:qemu received signal 6:
> > > > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > > > -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
> > > > -mon chardev=mon,mode=control -display none -vga none -qtest
> > > > unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
> > > > -nodefaults -machine accel=qtest -drive
> > > > if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
> > > > .................E..
> > > > ======================================================================
> > > > ERROR:
> > > > test_do_test_migration_resume_source_not_persistent__not_migbitmap
> > > > (__main__.TestDirtyBitmapMigration)
> > > > ----------------------------------------------------------------------
> > > > Traceback (most recent call last):
> > > >    File "169", line 206, in <lambda>
> > > >      setattr(klass, 'test_' + method + name, lambda self: mc(self))
> > > >    File "169", line 113, in do_test_migration_resume_source
> > > >      self.check_bitmap(self.vm_a, sha256)
> > > >    File "169", line 72, in check_bitmap
> > > >      node='drive0', name='bitmap0')
> > > >    File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
> > > >      return self._qmp.cmd(cmd, args=qmp_args)
> > > >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
> > > >      return self.cmd_obj(qmp_cmd)
> > > >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
> > > >      resp = self.__json_read()
> > > >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
> > > > __json_read
> > > >      data = self.__sockfile.readline()
> > > >    File "/usr/lib64/python2.7/socket.py", line 451, in readline
> > > >      data = self._sock.recv(self._rbufsize)
> > > > error: [Errno 104] Connection reset by peer
> > > > 
> > > > ----------------------------------------------------------------------
> > > > Ran 20 tests
> > > > 
> > > > FAILED (errors=1)
> > > > 
> > > > 
> > > > Or:
> > > > 
> > > > WARNING:qemu:qemu received signal 6:
> > > > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > > > -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> > > > -mon chardev=mon,mode=control -display none -vga none -qtest
> > > > unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> > > > -machine accel=qtest -drive
> > > > if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> > > > WARNING:qemu:qemu received signal 6:
> > > > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > > > -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> > > > -mon chardev=mon,mode=control -display none -vga none -qtest
> > > > unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> > > > -machine accel=qtest -drive
> > > > if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> > > > 
> > > > ...................F
> > > > ======================================================================
> > > > FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
> > > > (__main__.TestDirtyBitmapMigration)
> > > > ----------------------------------------------------------------------
> > > > Traceback (most recent call last):
> > > >    File "169", line 206, in <lambda>
> > > >      setattr(klass, 'test_' + method + name, lambda self: mc(self))
> > > >    File "169", line 125, in do_test_migration_resume_source
> > > >      self.assertEqual(log, '')
> > > > AssertionError: "qemu-system-x86_64: invalid runstate transition:
> > > > 'running' -> 'postmigrate'\n" != ''
> > > > 
> > > > ----------------------------------------------------------------------
> > > > Ran 20 tests
> > > > 
> > > > FAILED (failures=1)
> > > > 
> > > > 
> > > > The backtrace always goes like this:
> > > > 
> > > > (gdb) bt
> > > > #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
> > > > #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
> > > > #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> > > > at vl.c:742
> > > > #3  0x000055a46ebbb1a6 in runstate_set
> > > > (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
> > > > #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
> > > > at migration/migration.c:2972
> > > > #5  0x000055a46ed39129 in migration_thread
> > > > (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
> > > > #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
> > > > util/qemu-thread-posix.c:502
> > > > 
> > > > 
> > > > #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
> > > > #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
> > > > (gdb) frame 2
> > > > #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> > > > at vl.c:742
> > > > 742             abort();
> > > > (gdb) print current_run_state
> > > > $1 = RUN_STATE_RUNNING
> > > > 
> > > > 
> > > > Neither of migration or runstates are my strong suite, so I thought I'd
> > > > report it before diving into it.
> > > > 
> > > > Max
> > > > 
> > > 
> > > [...]
> > > 450556@1548321072.888979:migrate_set_state new state active
> > > 450556@1548321072.889022:migration_thread_setup_complete
> > > 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
> > > 450556@1548321072.988283:migration_bitmap_sync_start
> > > 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
> > > 450556@1548321072.988495:migration_thread_low_pending 2048
> > > migration_completion
> > > 450556@1548321072.988541:runstate_set current_state 9 new_state 7
> > > 450556@1548321072.988780:migration_bitmap_sync_start
> > > 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
> > > 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
> > > [1] 450556@1548321072.993237:migrate_set_state new state completed
> > > 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
> > > 450556@1548321072.993697:migration_thread_after_loop
> > > [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
> > > 450476@1548321072.994461:runstate_set current_state 7 new_state 9
> > > qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
> > > 
> > > 
> > > Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] and go to RUNNING.
> > > then in migration thread we go to migration_iteration_finish() and fail to go to POSTMIGRATE.
> > > (note, that this testcase is about resuming source after migration)
> 
> Ah that makes sense as to why we failed.
> 
> > > So, fix for test may be additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETION.
> > > 
> > > But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
> > >   or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?
> > 
> > qmp_cont currently checks for RUN_STATE_INMIGRATE and gives it special
> > treatment. As a quick hack, doing the same for RUN_STATE_FINISH_MIGRATE
> > might fix the problem. A better simple fix would possibly include a
> > whitelist of states where you can use 'cont' to avoid that the user
> > messes with other internal states.
> > 
> > However, like the problem that Dave reported a few days ago (migrating
> > twice leads to a crash because we try to inactivate already inactive
> > block devices), I think this is a symptom of a larger problem. We're
> > not taking the state machine serious enough. One-off checks here and
> > there are unlikely to ever fully solve the problem of running a command
> > in a runstate in which it was never supposed to run.
> > 
> > I wonder whether the QAPI schema should have a field 'run-states' for
> > commands, and by default we would only include states where the VM has
> > ownership of its resources (e.g. images are activated) and which are not
> > temporary states that are automatically left, like finish-migrate.
> > 
> > Then the default for commands is to be rejected in "unusual" runstates
> > where we're not expecting user intervention, and we must explicitly
> > allow them if they are okay, in fact.
> > 
> > Instead of listing every obscure runstate, maybe we should really use
> > categories of runstates instead:
> > 
> > 1. Running
> > 2. Paused, owns all resources (like disk images)
> > 3. Paused, doesn't own some resources (source VM after migration
> >    completes, destination before migration completes)
> > 4. Paused temporarily for internal reasons (e.g. finish-migrate,
> >    restore-vm, save-vm)
> > 
> > Most commands should be okay with 1 and 2, but possibly not 3, and
> > almost never 4.
> 
> The problem is it's tricky to get those groupings right and to figure
> out how they all interact.  For example it's perfectly fine for most
> query type commands to run in (3) and (4), and you need a bunch of
> commands to let you recover from failures that might leave you in (3)
> and cont is one of those that you might use after a particular form
> of failed migration.

I'm not saying it is always easy, though I expect that many cases will
actually be rather obvious.

'cont' will be one of the commands that need to work in (3), but it
needs internal checks to make sure it behaves correctly. This is fine:
if we declare in the schema that a command works in (3), we'll also
have reviewed that it can actually handle the situation correctly.

I think being explicit about the expected runstates for each command
would already go a long way in addressing this class of bugs, because it
reminds us to even think about unusual runstates.
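
To make that concrete, here is a purely hypothetical sketch (none of
these names exist in the QAPI generator; the four categories are the
ones listed earlier in the thread) of how a per-command 'run-states'
mask could be checked at dispatch time:

    /* Hypothetical run-state categories, one bit per category. */
    typedef enum CmdRunStateMask {
        CMD_RS_RUNNING        = 1 << 0, /* 1. running                        */
        CMD_RS_PAUSED_OWNED   = 1 << 1, /* 2. paused, owns all resources     */
        CMD_RS_PAUSED_UNOWNED = 1 << 2, /* 3. paused, gave up some resources */
        CMD_RS_INTERNAL       = 1 << 3, /* 4. paused temporarily, internal   */
    } CmdRunStateMask;

    /* Default for commands that do not declare 'run-states' in the schema:
     * categories 1 and 2 only. */
    #define CMD_RS_DEFAULT (CMD_RS_RUNNING | CMD_RS_PAUSED_OWNED)

    /* Hypothetical dispatcher check; runstate_category() would map the
     * current RunState onto one of the four categories above. */
    static bool cmd_allowed_in_current_state(unsigned cmd_mask)
    {
        return (cmd_mask & runstate_category()) != 0;
    }

'cont' would then opt in to category 3 explicitly in the schema and keep
its internal checks for that case.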

Kevin
Eric Blake Jan. 24, 2019, 5:58 p.m. UTC | #19
On 1/24/19 4:15 AM, Kevin Wolf wrote:

>> But how to fix Qemu not to crash? May be, forbid some transitions (FINISH_MIGRATE -> RUNNING),
>>   or at least error-out qmp_cont if runstate is  FINISH_MIGRATE?
> 

> I wonder whether the QAPI schema should have a field 'run-states' for
> commands, and by default we would only include states where the VM has
> ownership of its resources (e.g. images are activated) and which are not
> temporary states that are automatically left, like finish-migrate.

We already have 'allow-oob' and 'allow-preconfig' flags on a per-command
basis; you're basically proposing that we extend this mechanism for
marking other attributes of commands,...

> 
> Then the default for commands is to be rejected in "unusual" runstates
> where we're not expecting user intervention, and we must explicitly
> allow them if they are okay, in fact.
> 
> Instead of listing every obscure runstate, maybe we should really use
> categories of runstates instead:
> 
> 1. Running
> 2. Paused, owns all resources (like disk images)
> 3. Paused, doesn't own some resources (source VM after migration
>    completes, destination before migration completes)
> 4. Paused temporarily for internal reasons (e.g. finish-migrate,
>    restore-vm, save-vm)
> 
> Most commands should be okay with 1 and 2, but possibly not 3, and
> almost never 4.

...then enforcing that commands are only executed according to the
attributes they have (where the default attributes match categories 1
and 2, and commands have to opt-in if they are safe to run in category 3
or 4 just like they have to opt-in for preconfig or oob usage).
Dr. David Alan Gilbert Jan. 24, 2019, 8:15 p.m. UTC | #20
* Kevin Wolf (kwolf@redhat.com) wrote:
> Am 24.01.2019 um 11:49 hat Dr. David Alan Gilbert geschrieben:
> > * Kevin Wolf (kwolf@redhat.com) wrote:
> > > Am 24.01.2019 um 10:29 hat Vladimir Sementsov-Ogievskiy geschrieben:
> > > > 23.01.2019 18:48, Max Reitz wrote:
> > > > > Hi,
> > > > > 
> > > > > When running 169 in parallel (e.g. like so:
> > > > > 
> > > > > $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> > > > > $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> > > > > $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> > > > > $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> > > > > 
> > > > > in four different shells), I get aborts:
> > > > > 
> > > > > (Often I get segfaults, but that's because of
> > > > > http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
> > > > > feel free to apply the attached patch to make them go away)
> > > > > 
> > > > > 
> > > > > WARNING:qemu:qemu received signal 6:
> > > > > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > > > > -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
> > > > > -mon chardev=mon,mode=control -display none -vga none -qtest
> > > > > unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
> > > > > -nodefaults -machine accel=qtest -drive
> > > > > if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
> > > > > .................E..
> > > > > ======================================================================
> > > > > ERROR:
> > > > > test_do_test_migration_resume_source_not_persistent__not_migbitmap
> > > > > (__main__.TestDirtyBitmapMigration)
> > > > > ----------------------------------------------------------------------
> > > > > Traceback (most recent call last):
> > > > >    File "169", line 206, in <lambda>
> > > > >      setattr(klass, 'test_' + method + name, lambda self: mc(self))
> > > > >    File "169", line 113, in do_test_migration_resume_source
> > > > >      self.check_bitmap(self.vm_a, sha256)
> > > > >    File "169", line 72, in check_bitmap
> > > > >      node='drive0', name='bitmap0')
> > > > >    File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
> > > > >      return self._qmp.cmd(cmd, args=qmp_args)
> > > > >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
> > > > >      return self.cmd_obj(qmp_cmd)
> > > > >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
> > > > >      resp = self.__json_read()
> > > > >    File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
> > > > > __json_read
> > > > >      data = self.__sockfile.readline()
> > > > >    File "/usr/lib64/python2.7/socket.py", line 451, in readline
> > > > >      data = self._sock.recv(self._rbufsize)
> > > > > error: [Errno 104] Connection reset by peer
> > > > > 
> > > > > ----------------------------------------------------------------------
> > > > > Ran 20 tests
> > > > > 
> > > > > FAILED (errors=1)
> > > > > 
> > > > > 
> > > > > Or:
> > > > > 
> > > > > WARNING:qemu:qemu received signal 6:
> > > > > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > > > > -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> > > > > -mon chardev=mon,mode=control -display none -vga none -qtest
> > > > > unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> > > > > -machine accel=qtest -drive
> > > > > if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> > > > > WARNING:qemu:qemu received signal 6:
> > > > > build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
> > > > > -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
> > > > > -mon chardev=mon,mode=control -display none -vga none -qtest
> > > > > unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
> > > > > -machine accel=qtest -drive
> > > > > if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
> > > > > 
> > > > > ...................F
> > > > > ======================================================================
> > > > > FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
> > > > > (__main__.TestDirtyBitmapMigration)
> > > > > ----------------------------------------------------------------------
> > > > > Traceback (most recent call last):
> > > > >    File "169", line 206, in <lambda>
> > > > >      setattr(klass, 'test_' + method + name, lambda self: mc(self))
> > > > >    File "169", line 125, in do_test_migration_resume_source
> > > > >      self.assertEqual(log, '')
> > > > > AssertionError: "qemu-system-x86_64: invalid runstate transition:
> > > > > 'running' -> 'postmigrate'\n" != ''
> > > > > 
> > > > > ----------------------------------------------------------------------
> > > > > Ran 20 tests
> > > > > 
> > > > > FAILED (failures=1)
> > > > > 
> > > > > 
> > > > > The backtrace always goes like this:
> > > > > 
> > > > > (gdb) bt
> > > > > #0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
> > > > > #1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
> > > > > #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> > > > > at vl.c:742
> > > > > #3  0x000055a46ebbb1a6 in runstate_set
> > > > > (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
> > > > > #4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
> > > > > at migration/migration.c:2972
> > > > > #5  0x000055a46ed39129 in migration_thread
> > > > > (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
> > > > > #6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
> > > > > util/qemu-thread-posix.c:502
> > > > > 
> > > > > 
> > > > > #7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
> > > > > #8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
> > > > > (gdb) frame 2
> > > > > #2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
> > > > > at vl.c:742
> > > > > 742             abort();
> > > > > (gdb) print current_run_state
> > > > > $1 = RUN_STATE_RUNNING
> > > > > 
> > > > > 
> > > > > Neither migration nor runstates are my strong suit, so I thought I'd
> > > > > report it before diving into it.
> > > > > 
> > > > > Max
> > > > > 
> > > > 
> > > > [...]
> > > > 450556@1548321072.888979:migrate_set_state new state active
> > > > 450556@1548321072.889022:migration_thread_setup_complete
> > > > 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent 100 bandwidth 9852 max_size 2955894
> > > > 450556@1548321072.988283:migration_bitmap_sync_start
> > > > 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898
> > > > 450556@1548321072.988495:migration_thread_low_pending 2048
> > > > migration_completion
> > > > 450556@1548321072.988541:runstate_set current_state 9 new_state 7
> > > > 450556@1548321072.988780:migration_bitmap_sync_start
> > > > 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898
> > > > 450556@1548321072.993112:migrate_global_state_pre_save saved state: running
> > > > [1] 450556@1548321072.993237:migrate_set_state new state completed
> > > > 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": "bitmap0", "node": "drive0"}}
> > > > 450556@1548321072.993697:migration_thread_after_loop
> > > > [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: {"execute": "cont"}
> > > > 450476@1548321072.994461:runstate_set current_state 7 new_state 9
> > > > qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate'
> > > > 
> > > > 
> > > > Aha, so in the test we wait for MIGRATION COMPLETED [1], then run qmp_cont [2] and go to RUNNING.
> > > > Meanwhile, in the migration thread, we reach migration_iteration_finish() and fail to transition to POSTMIGRATE.
> > > > (Note that this test case is about resuming the source after migration.)
> > 
> > Ah, that explains why we failed.
> > 
> > > > So the fix for the test may be to additionally wait for POSTMIGRATE, not only for MIGRATION COMPLETED.
> > > > 
> > > > But how do we fix QEMU so it does not crash? Maybe forbid some transitions (FINISH_MIGRATE -> RUNNING),
> > > > or at least make qmp_cont error out if the runstate is FINISH_MIGRATE?
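> > > > 
> > > > (For context, the allowed transitions live in a table in vl.c; the following is a
> > > > simplified sketch from memory, not the complete list:)
> > > > 
> > > >     /* Simplified, incomplete sketch of vl.c's transition table. */
> > > >     typedef struct { RunState from, to; } RunStateTransition;
> > > > 
> > > >     static const RunStateTransition runstate_transitions_def[] = {
> > > >         { RUN_STATE_RUNNING,        RUN_STATE_FINISH_MIGRATE },
> > > >         { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },      /* used by 'cont' here */
> > > >         { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
> > > >         /* ... but no RUNNING -> POSTMIGRATE entry, hence the abort ... */
> > > >     };
> > > > 
> > > > Note that just dropping the FINISH_MIGRATE -> RUNNING entry would only move the
> > > > abort into the runstate_set() call done on behalf of 'cont', so forbidding the
> > > > transition alone would not be enough; qmp_cont would still need its own check.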
> > > 
> > > qmp_cont currently checks for RUN_STATE_INMIGRATE and gives it special
> > > treatment. As a quick hack, doing the same for RUN_STATE_FINISH_MIGRATE
> > > might fix the problem. A slightly better simple fix would be a whitelist
> > > of states in which 'cont' is allowed, so that the user cannot mess with
> > > other internal states.
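> > > 
> > > As a rough illustration (a minimal sketch only; the whitelist contents and
> > > the exact placement inside qmp_cont() are assumptions, not a ready patch):
> > > 
> > >     void qmp_cont(Error **errp)
> > >     {
> > >         /* Example whitelist: FINISH_MIGRATE is deliberately absent, so
> > >          * 'cont' during that phase would fail cleanly instead of racing
> > >          * with the migration thread. */
> > >         if (!runstate_check(RUN_STATE_PAUSED) &&
> > >             !runstate_check(RUN_STATE_INMIGRATE) &&
> > >             !runstate_check(RUN_STATE_POSTMIGRATE)) {
> > >             error_setg(errp, "Resuming the VM is not allowed in the "
> > >                        "current runstate");
> > >             return;
> > >         }
> > > 
> > >         /* ... existing qmp_cont() logic (reset check, INMIGRATE
> > >          * autostart, vm_start(), ...) ... */
> > >     }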
> > > 
> > > However, like the problem that Dave reported a few days ago (migrating
> > > twice leads to a crash because we try to inactivate already inactive
> > > block devices), I think this is a symptom of a larger problem. We're
> > > not taking the state machine seriously enough. One-off checks here and
> > > there are unlikely to ever fully solve the problem of running a command
> > > in a runstate in which it was never supposed to run.
> > > 
> > > I wonder whether the QAPI schema should have a 'run-states' field for
> > > commands. By default it would only include states in which the VM owns
> > > its resources (e.g. images are activated) and which are not temporary
> > > states that are left automatically, like finish-migrate.
> > > 
> > > Commands would then be rejected by default in "unusual" runstates where
> > > we're not expecting user intervention, and we would have to explicitly
> > > allow the ones that are in fact okay.
> > > 
> > > Instead of listing every obscure runstate, maybe we should really use
> > > categories of runstates:
> > > 
> > > 1. Running
> > > 2. Paused, owns all resources (like disk images)
> > > 3. Paused, doesn't own some resources (source VM after migration
> > >    completes, destination before migration completes)
> > > 4. Paused temporarily for internal reasons (e.g. finish-migrate,
> > >    restore-vm, save-vm)
> > > 
> > > Most commands should be okay with 1 and 2, but possibly not 3, and
> > > almost never 4.
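> > > 
> > > Purely as a sketch of the idea (these names are made up and do not exist
> > > in QEMU):
> > > 
> > >     typedef enum RunStateCategory {
> > >         RUN_CAT_RUNNING,           /* 1. guest is executing */
> > >         RUN_CAT_PAUSED_OWNS_ALL,   /* 2. paused, owns all resources */
> > >         RUN_CAT_PAUSED_HANDED_OFF, /* 3. paused, some resources handed
> > >                                     *    over (e.g. source after
> > >                                     *    migration completes) */
> > >         RUN_CAT_INTERNAL_PAUSE,    /* 4. temporary internal pause
> > >                                     *    (finish-migrate, save-vm, ...) */
> > >     } RunStateCategory;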
> > 
> > The problem is that it's tricky to get those groupings right and to figure
> > out how they all interact.  For example, it's perfectly fine for most
> > query-type commands to run in (3) and (4), and you need a bunch of
> > commands to let you recover from failures that might leave you in (3);
> > cont is one of those that you might use after a particular form of
> > failed migration.
> 
> I'm not saying it is always easy, though I expect that many cases will
> actually be rather obvious.
> 
> 'cont' will be one of the commands that need to work in (3), but it
> needs internal checks to make sure it behaves correctly. This is fine:
> if we declare in the schema that a command works in (3), we'll also
> have reviewed that it can actually handle that situation correctly.

Maybe; but we've got deeper problems: many of the other runstate
changes aren't safe, and other QMP commands aren't safe during parts
of migration.  About the only safety mechanism we have is the BQL, and
we can't hold that around the whole of migration; so we've got this
dance at the end of migration where we take the lock some of the time
(no one I've asked can tell me why), but there are places where things
can jump in.

Interestingly, the migration state machine uses a 'set_state' call
that is usually called with the state you expect to currently be in, e.g.:
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_COMPLETED);

only causes a transition from active -> completed, and you can check
the old state if you want.  This is safer than what we use for
runstates, because it avoids a lot of the odder, arbitrary
transitions.
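
Roughly, the idea is (a simplified sketch, not the exact QEMU code; it
assumes the atomic_cmpxchg() helper from qemu/atomic.h):

    /* Only perform the transition if we are still in the state the
     * caller expected; a concurrent change elsewhere turns this into
     * a no-op instead of an abort. */
    static void set_state_checked(int *state, int expected_old, int new_state)
    {
        if (atomic_cmpxchg(state, expected_old, new_state) == expected_old) {
            /* transition happened; emit trace / QMP event here */
        }
    }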

Dave

> I think being explicit about the expected runstates for each command
> would already go a long way in addressing this class of bugs, because it
> reminds us to even think about unusual runstates.
> 
> Kevin
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Vladimir Sementsov-Ogievskiy April 8, 2019, 2:57 p.m. UTC | #21
23.01.2019 18:48, Max Reitz wrote:
> Hi,
> 
> When running 169 in parallel (e.g. like so:
> 
> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
> 
> in four different shells), I get aborts:
> 
> (Often I get segfaults, but that's because of
> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
> feel free to apply the attached patch to make them go away)

Hi,

Didn't you send it separately? The iotests still fail sometimes with a
segfault in the chardev code...

> 
> [...]
Max Reitz April 8, 2019, 3:08 p.m. UTC | #22
On 08.04.19 16:57, Vladimir Sementsov-Ogievskiy wrote:
> 23.01.2019 18:48, Max Reitz wrote:
>> Hi,
>>
>> When running 169 in parallel (e.g. like so:
>>
>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done
>>
>> in four different shells), I get aborts:
>>
>> (Often I get segfaults, but that's because of
>> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
>> feel free to apply the attached patch to make them go away)
> 
> Hi,
> 
> Didn't you send it in separate? iotests still fails sometimes with segfault
> in chardev code...

I didn’t, because I don’t believe that patch is entirely correct.
That’s why its commit title starts with “Hack”.

Max
diff mbox series

Patch

From 4f141f42f2ae8cf509495ee0962fd45e160f33af Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 23 Jan 2019 16:48:07 +0100
Subject: [PATCH] Hack to fix race in tcp_chr_disconnect()

---
 chardev/char-socket.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index eaa8e8b68f..9c326dcbf3 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -124,7 +124,12 @@  static void tcp_chr_accept(QIONetListener *listener,
                            void *opaque);
 
 static int tcp_chr_read_poll(void *opaque);
-static void tcp_chr_disconnect(Chardev *chr);
+static void tcp_chr_do_disconnect(Chardev *chr, bool locked);
+
+static void tcp_chr_disconnect(Chardev *chr)
+{
+    tcp_chr_do_disconnect(chr, false);
+}
 
 /* Called with chr_write_lock held.  */
 static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len)
@@ -148,7 +153,7 @@  static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len)
 
         if (ret < 0 && errno != EAGAIN) {
             if (tcp_chr_read_poll(chr) <= 0) {
-                tcp_chr_disconnect(chr);
+                tcp_chr_do_disconnect(chr, true);
                 return len;
             } /* else let the read handler finish it properly */
         }
@@ -444,8 +449,12 @@  static void update_disconnected_filename(SocketChardev *s)
  * reached, due to TLS or telnet initialization failure,
  * so can *not* assume s->connected == true
  */
-static void tcp_chr_disconnect(Chardev *chr)
+static void tcp_chr_do_disconnect(Chardev *chr, bool locked)
 {
+    if (!locked) {
+        qemu_mutex_lock(&chr->chr_write_lock);
+    }
+
     SocketChardev *s = SOCKET_CHARDEV(chr);
     bool emit_close = s->connected;
 
@@ -462,6 +471,10 @@  static void tcp_chr_disconnect(Chardev *chr)
     if (s->reconnect_time) {
         qemu_chr_socket_restart_timer(chr);
     }
+
+    if (!locked) {
+        qemu_mutex_unlock(&chr->chr_write_lock);
+    }
 }
 
 static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
-- 
2.20.1