[v9,3/3] iotests: test nbd reconnect
diff mbox series

Message ID 20190917171322.4127-4-vsementsov@virtuozzo.com
State New
Headers show
Series
  • NBD reconnect
Related show

Commit Message

Vladimir Sementsov-Ogievskiy Sept. 17, 2019, 5:13 p.m. UTC
Add test, which starts backup to nbd target and restarts nbd server
during backup.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 tests/qemu-iotests/264        | 65 +++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/264.out    | 12 +++++++
 tests/qemu-iotests/group      |  1 +
 tests/qemu-iotests/iotests.py |  4 +++
 4 files changed, 82 insertions(+)
 create mode 100755 tests/qemu-iotests/264
 create mode 100644 tests/qemu-iotests/264.out

Comments

Eric Blake Sept. 23, 2019, 7:51 p.m. UTC | #1
On 9/17/19 12:13 PM, Vladimir Sementsov-Ogievskiy wrote:
> Add test, which starts backup to nbd target and restarts nbd server
> during backup.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>  tests/qemu-iotests/264        | 65 +++++++++++++++++++++++++++++++++++
>  tests/qemu-iotests/264.out    | 12 +++++++
>  tests/qemu-iotests/group      |  1 +
>  tests/qemu-iotests/iotests.py |  4 +++
>  4 files changed, 82 insertions(+)
>  create mode 100755 tests/qemu-iotests/264
>  create mode 100644 tests/qemu-iotests/264.out
> 

> +import time
> +
> +import iotests
> +from iotests import qemu_img_create, file_path, qemu_nbd_popen, log
> +
> +disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock')
> +
> +qemu_img_create('-f', iotests.imgfmt, disk_a, '5M')
> +qemu_img_create('-f', iotests.imgfmt, disk_b, '5M')
> +srv = qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b)
> +time.sleep(1)
> +

Hard-coded sleep times can be problematic (too long, and the test is no
longer quick; too short, and heavy load can interfere); is there any way
to poll for a completion event rather than just waiting one second?

> +vm = iotests.VM().add_drive(disk_a)
> +vm.launch()
> +vm.hmp_qemu_io('drive0', 'write 0 5M')
> +
> +vm.qmp_log('blockdev-add', filters=[iotests.filter_qmp_testfiles],
> +           **{'node_name': 'backup0',
> +              'driver': 'raw',
> +              'file': {'driver': 'nbd',
> +                       'server': {'type': 'unix', 'path': nbd_sock},
> +                       'reconnect-delay': 10}})
> +vm.qmp_log('blockdev-backup', device='drive0', sync='full', target='backup0',
> +           speed=(1 * 1024 * 1024))

Throttling to make sure we are likely to still be going and thus test
the reconnect logic.

> +
> +time.sleep(1)
> +log('Kill NBD server')
> +srv.kill()

Again, that hard-coded sleep looks a bit risky.

> +
> +jobs = vm.qmp('query-block-jobs')['return']
> +if jobs and jobs[0]['offset'] < jobs[0]['len']:
> +    log('Backup job is still in progress')
> +
> +time.sleep(1)

Why do we need yet another sleep?

> +
> +log('Start NBD server')
> +srv = qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b)
> +
> +vm.qmp_log('block-job-set-speed', device='drive0', speed=0)

So here, you release the throttle to finish the job.

> +e = vm.event_wait('BLOCK_JOB_COMPLETED')
> +log('Backup completed: {}'.format(e['data']['offset']))
> +
> +vm.qmp_log('blockdev-del', node_name='backup0')
> +srv.kill()
> +vm.shutdown()
> diff --git a/tests/qemu-iotests/264.out b/tests/qemu-iotests/264.out
> new file mode 100644
> index 0000000000..4a2f4aa509
> --- /dev/null
> +++ b/tests/qemu-iotests/264.out
> @@ -0,0 +1,12 @@
> +{"execute": "blockdev-add", "arguments": {"driver": "raw", "file": {"driver": "nbd", "reconnect-delay": 10, "server": {"path": "TEST_DIR/PID-nbd-sock", "type": "unix"}}, "node-name": "backup0"}}
> +{"return": {}}
> +{"execute": "blockdev-backup", "arguments": {"device": "drive0", "speed": 1048576, "sync": "full", "target": "backup0"}}
> +{"return": {}}
> +Kill NBD server
> +Backup job is still in progress
> +Start NBD server
> +{"execute": "block-job-set-speed", "arguments": {"device": "drive0", "speed": 0}}
> +{"return": {}}
> +Backup completed: 5242880
> +{"execute": "blockdev-del", "arguments": {"node-name": "backup0"}}
> +{"return": {}}
> diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
> index 5d3da937e4..4f6dd6f153 100644
> --- a/tests/qemu-iotests/group
> +++ b/tests/qemu-iotests/group
> @@ -275,5 +275,6 @@
>  258 rw quick
>  262 rw quick migration
>  263 rw quick
> +264 rw quick

With that many hard-coded sleeps, is it still quick?

/me applies the patch and runs it...

264      pass       [14:49:55] [14:50:01]   6s

that's borderline enough that I would not call it quick.

>  
> +def qemu_nbd_popen(*args):
> +    '''Run qemu-nbd in daemon mode and return the parent's exit code'''
> +    return subprocess.Popen(qemu_nbd_args + ['--persistent'] + list(args))
> +

Should you also use a pid file here, and wait for the existence of the
pid file before returning (rather than hard-coding sleep(1))?

>  def compare_images(img1, img2, fmt1=imgfmt, fmt2=imgfmt):
>      '''Return True if two image files are identical'''
>      return qemu_img('compare', '-f', fmt1,
> 

At any rate,

Tested-by: Eric Blake <eblake@redhat.com>
Vladimir Sementsov-Ogievskiy Sept. 24, 2019, 8:31 a.m. UTC | #2
23.09.2019 22:51, Eric Blake wrote:
> On 9/17/19 12:13 PM, Vladimir Sementsov-Ogievskiy wrote:
>> Add test, which starts backup to nbd target and restarts nbd server
>> during backup.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   tests/qemu-iotests/264        | 65 +++++++++++++++++++++++++++++++++++
>>   tests/qemu-iotests/264.out    | 12 +++++++
>>   tests/qemu-iotests/group      |  1 +
>>   tests/qemu-iotests/iotests.py |  4 +++
>>   4 files changed, 82 insertions(+)
>>   create mode 100755 tests/qemu-iotests/264
>>   create mode 100644 tests/qemu-iotests/264.out
>>
> 
>> +import time
>> +
>> +import iotests
>> +from iotests import qemu_img_create, file_path, qemu_nbd_popen, log
>> +
>> +disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock')
>> +
>> +qemu_img_create('-f', iotests.imgfmt, disk_a, '5M')
>> +qemu_img_create('-f', iotests.imgfmt, disk_b, '5M')
>> +srv = qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b)
>> +time.sleep(1)
>> +
> 
> Hard-coded sleep times can be problematic (too long, and the test is no
> longer quick; too short, and heavy load can interfere); is there any way
> to poll for a completion event rather than just waiting one second?

Hmm, I think qemu-nbd don't send events.. Here I just want to give it a time
to start. Possibly we can enable tracing and wait for some trace-point, but I'm
not sure that this is good idea. And sleep is simpler.

> 
>> +vm = iotests.VM().add_drive(disk_a)
>> +vm.launch()
>> +vm.hmp_qemu_io('drive0', 'write 0 5M')
>> +
>> +vm.qmp_log('blockdev-add', filters=[iotests.filter_qmp_testfiles],
>> +           **{'node_name': 'backup0',
>> +              'driver': 'raw',
>> +              'file': {'driver': 'nbd',
>> +                       'server': {'type': 'unix', 'path': nbd_sock},
>> +                       'reconnect-delay': 10}})
>> +vm.qmp_log('blockdev-backup', device='drive0', sync='full', target='backup0',
>> +           speed=(1 * 1024 * 1024))
> 
> Throttling to make sure we are likely to still be going and thus test
> the reconnect logic.
> 
>> +
>> +time.sleep(1)
>> +log('Kill NBD server')
>> +srv.kill()
> 
> Again, that hard-coded sleep looks a bit risky.

Hmm, you are right it's definitely risky.. I forget, that in down-stream virtuozzo
branch this place is rewritten to be smarter, I'll resend updated test, replying
to this thread.

> 
>> +
>> +jobs = vm.qmp('query-block-jobs')['return']
>> +if jobs and jobs[0]['offset'] < jobs[0]['len']:
>> +    log('Backup job is still in progress')
>> +
>> +time.sleep(1)
> 
> Why do we need yet another sleep?

The idea is to emulate that server is down for some time. It may be better
to disable throttling before this sleep, to give backup more chance to create
requests during server down-time.

> 
>> +
>> +log('Start NBD server')
>> +srv = qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b)
>> +
>> +vm.qmp_log('block-job-set-speed', device='drive0', speed=0)
> 
> So here, you release the throttle to finish the job.
> 
>> +e = vm.event_wait('BLOCK_JOB_COMPLETED')
>> +log('Backup completed: {}'.format(e['data']['offset']))
>> +
>> +vm.qmp_log('blockdev-del', node_name='backup0')
>> +srv.kill()
>> +vm.shutdown()
>> diff --git a/tests/qemu-iotests/264.out b/tests/qemu-iotests/264.out
>> new file mode 100644
>> index 0000000000..4a2f4aa509
>> --- /dev/null
>> +++ b/tests/qemu-iotests/264.out
>> @@ -0,0 +1,12 @@
>> +{"execute": "blockdev-add", "arguments": {"driver": "raw", "file": {"driver": "nbd", "reconnect-delay": 10, "server": {"path": "TEST_DIR/PID-nbd-sock", "type": "unix"}}, "node-name": "backup0"}}
>> +{"return": {}}
>> +{"execute": "blockdev-backup", "arguments": {"device": "drive0", "speed": 1048576, "sync": "full", "target": "backup0"}}
>> +{"return": {}}
>> +Kill NBD server
>> +Backup job is still in progress
>> +Start NBD server
>> +{"execute": "block-job-set-speed", "arguments": {"device": "drive0", "speed": 0}}
>> +{"return": {}}
>> +Backup completed: 5242880
>> +{"execute": "blockdev-del", "arguments": {"node-name": "backup0"}}
>> +{"return": {}}
>> diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
>> index 5d3da937e4..4f6dd6f153 100644
>> --- a/tests/qemu-iotests/group
>> +++ b/tests/qemu-iotests/group
>> @@ -275,5 +275,6 @@
>>   258 rw quick
>>   262 rw quick migration
>>   263 rw quick
>> +264 rw quick
> 
> With that many hard-coded sleeps, is it still quick?
> 
> /me applies the patch and runs it...
> 
> 264      pass       [14:49:55] [14:50:01]   6s
> 
> that's borderline enough that I would not call it quick.
> 
>>   
>> +def qemu_nbd_popen(*args):
>> +    '''Run qemu-nbd in daemon mode and return the parent's exit code'''
>> +    return subprocess.Popen(qemu_nbd_args + ['--persistent'] + list(args))
>> +
> 
> Should you also use a pid file here, and wait for the existence of the
> pid file before returning (rather than hard-coding sleep(1))?

What do you mean / how to do it?

We want to wait until listening socket is prepared..

> 
>>   def compare_images(img1, img2, fmt1=imgfmt, fmt2=imgfmt):
>>       '''Return True if two image files are identical'''
>>       return qemu_img('compare', '-f', fmt1,
>>
> 
> At any rate,
> 
> Tested-by: Eric Blake <eblake@redhat.com>
>
Eric Blake Oct. 4, 2019, 6:05 p.m. UTC | #3
On 9/24/19 3:31 AM, Vladimir Sementsov-Ogievskiy wrote:

>>> +def qemu_nbd_popen(*args):
>>> +    '''Run qemu-nbd in daemon mode and return the parent's exit code'''
>>> +    return subprocess.Popen(qemu_nbd_args + ['--persistent'] + list(args))
>>> +
>>
>> Should you also use a pid file here, and wait for the existence of the
>> pid file before returning (rather than hard-coding sleep(1))?
> 
> What do you mean / how to do it?
> 
> We want to wait until listening socket is prepared..

In shell:

qemu-nbd --pid-file=/path/to/file ...
while [ ! -e /path/to/file ]; do
   sleep ... # fractional second, or exponential, or whatever...
done
# Now the listening socket is indeed prepared

You'd have to translate that idiom to python.

Or:

pre-open Unix socket at /path/to/socket
LISTEN_PID=... LISTEN_FDS=1 qemu-nbd ... 3<>/path/to/socket

Now the socket is pre-created and passed into qemu-nbd via systemd 
socket activation, so you know the listening socket is ready without 
having to do any loop at all.  Here's a patch in libnbd where we just 
switched from waiting for the port to appear (because the test predated 
qemu-nbd pidfile support) to instead using socket activation, for reference:
https://github.com/libguestfs/libnbd/commit/352331d177
Vladimir Sementsov-Ogievskiy Oct. 7, 2019, 10:48 a.m. UTC | #4
04.10.2019 21:05, Eric Blake wrote:
> On 9/24/19 3:31 AM, Vladimir Sementsov-Ogievskiy wrote:
> 
>>>> +def qemu_nbd_popen(*args):
>>>> +    '''Run qemu-nbd in daemon mode and return the parent's exit code'''
>>>> +    return subprocess.Popen(qemu_nbd_args + ['--persistent'] + list(args))
>>>> +
>>>
>>> Should you also use a pid file here, and wait for the existence of the
>>> pid file before returning (rather than hard-coding sleep(1))?
>>
>> What do you mean / how to do it?
>>
>> We want to wait until listening socket is prepared..
> 
> In shell:
> 
> qemu-nbd --pid-file=/path/to/file ...
> while [ ! -e /path/to/file ]; do
>    sleep ... # fractional second, or exponential, or whatever...
> done
> # Now the listening socket is indeed prepared
> 
> You'd have to translate that idiom to python.

Don't see, how it is better than what I've done in 04.. But I can resend with this.
At least, the fact that socket is initialized before creating pid file is undocumented..

> 
> Or:
> 
> pre-open Unix socket at /path/to/socket
> LISTEN_PID=... LISTEN_FDS=1 qemu-nbd ... 3<>/path/to/socket
> 
> Now the socket is pre-created and passed into qemu-nbd via systemd socket activation, so you know the listening socket is ready without having to do any loop at all.  Here's a patch in libnbd where we just switched from waiting for the port to appear (because the test predated qemu-nbd pidfile support) to instead using socket activation, for reference:
> https://github.com/libguestfs/libnbd/commit/352331d177
> 

Hmm, I'm afraid I need more help in it, I don't know socket activation and googling for some time didn't help.
How to pre-open the socket? How to set LISTEN_PID? Looking at the code I see that activation path failed if
LISTEN_PID != getpid().. So I need to know qemu-nbd pid before starting it? o_O
Eric Blake Oct. 7, 2019, 8:03 p.m. UTC | #5
On 10/7/19 5:48 AM, Vladimir Sementsov-Ogievskiy wrote:

>>> We want to wait until listening socket is prepared..
>>
>> In shell:
>>
>> qemu-nbd --pid-file=/path/to/file ...
>> while [ ! -e /path/to/file ]; do
>>     sleep ... # fractional second, or exponential, or whatever...
>> done
>> # Now the listening socket is indeed prepared
>>
>> You'd have to translate that idiom to python.
> 
> Don't see, how it is better than what I've done in 04.. But I can resend with this.
> At least, the fact that socket is initialized before creating pid file is undocumented..

I just posted a patch to rectify that.

> 
>>
>> Or:
>>
>> pre-open Unix socket at /path/to/socket
>> LISTEN_PID=... LISTEN_FDS=1 qemu-nbd ... 3<>/path/to/socket
>>
>> Now the socket is pre-created and passed into qemu-nbd via systemd socket activation, so you know the listening socket is ready without having to do any loop at all.  Here's a patch in libnbd where we just switched from waiting for the port to appear (because the test predated qemu-nbd pidfile support) to instead using socket activation, for reference:
>> https://github.com/libguestfs/libnbd/commit/352331d177
>>
> 
> Hmm, I'm afraid I need more help in it, I don't know socket activation and googling for some time didn't help.
> How to pre-open the socket? How to set LISTEN_PID? Looking at the code I see that activation path failed if
> LISTEN_PID != getpid().. So I need to know qemu-nbd pid before starting it? o_O
> 

I'm not sure if a shell can open a Unix socket as a server.  The shell 
_does_ have the 'exec' builtin, such that you can manipulate the 
environment in shell and then use exec to guarantee the qemu-nbd process 
has the same pid as the shell process that just manipulated the 
environment; but it doesn't help unless you can also pass in the Unix 
socket pre-bound, and I'm not aware of command line tools that make that 
easy (maybe nc can do something like it, but then you have to wonder if 
nc is adding yet another layer of bounce-buffering).  But in other 
languages (C, Python, ...) you create a Unix socket, call bind() on it, 
then call fork().  In the parent process, you then close the just-bound 
fd, and call connect() on the socket - the connect will block until the 
child process uses the socket, but you are guaranteed that it won't fail 
because the server side is already bound.  In the child process, you use 
dup2() to move the fd to 3 and clear O_CLOEXEC, manipulate the 
environment to set LISTEN_PID to the current process id and LISTEN_FDS 
to 1, then exec.  The child process then has the correct id to realize 
that it has been handed a pre-bound socket, and skips any code that it 
would normally do to create the socket and bind it.

Note that setting LISTEN_PID in the child process after a fork() from a 
multi-threaded parent is _extremely_ difficult to do correctly (setenv() 
is not async-signal-safe, and anything that is not async-signal-safe can 
cause deadlock if invoked between fork() and exec() if the parent 
process was multi-threaded) - so it may even be easier to do a 
double-exec() - the first exec is to a shim to get rid of the 
async-signal-safe restrictions, and can then set LISTEN_PID without 
issues, and the second exec to the actual target.  But I don't know of 
any such shim designed for common use.

That said, socket activation isn't a necessity to use, just a 
convenience.  I don't care if the regression test uses socket 
activation, as long as it works at testing nbd reconnect.

Patch
diff mbox series

diff --git a/tests/qemu-iotests/264 b/tests/qemu-iotests/264
new file mode 100755
index 0000000000..e70f91c5ca
--- /dev/null
+++ b/tests/qemu-iotests/264
@@ -0,0 +1,65 @@ 
+#!/usr/bin/env python
+#
+# Test nbd reconnect
+#
+# Copyright (c) 2019 Virtuozzo International GmbH. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import time
+
+import iotests
+from iotests import qemu_img_create, file_path, qemu_nbd_popen, log
+
+disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock')
+
+qemu_img_create('-f', iotests.imgfmt, disk_a, '5M')
+qemu_img_create('-f', iotests.imgfmt, disk_b, '5M')
+srv = qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b)
+time.sleep(1)
+
+vm = iotests.VM().add_drive(disk_a)
+vm.launch()
+vm.hmp_qemu_io('drive0', 'write 0 5M')
+
+vm.qmp_log('blockdev-add', filters=[iotests.filter_qmp_testfiles],
+           **{'node_name': 'backup0',
+              'driver': 'raw',
+              'file': {'driver': 'nbd',
+                       'server': {'type': 'unix', 'path': nbd_sock},
+                       'reconnect-delay': 10}})
+vm.qmp_log('blockdev-backup', device='drive0', sync='full', target='backup0',
+           speed=(1 * 1024 * 1024))
+
+time.sleep(1)
+log('Kill NBD server')
+srv.kill()
+
+jobs = vm.qmp('query-block-jobs')['return']
+if jobs and jobs[0]['offset'] < jobs[0]['len']:
+    log('Backup job is still in progress')
+
+time.sleep(1)
+
+log('Start NBD server')
+srv = qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b)
+
+vm.qmp_log('block-job-set-speed', device='drive0', speed=0)
+e = vm.event_wait('BLOCK_JOB_COMPLETED')
+log('Backup completed: {}'.format(e['data']['offset']))
+
+vm.qmp_log('blockdev-del', node_name='backup0')
+srv.kill()
+vm.shutdown()
diff --git a/tests/qemu-iotests/264.out b/tests/qemu-iotests/264.out
new file mode 100644
index 0000000000..4a2f4aa509
--- /dev/null
+++ b/tests/qemu-iotests/264.out
@@ -0,0 +1,12 @@ 
+{"execute": "blockdev-add", "arguments": {"driver": "raw", "file": {"driver": "nbd", "reconnect-delay": 10, "server": {"path": "TEST_DIR/PID-nbd-sock", "type": "unix"}}, "node-name": "backup0"}}
+{"return": {}}
+{"execute": "blockdev-backup", "arguments": {"device": "drive0", "speed": 1048576, "sync": "full", "target": "backup0"}}
+{"return": {}}
+Kill NBD server
+Backup job is still in progress
+Start NBD server
+{"execute": "block-job-set-speed", "arguments": {"device": "drive0", "speed": 0}}
+{"return": {}}
+Backup completed: 5242880
+{"execute": "blockdev-del", "arguments": {"node-name": "backup0"}}
+{"return": {}}
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 5d3da937e4..4f6dd6f153 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -275,5 +275,6 @@ 
 258 rw quick
 262 rw quick migration
 263 rw quick
+264 rw quick
 265 rw auto quick
 266 rw quick
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index b26271187c..a9c496dd7e 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -229,6 +229,10 @@  def qemu_nbd_early_pipe(*args):
     else:
         return exitcode, subp.communicate()[0]
 
+def qemu_nbd_popen(*args):
+    '''Run qemu-nbd in daemon mode and return the parent's exit code'''
+    return subprocess.Popen(qemu_nbd_args + ['--persistent'] + list(args))
+
 def compare_images(img1, img2, fmt1=imgfmt, fmt2=imgfmt):
     '''Return True if two image files are identical'''
     return qemu_img('compare', '-f', fmt1,