diff mbox series

[v3] iotests: Tweak 030 in order to trigger a race condition with parallel jobs

Message ID 20180302112050.26670-1-berto@igalia.com
State New
Headers show
Series [v3] iotests: Tweak 030 in order to trigger a race condition with parallel jobs | expand

Commit Message

Alberto Garcia March 2, 2018, 11:20 a.m. UTC
This patch tweaks TestParallelOps in iotest 030 so it allocates data
in smaller regions (256KB/512KB instead of 512KB/1MB) and the
block-stream job in test_stream_commit() only needs to copy data that
is at the very end of the image.

This way when the block-stream job is awakened it will finish right
away without any chance of being stopped by block_job_sleep_ns(). This
triggers the bug that was fixed by 3d5d319e1221082974711af1d09d82f07
and is therefore a more useful test case for parallel block jobs.

After this patch the aforementiond bug can also be reproduced with the
test_stream_parallel() test case.

Since with this change the stream job in test_stream_commit() finishes
early, this patch introduces a similar test case where both jobs are
slowed down so they can actually run in parallel.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: John Snow <jsnow@redhat.com>
---

This patch was sent already in December but it seems to have been
forgotten. v3 is the same as v2 but with a typo fixed in the commit
message.

---
 tests/qemu-iotests/030     | 48 +++++++++++++++++++++++++++++++++++++++-------
 tests/qemu-iotests/030.out |  4 ++--
 2 files changed, 43 insertions(+), 9 deletions(-)

Comments

Max Reitz March 5, 2018, 4:59 p.m. UTC | #1
On 2018-03-02 12:20, Alberto Garcia wrote:
> This patch tweaks TestParallelOps in iotest 030 so it allocates data
> in smaller regions (256KB/512KB instead of 512KB/1MB) and the
> block-stream job in test_stream_commit() only needs to copy data that
> is at the very end of the image.
> 
> This way when the block-stream job is awakened it will finish right
> away without any chance of being stopped by block_job_sleep_ns(). This
> triggers the bug that was fixed by 3d5d319e1221082974711af1d09d82f07
> and is therefore a more useful test case for parallel block jobs.
> 
> After this patch the aforementiond bug can also be reproduced with the
> test_stream_parallel() test case.
> 
> Since with this change the stream job in test_stream_commit() finishes
> early, this patch introduces a similar test case where both jobs are
> slowed down so they can actually run in parallel.
> 
> Signed-off-by: Alberto Garcia <berto@igalia.com>
> Reviewed-by: John Snow <jsnow@redhat.com>
> ---
> 
> This patch was sent already in December but it seems to have been
> forgotten. v3 is the same as v2 but with a typo fixed in the commit
> message.
> 
> ---
>  tests/qemu-iotests/030     | 48 +++++++++++++++++++++++++++++++++++++++-------
>  tests/qemu-iotests/030.out |  4 ++--
>  2 files changed, 43 insertions(+), 9 deletions(-)
> 
> diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
> index 457984b8e9..44ad1e311f 100755
> --- a/tests/qemu-iotests/030
> +++ b/tests/qemu-iotests/030
> @@ -156,7 +156,7 @@ class TestSingleDrive(iotests.QMPTestCase):
>  class TestParallelOps(iotests.QMPTestCase):
>      num_ops = 4 # Number of parallel block-stream operations
>      num_imgs = num_ops * 2 + 1
> -    image_len = num_ops * 1024 * 1024
> +    image_len = num_ops * 512 * 1024
>      imgs = []
>  
>      def setUp(self):
> @@ -177,12 +177,12 @@ class TestParallelOps(iotests.QMPTestCase):
>  
>          # Put data into the images we are copying data from
>          for i in range(self.num_imgs / 2):
> -            img_index = i * 2 + 1
> -            # Alternate between 512k and 1M.
> +            img_index = self.num_imgs - i * 2 - 2

First of all, I don't like this very much because it's not clear that
img_index is going to be odd.

I'd prefer something like

  reverse_i = self.num_imgs / 2 - 1 - 1
  img_index = reverse_i * 2 + 1

Secondly, I've reverted 3d5d319e1221082 to test this, and I could
reproduce failure exactly once.  Since then, no luck (in like 20
attempts, I think)...

Max

> +            # Alternate between 256KB and 512KB.
>              # This way jobs will not finish in the same order they were created
> -            num_kb = 512 + 512 * (i % 2)
> +            num_kb = 256 + 256 * (i % 2)
>              qemu_io('-f', iotests.imgfmt,
> -                    '-c', 'write -P %d %d %d' % (i, i*1024*1024, num_kb * 1024),
> +                    '-c', 'write -P 0xFF %dk %dk' % (i * 512, num_kb),
>                      self.imgs[img_index])
>  
>          # Attach the drive to the VM
> @@ -318,12 +318,14 @@ class TestParallelOps(iotests.QMPTestCase):
>          self.wait_until_completed(drive='commit-drive0')
>  
>      # Test a block-stream and a block-commit job in parallel
> -    def test_stream_commit(self):
> +    # Here the stream job is supposed to finish quickly in order to reproduce
> +    # the scenario that triggers the bug fixed in 3d5d319e1221082974711af1d09
> +    def test_stream_commit_1(self):
>          self.assertLessEqual(8, self.num_imgs)
>          self.assert_no_active_block_jobs()
>  
>          # Stream from node0 into node2
> -        result = self.vm.qmp('block-stream', device='node2', job_id='node2')
> +        result = self.vm.qmp('block-stream', device='node2', base_node='node0', job_id='node2')
>          self.assert_qmp(result, 'return', {})
>  
>          # Commit from the active layer into node3
> @@ -348,6 +350,38 @@ class TestParallelOps(iotests.QMPTestCase):
>  
>          self.assert_no_active_block_jobs()
>  
> +    # This is similar to test_stream_commit_1 but both jobs are slowed
> +    # down so they can run in parallel for a little while.
> +    def test_stream_commit_2(self):
> +        self.assertLessEqual(8, self.num_imgs)
> +        self.assert_no_active_block_jobs()
> +
> +        # Stream from node0 into node4
> +        result = self.vm.qmp('block-stream', device='node4', base_node='node0', job_id='node4', speed=1024*1024)
> +        self.assert_qmp(result, 'return', {})
> +
> +        # Commit from the active layer into node5
> +        result = self.vm.qmp('block-commit', device='drive0', base=self.imgs[5], speed=1024*1024)
> +        self.assert_qmp(result, 'return', {})
> +
> +        # Wait for all jobs to be finished.
> +        pending_jobs = ['node4', 'drive0']
> +        while len(pending_jobs) > 0:
> +            for event in self.vm.get_qmp_events(wait=True):
> +                if event['event'] == 'BLOCK_JOB_COMPLETED':
> +                    node_name = self.dictpath(event, 'data/device')
> +                    self.assertTrue(node_name in pending_jobs)
> +                    self.assert_qmp_absent(event, 'data/error')
> +                    pending_jobs.remove(node_name)
> +                if event['event'] == 'BLOCK_JOB_READY':
> +                    self.assert_qmp(event, 'data/device', 'drive0')
> +                    self.assert_qmp(event, 'data/type', 'commit')
> +                    self.assert_qmp_absent(event, 'data/error')
> +                    self.assertTrue('drive0' in pending_jobs)
> +                    self.vm.qmp('block-job-complete', device='drive0')
> +
> +        self.assert_no_active_block_jobs()
> +
>      # Test the base_node parameter
>      def test_stream_base_node_name(self):
>          self.assert_no_active_block_jobs()
> diff --git a/tests/qemu-iotests/030.out b/tests/qemu-iotests/030.out
> index 391c8573ca..42314e9c00 100644
> --- a/tests/qemu-iotests/030.out
> +++ b/tests/qemu-iotests/030.out
> @@ -1,5 +1,5 @@
> -.......................
> +........................
>  ----------------------------------------------------------------------
> -Ran 23 tests
> +Ran 24 tests
>  
>  OK
>
Alberto Garcia March 6, 2018, 10:25 a.m. UTC | #2
On Mon 05 Mar 2018 05:59:47 PM CET, Max Reitz wrote:
> Secondly, I've reverted 3d5d319e1221082 to test this, and I could
> reproduce failure exactly once.  Since then, no luck (in like 20
> attempts, I think)...

Oh, I see. I was bisecting this and it seems that 1a63a907507fbbcf made
this problem more difficult to reproduce. If you revert that one too (in
addition to 3d5d319e1221082, that is) then it crashes very easily.

I don't think it makes sense to tweak the test ever further, I would
simply mention that "this triggers the bug that was fixed by 3d5d319e122
and 1a63a907507fbbcf" or something like that.

Berto
diff mbox series

Patch

diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index 457984b8e9..44ad1e311f 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -156,7 +156,7 @@  class TestSingleDrive(iotests.QMPTestCase):
 class TestParallelOps(iotests.QMPTestCase):
     num_ops = 4 # Number of parallel block-stream operations
     num_imgs = num_ops * 2 + 1
-    image_len = num_ops * 1024 * 1024
+    image_len = num_ops * 512 * 1024
     imgs = []
 
     def setUp(self):
@@ -177,12 +177,12 @@  class TestParallelOps(iotests.QMPTestCase):
 
         # Put data into the images we are copying data from
         for i in range(self.num_imgs / 2):
-            img_index = i * 2 + 1
-            # Alternate between 512k and 1M.
+            img_index = self.num_imgs - i * 2 - 2
+            # Alternate between 256KB and 512KB.
             # This way jobs will not finish in the same order they were created
-            num_kb = 512 + 512 * (i % 2)
+            num_kb = 256 + 256 * (i % 2)
             qemu_io('-f', iotests.imgfmt,
-                    '-c', 'write -P %d %d %d' % (i, i*1024*1024, num_kb * 1024),
+                    '-c', 'write -P 0xFF %dk %dk' % (i * 512, num_kb),
                     self.imgs[img_index])
 
         # Attach the drive to the VM
@@ -318,12 +318,14 @@  class TestParallelOps(iotests.QMPTestCase):
         self.wait_until_completed(drive='commit-drive0')
 
     # Test a block-stream and a block-commit job in parallel
-    def test_stream_commit(self):
+    # Here the stream job is supposed to finish quickly in order to reproduce
+    # the scenario that triggers the bug fixed in 3d5d319e1221082974711af1d09
+    def test_stream_commit_1(self):
         self.assertLessEqual(8, self.num_imgs)
         self.assert_no_active_block_jobs()
 
         # Stream from node0 into node2
-        result = self.vm.qmp('block-stream', device='node2', job_id='node2')
+        result = self.vm.qmp('block-stream', device='node2', base_node='node0', job_id='node2')
         self.assert_qmp(result, 'return', {})
 
         # Commit from the active layer into node3
@@ -348,6 +350,38 @@  class TestParallelOps(iotests.QMPTestCase):
 
         self.assert_no_active_block_jobs()
 
+    # This is similar to test_stream_commit_1 but both jobs are slowed
+    # down so they can run in parallel for a little while.
+    def test_stream_commit_2(self):
+        self.assertLessEqual(8, self.num_imgs)
+        self.assert_no_active_block_jobs()
+
+        # Stream from node0 into node4
+        result = self.vm.qmp('block-stream', device='node4', base_node='node0', job_id='node4', speed=1024*1024)
+        self.assert_qmp(result, 'return', {})
+
+        # Commit from the active layer into node5
+        result = self.vm.qmp('block-commit', device='drive0', base=self.imgs[5], speed=1024*1024)
+        self.assert_qmp(result, 'return', {})
+
+        # Wait for all jobs to be finished.
+        pending_jobs = ['node4', 'drive0']
+        while len(pending_jobs) > 0:
+            for event in self.vm.get_qmp_events(wait=True):
+                if event['event'] == 'BLOCK_JOB_COMPLETED':
+                    node_name = self.dictpath(event, 'data/device')
+                    self.assertTrue(node_name in pending_jobs)
+                    self.assert_qmp_absent(event, 'data/error')
+                    pending_jobs.remove(node_name)
+                if event['event'] == 'BLOCK_JOB_READY':
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/type', 'commit')
+                    self.assert_qmp_absent(event, 'data/error')
+                    self.assertTrue('drive0' in pending_jobs)
+                    self.vm.qmp('block-job-complete', device='drive0')
+
+        self.assert_no_active_block_jobs()
+
     # Test the base_node parameter
     def test_stream_base_node_name(self):
         self.assert_no_active_block_jobs()
diff --git a/tests/qemu-iotests/030.out b/tests/qemu-iotests/030.out
index 391c8573ca..42314e9c00 100644
--- a/tests/qemu-iotests/030.out
+++ b/tests/qemu-iotests/030.out
@@ -1,5 +1,5 @@ 
-.......................
+........................
 ----------------------------------------------------------------------
-Ran 23 tests
+Ran 24 tests
 
 OK