fix the co_queue multi-adding bug

Message ID 54D87C7B.9010600@redhat.com
State New

Commit Message

Paolo Bonzini Feb. 9, 2015, 9:23 a.m. UTC
On 07/02/2015 10:51, w00214312 wrote:
> From: Bin Wu <wu.wubin@huawei.com>
> 
> When we test drive_mirror between different hosts over nbd devices,
> we find that the qemu process sometimes crashes during the cancel phase.
> By checking the crash core file, we find the following stack, which means
> a coroutine re-enter error occurs:

This bug probably can be fixed simply by delaying the setting of
recv_coroutine.

What are the symptoms if you only apply your "qemu-coroutine-lock: fix
co_queue multi-adding bug" patch but not "qemu-coroutine: fix
qemu_co_queue_run_restart error"?

Can you try the patch below?  (Compile-tested only).
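
The ordering change proposed here can be pictured with a small standalone model (a sketch only, not QEMU code; FakeCoroutine, recv_slot and wake_all_published are invented names). The assumption, suggested by the crash during the cancel phase, is that some reply or teardown path scans recv_coroutine[] and enters whatever it finds there; if a request publishes its slot before it actually holds the send mutex, that path can try to wake a coroutine that is still queued on the mutex. Publishing the slot only after the mutex is held, as the patch below does, removes that window.

/*
 * Standalone toy model of the ordering problem (not QEMU code).  A
 * request publishes itself in a slot table and is later woken by
 * whoever scans that table.  If it publishes the slot before it holds
 * the send mutex, the scanner can try to wake it while it is still
 * queued on the mutex; publishing only after the mutex is held (what
 * the proposed patch does) removes that window.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_REQUESTS 4

typedef struct {
    bool queued_on_mutex;           /* still waiting for the send mutex */
} FakeCoroutine;

static FakeCoroutine *recv_slot[MAX_REQUESTS];

/* Stand-in for entering a coroutine from the reply/teardown path. */
static void wake(FakeCoroutine *co)
{
    /* The real code aborts on a coroutine re-enter; here we just
     * assert the analogous invariant. */
    assert(!co->queued_on_mutex);
    printf("woken safely\n");
}

/* Stand-in for the cancel/teardown path scanning the slot table. */
static void wake_all_published(void)
{
    for (int i = 0; i < MAX_REQUESTS; i++) {
        if (recv_slot[i]) {
            wake(recv_slot[i]);
        }
    }
}

int main(void)
{
    FakeCoroutine req = { .queued_on_mutex = true };

    /* Fixed ordering: nothing is published while the request is still
     * waiting for the mutex, so a concurrent scan finds nothing. */
    wake_all_published();

    /* Only once the mutex is held does the request take a slot. */
    req.queued_on_mutex = false;
    recv_slot[0] = &req;
    wake_all_published();

    /* The buggy ordering would set recv_slot[0] = &req while
     * queued_on_mutex is still true; the scan above would then trip
     * the assertion. */
    return 0;
}

Compiled with any C99 compiler, the first scan finds nothing and the second prints "woken safely"; moving the publish before the mutex is held trips the assertion instead.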

Comments

Bin Wu Feb. 9, 2015, 9:47 a.m. UTC | #1
On 2015/2/9 17:23, Paolo Bonzini wrote:
> 
> 
> On 07/02/2015 10:51, w00214312 wrote:
>> From: Bin Wu <wu.wubin@huawei.com>
>>
>> When we test drive_mirror between different hosts over nbd devices,
>> we find that the qemu process sometimes crashes during the cancel phase.
>> By checking the crash core file, we find the following stack, which means
>> a coroutine re-enter error occurs:
> 
> This bug probably can be fixed simply by delaying the setting of
> recv_coroutine.
> 
> What are the symptoms if you only apply your "qemu-coroutine-lock: fix
> co_queue multi-adding bug" patch but not "qemu-coroutine: fix
> qemu_co_queue_run_restart error"?
> 
> Can you try the patch below?  (Compile-tested only).
> 

Yes, I think this patch can solve the problem too. I will try the patch later.

> diff --git a/block/nbd-client.c b/block/nbd-client.c
> index 6e1c97c..23d6a71 100644
> --- a/block/nbd-client.c
> +++ b/block/nbd-client.c
> @@ -104,10 +104,21 @@ static int nbd_co_send_request(NbdClientSession *s,
>      QEMUIOVector *qiov, int offset)
>  {
>      AioContext *aio_context;
> -    int rc, ret;
> +    int rc, ret, i;
>  
>      qemu_co_mutex_lock(&s->send_mutex);
> +
> +    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
> +        if (s->recv_coroutine[i] == NULL) {
> +            s->recv_coroutine[i] = qemu_coroutine_self();
> +            break;
> +        }
> +    }
> +
> +    assert(i < MAX_NBD_REQUESTS);
> +    request->handle = INDEX_TO_HANDLE(s, i);
>      s->send_coroutine = qemu_coroutine_self();
> +
>      aio_context = bdrv_get_aio_context(s->bs);
>      aio_set_fd_handler(aio_context, s->sock,
>                         nbd_reply_ready, nbd_restart_write, s);
> @@ -164,8 +175,6 @@ static void nbd_co_receive_reply(NbdClientSession *s,
>  static void nbd_coroutine_start(NbdClientSession *s,
>     struct nbd_request *request)
>  {
> -    int i;
> -
>      /* Poor man semaphore.  The free_sema is locked when no other request
>       * can be accepted, and unlocked after receiving one reply.  */
>      if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
> @@ -174,15 +183,7 @@ static void nbd_coroutine_start(NbdClientSession *s,
>      }
>      s->in_flight++;
>  
> -    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
> -        if (s->recv_coroutine[i] == NULL) {
> -            s->recv_coroutine[i] = qemu_coroutine_self();
> -            break;
> -        }
> -    }
> -
> -    assert(i < MAX_NBD_REQUESTS);
> -    request->handle = INDEX_TO_HANDLE(s, i);
> +    /* s->recv_coroutine[i] is set as soon as we get the send_lock.  */
>  }
>  
>  static void nbd_coroutine_end(NbdClientSession *s,
> 
> 
>
Bin Wu Feb. 10, 2015, 6:34 a.m. UTC | #2
On 2015/2/9 17:23, Paolo Bonzini wrote:
> 
> 
> On 07/02/2015 10:51, w00214312 wrote:
>> From: Bin Wu <wu.wubin@huawei.com>
>>
>> When we test drive_mirror between different hosts over nbd devices,
>> we find that the qemu process sometimes crashes during the cancel phase.
>> By checking the crash core file, we find the following stack, which means
>> a coroutine re-enter error occurs:
> 
> This bug probably can be fixed simply by delaying the setting of
> recv_coroutine.
> 
> What are the symptoms if you only apply your "qemu-coroutine-lock: fix
> co_queue multi-adding bug" patch but not "qemu-coroutine: fix
> qemu_co_queue_run_restart error"?

These two patches solve two different problems:
- "qemu-coroutine-lock: fix co_queue multi-adding bug" fixes the coroutine
re-enter problem, which we hit when we send a cancel command right after
drive_mirror has started.
- "qemu-coroutine: fix qemu_co_queue_run_restart error" fixes a segfault
seen during drive_mirror between two VMs that copy large files to each
other.
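
The first problem can also be pictured with a toy wait queue (again just a sketch; Waiter, queue_add and queue_restart_all are made-up names, and this is not how either patch implements its fix): restarting a queue enters each entry once, so a coroutine that was added twice gets entered a second time while it is already running, which is exactly the re-enter abort seen in the crash.

/*
 * Toy model of the first problem (not how either patch implements its
 * fix).  Restarting a wait queue enters each queued waiter once, so a
 * waiter that was added twice would be entered a second time while it
 * is already running.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define QUEUE_LEN 8

typedef struct {
    const char *name;
    bool running;                   /* analogue of "already entered" */
} Waiter;

static Waiter *queue[QUEUE_LEN];
static int queue_len;

static void queue_add(Waiter *w)
{
    /* The invariant behind "co_queue multi-adding": one waiter, one
     * entry on the queue. */
    for (int i = 0; i < queue_len; i++) {
        assert(queue[i] != w && "waiter queued twice");
    }
    assert(queue_len < QUEUE_LEN);
    queue[queue_len++] = w;
}

static void queue_restart_all(void)
{
    for (int i = 0; i < queue_len; i++) {
        Waiter *w = queue[i];
        assert(!w->running && "re-entering a running waiter");
        w->running = true;
        printf("%s restarted\n", w->name);
    }
    queue_len = 0;
}

int main(void)
{
    Waiter a = { "request-a", false };
    Waiter b = { "request-b", false };

    queue_add(&a);
    queue_add(&b);
    /* Adding &a a second time here would trip the first assertion;
     * without that check, the restart below would trip the second. */
    queue_restart_all();
    return 0;
}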

> 
> Can you try the patch below?  (Compile-tested only).
> 
> diff --git a/block/nbd-client.c b/block/nbd-client.c
> index 6e1c97c..23d6a71 100644
> --- a/block/nbd-client.c
> +++ b/block/nbd-client.c
> @@ -104,10 +104,21 @@ static int nbd_co_send_request(NbdClientSession *s,
>      QEMUIOVector *qiov, int offset)
>  {
>      AioContext *aio_context;
> -    int rc, ret;
> +    int rc, ret, i;
>  
>      qemu_co_mutex_lock(&s->send_mutex);
> +
> +    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
> +        if (s->recv_coroutine[i] == NULL) {
> +            s->recv_coroutine[i] = qemu_coroutine_self();
> +            break;
> +        }
> +    }
> +
> +    assert(i < MAX_NBD_REQUESTS);
> +    request->handle = INDEX_TO_HANDLE(s, i);
>      s->send_coroutine = qemu_coroutine_self();
> +
>      aio_context = bdrv_get_aio_context(s->bs);
>      aio_set_fd_handler(aio_context, s->sock,
>                         nbd_reply_ready, nbd_restart_write, s);
> @@ -164,8 +175,6 @@ static void nbd_co_receive_reply(NbdClientSession *s,
>  static void nbd_coroutine_start(NbdClientSession *s,
>     struct nbd_request *request)
>  {
> -    int i;
> -
>      /* Poor man semaphore.  The free_sema is locked when no other request
>       * can be accepted, and unlocked after receiving one reply.  */
>      if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
> @@ -174,15 +183,7 @@ static void nbd_coroutine_start(NbdClientSession *s,
>      }
>      s->in_flight++;
>  
> -    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
> -        if (s->recv_coroutine[i] == NULL) {
> -            s->recv_coroutine[i] = qemu_coroutine_self();
> -            break;
> -        }
> -    }
> -
> -    assert(i < MAX_NBD_REQUESTS);
> -    request->handle = INDEX_TO_HANDLE(s, i);
> +    /* s->recv_coroutine[i] is set as soon as we get the send_lock.  */
>  }
>  
>  static void nbd_coroutine_end(NbdClientSession *s,
> 
> 
>

Patch

diff --git a/block/nbd-client.c b/block/nbd-client.c
index 6e1c97c..23d6a71 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -104,10 +104,21 @@  static int nbd_co_send_request(NbdClientSession *s,
     QEMUIOVector *qiov, int offset)
 {
     AioContext *aio_context;
-    int rc, ret;
+    int rc, ret, i;
 
     qemu_co_mutex_lock(&s->send_mutex);
+
+    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+        if (s->recv_coroutine[i] == NULL) {
+            s->recv_coroutine[i] = qemu_coroutine_self();
+            break;
+        }
+    }
+
+    assert(i < MAX_NBD_REQUESTS);
+    request->handle = INDEX_TO_HANDLE(s, i);
     s->send_coroutine = qemu_coroutine_self();
+
     aio_context = bdrv_get_aio_context(s->bs);
     aio_set_fd_handler(aio_context, s->sock,
                        nbd_reply_ready, nbd_restart_write, s);
@@ -164,8 +175,6 @@  static void nbd_co_receive_reply(NbdClientSession *s,
 static void nbd_coroutine_start(NbdClientSession *s,
    struct nbd_request *request)
 {
-    int i;
-
     /* Poor man semaphore.  The free_sema is locked when no other request
      * can be accepted, and unlocked after receiving one reply.  */
     if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
@@ -174,15 +183,7 @@  static void nbd_coroutine_start(NbdClientSession *s,
     }
     s->in_flight++;
 
-    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
-        if (s->recv_coroutine[i] == NULL) {
-            s->recv_coroutine[i] = qemu_coroutine_self();
-            break;
-        }
-    }
-
-    assert(i < MAX_NBD_REQUESTS);
-    request->handle = INDEX_TO_HANDLE(s, i);
+    /* s->recv_coroutine[i] is set as soon as we get the send_lock.  */
 }
 
 static void nbd_coroutine_end(NbdClientSession *s,