
[Qemu-stable] rdma: fix multiple VMs parallel migration

Message ID CAO0YQGKgYTK+CQ1kJRXYa1yk-Z_LC+uo9MMvO=0b7Ho9dbcr_w@mail.gmail.com
State New
Headers show

Commit Message

Frank Yang Aug. 30, 2013, 12:39 p.m. UTC
When several VMs migrate with RDMA at the same time, the increased pressure
can cause packet loss and make the source and destination wait for each other,
so some VMs may stay blocked during the migration.

Fix the bug by using two completion queues, one for sending and one for
receiving.
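
For illustration, a minimal sketch of the per-direction completion-queue setup
this patch adopts, written against plain libibverbs/librdmacm; the helper name
setup_cqs() and the queue depths are placeholders, not the actual QEMU code:

    /* Sketch: one CQ per direction, both attached to the same RC queue pair. */
    #include <infiniband/verbs.h>
    #include <rdma/rdma_cma.h>

    static int setup_cqs(struct rdma_cm_id *cm_id, struct ibv_pd *pd,
                         struct ibv_comp_channel *comp_channel,
                         struct ibv_cq **send_cq, struct ibv_cq **recv_cq)
    {
        struct ibv_qp_init_attr attr = { 0 };

        *send_cq = ibv_create_cq(cm_id->verbs, 64, NULL, comp_channel, 0);
        *recv_cq = ibv_create_cq(cm_id->verbs, 32, NULL, comp_channel, 0);
        if (!*send_cq || !*recv_cq) {
            return -1;
        }

        attr.cap.max_send_wr = 64;
        attr.cap.max_recv_wr = 3;
        attr.cap.max_send_sge = 1;
        attr.cap.max_recv_sge = 1;
        attr.send_cq = *send_cq;   /* completions for sends and RDMA writes */
        attr.recv_cq = *recv_cq;   /* completions for posted receives */
        attr.qp_type = IBV_QPT_RC;

        return rdma_create_qp(cm_id, pd, &attr);
    }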

From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17 00:00:00 2001
From: Frank Yang <frank.yangjie@gmail.com>
Date: Fri, 30 Aug 2013 17:53:34 +0800
Subject: [PATCH] rdma: fix multiple VMs parallel migration

Signed-off-by: Frank Yang <frank.yangjie@gmail.com>
---
 migration-rdma.c | 57 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/migration-rdma.c b/migration-rdma.c
index 3d1266f..d0eacbb 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -362,7 +362,8 @@ typedef struct RDMAContext {
     struct ibv_qp *qp;                      /* queue pair */
     struct ibv_comp_channel *comp_channel;  /* completion channel */
     struct ibv_pd *pd;                      /* protection domain */
-    struct ibv_cq *cq;                      /* completion queue */
+    struct ibv_cq *send_cq;                 /* send completion queue */
+    struct ibv_cq *recv_cq;                 /* receive completion queue */

     /*
      * If a previous write failed (perhaps because of a failed
@@ -1006,9 +1007,12 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
      * Completion queue can be filled by both read and write work requests,
      * so must reflect the sum of both possible queue sizes.
      */
-    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
             NULL, rdma->comp_channel, 0);
-    if (!rdma->cq) {
+    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
+            rdma->comp_channel, 0);
+
+    if (!rdma->send_cq || !rdma->recv_cq) {
         fprintf(stderr, "failed to allocate completion queue\n");
         goto err_alloc_pd_cq;
     }
@@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
     attr.cap.max_recv_wr = 3;
     attr.cap.max_send_sge = 1;
     attr.cap.max_recv_sge = 1;
-    attr.send_cq = rdma->cq;
-    attr.recv_cq = rdma->cq;
+    attr.send_cq = rdma->send_cq;
+    attr.recv_cq = rdma->recv_cq;
     attr.qp_type = IBV_QPT_RC;

     ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
@@ -1361,13 +1365,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
  * Return the work request ID that completed.
  */
 static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
-                               uint32_t *byte_len)
+                               uint32_t *byte_len, int wrid_requested)
 {
     int ret;
     struct ibv_wc wc;
     uint64_t wr_id;

-    ret = ibv_poll_cq(rdma->cq, 1, &wc);
+    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+        wrid_requested == RDMA_WRID_SEND_CONTROL) {
+        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
+    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
+    }

     if (!ret) {
         *wr_id_out = RDMA_WRID_NONE;
@@ -1460,12 +1469,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
     void *cq_ctx;
     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;

-    if (ibv_req_notify_cq(rdma->cq, 0)) {
-        return -1;
-    }
     /* poll cq first */
     while (wr_id != wrid_requested) {
-        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
         if (ret < 0) {
             return ret;
         }
@@ -1487,6 +1493,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
     }

     while (1) {
+        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+            wrid_requested == RDMA_WRID_SEND_CONTROL) {
+            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
+                return -1;
+            }
+        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
+                return -1;
+            }
+        }
+
         /*
          * Coroutine doesn't start until process_incoming_migration()
          * so don't yield unless we know we're running inside of a coroutine.
@@ -1502,12 +1519,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,

         num_cq_events++;

-        if (ibv_req_notify_cq(cq, 0)) {
-            goto err_block_for_wrid;
-        }
-
         while (wr_id != wrid_requested) {
-            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
             if (ret < 0) {
                 goto err_block_for_wrid;
             }
@@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
         ibv_destroy_qp(rdma->qp);
         rdma->qp = NULL;
     }
-    if (rdma->cq) {
-        ibv_destroy_cq(rdma->cq);
-        rdma->cq = NULL;
+    if (rdma->send_cq) {
+        ibv_destroy_cq(rdma->send_cq);
+        rdma->send_cq = NULL;
+    }
+    if (rdma->recv_cq) {
+        ibv_destroy_cq(rdma->recv_cq);
+        rdma->recv_cq = NULL;
     }
     if (rdma->comp_channel) {
         ibv_destroy_comp_channel(rdma->comp_channel);
@@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
      */
     while (1) {
         uint64_t wr_id, wr_id_in;
-        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
+        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
         if (ret < 0) {
             fprintf(stderr, "rdma migration: polling error! %d\n", ret);
             goto err;

Comments

Isaku Yamahata Sept. 2, 2013, 12:46 p.m. UTC | #1
Hi. Can you elaborate why two CQs fix it? Does it depend on
HCA implementation?

I'm not against two CQs for sending and receiving. In fact I'm for it
because I use two CQs for postcopy RDMA support.

thanks,

On Fri, Aug 30, 2013 at 08:39:31PM +0800, Frank Yang wrote:
> When several VMs migrate with RDMA at the same time, the increased pressure
> cause packet loss probabilistically and make source and destination wait for
> each other. There might be some of VMs blocked during the migration.
> 
> Fix the bug by using two completion queues, for sending and receiving
> respectively.
> 
> From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17 00:00:00 2001
> From: Frank Yang <frank.yangjie@gmail.com>
> Date: Fri, 30 Aug 2013 17:53:34 +0800
> Subject: [PATCH] rdma: fix multiple VMs parallel migration
> 
> Signed-off-by: Frank Yang <frank.yangjie@gmail.com>
> ---
>  migration-rdma.c | 57 ++++++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 37 insertions(+), 20 deletions(-)
> 
> diff --git a/migration-rdma.c b/migration-rdma.c
> index 3d1266f..d0eacbb 100644
> --- a/migration-rdma.c
> +++ b/migration-rdma.c
> @@ -362,7 +362,8 @@ typedef struct RDMAContext {
>      struct ibv_qp *qp;                      /* queue pair */
>      struct ibv_comp_channel *comp_channel;  /* completion channel */
>      struct ibv_pd *pd;                      /* protection domain */
> -    struct ibv_cq *cq;                      /* completion queue */
> +    struct ibv_cq *send_cq;                 /* send completion queue */
> +    struct ibv_cq *recv_cq;                 /* receive completion queue */
>  
>      /*
>       * If a previous write failed (perhaps because of a failed
> @@ -1006,9 +1007,12 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
>       * Completion queue can be filled by both read and write work requests,
>       * so must reflect the sum of both possible queue sizes.
>       */
> -    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
> +    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
>              NULL, rdma->comp_channel, 0);
> -    if (!rdma->cq) {
> +    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
> +            rdma->comp_channel, 0);
> +
> +    if (!rdma->send_cq || !rdma->recv_cq) {
>          fprintf(stderr, "failed to allocate completion queue\n");
>          goto err_alloc_pd_cq;
>      }
> @@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
>      attr.cap.max_recv_wr = 3;
>      attr.cap.max_send_sge = 1;
>      attr.cap.max_recv_sge = 1;
> -    attr.send_cq = rdma->cq;
> -    attr.recv_cq = rdma->cq;
> +    attr.send_cq = rdma->send_cq;
> +    attr.recv_cq = rdma->recv_cq;
>      attr.qp_type = IBV_QPT_RC;
>  
>      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
> @@ -1361,13 +1365,18 @@ static void qemu_rdma_signal_unregister(RDMAContext
> *rdma, uint64_t index,
>   * Return the work request ID that completed.
>   */
>  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
> -                               uint32_t *byte_len)
> +                               uint32_t *byte_len, int wrid_requested)
>  {
>      int ret;
>      struct ibv_wc wc;
>      uint64_t wr_id;
>  
> -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
> +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
> +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
> +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
> +    }
>  
>      if (!ret) {
>          *wr_id_out = RDMA_WRID_NONE;
> @@ -1460,12 +1469,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
> int wrid_requested,
>      void *cq_ctx;
>      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
>  
> -    if (ibv_req_notify_cq(rdma->cq, 0)) {
> -        return -1;
> -    }
>      /* poll cq first */
>      while (wr_id != wrid_requested) {
> -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>          if (ret < 0) {
>              return ret;
>          }
> @@ -1487,6 +1493,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
> int wrid_requested,
>      }
>  
>      while (1) {
> +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
> +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
> +                return -1;
> +            }
> +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
> +                return -1;
> +            }
> +        }
> +
>          /*
>           * Coroutine doesn't start until process_incoming_migration()
>           * so don't yield unless we know we're running inside of a coroutine.
> @@ -1502,12 +1519,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
> int wrid_requested,
>  
>          num_cq_events++;
>  
> -        if (ibv_req_notify_cq(cq, 0)) {
> -            goto err_block_for_wrid;
> -        }
> -
>          while (wr_id != wrid_requested) {
> -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>              if (ret < 0) {
>                  goto err_block_for_wrid;
>              }
> @@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>          ibv_destroy_qp(rdma->qp);
>          rdma->qp = NULL;
>      }
> -    if (rdma->cq) {
> -        ibv_destroy_cq(rdma->cq);
> -        rdma->cq = NULL;
> +    if (rdma->send_cq) {
> +        ibv_destroy_cq(rdma->send_cq);
> +        rdma->send_cq = NULL;
> +    }
> +    if (rdma->recv_cq) {
> +        ibv_destroy_cq(rdma->recv_cq);
> +        rdma->recv_cq = NULL;
>      }
>      if (rdma->comp_channel) {
>          ibv_destroy_comp_channel(rdma->comp_channel);
> @@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void
> *opaque,
>       */
>      while (1) {
>          uint64_t wr_id, wr_id_in;
> -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
> +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
>          if (ret < 0) {
>              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
>              goto err;
> -- 
> 1.8.3.msysgit.0
> 
>
Frank Yang Sept. 3, 2013, 4:20 a.m. UTC | #2
Yes, it depends on the low-level implementation. During my earlier tests,
using one CQ for both sending and receiving may cause packet loss under
heavy load: the destination thinks it has sent the READY message
successfully, but the source still waits for it. This situation always
happens when the destination polls the receive CQE first.

So I think using only one CQ may cause packet conflict or something like
that, and it should be a driver bug. However, using two CQs fixes the problem.
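
In other words, with a single CQ a poll that is waiting for a send completion
can dequeue a completion from the other direction first. A simplified sketch
of the direction-aware polling the patch introduces (the enum values and
helper names here are illustrative, not the exact QEMU definitions):

    /* Sketch: choose the CQ that matches the completion we are waiting for,
     * so a send-side poll never consumes a receive-side CQE. */
    #include <stdint.h>
    #include <infiniband/verbs.h>

    enum { WRID_RDMA_WRITE, WRID_SEND_CONTROL, WRID_RECV_CONTROL };

    static struct ibv_cq *cq_for_wrid(struct ibv_cq *send_cq,
                                      struct ibv_cq *recv_cq, int wrid)
    {
        if (wrid == WRID_RDMA_WRITE || wrid == WRID_SEND_CONTROL) {
            return send_cq;
        }
        return recv_cq;                 /* WRID_RECV_CONTROL and above */
    }

    static int poll_one(struct ibv_cq *cq, uint64_t *wr_id_out)
    {
        struct ibv_wc wc;
        int n = ibv_poll_cq(cq, 1, &wc);

        if (n <= 0) {
            return n;                   /* 0: nothing yet, <0: poll error */
        }
        *wr_id_out = wc.wr_id;
        return wc.status == IBV_WC_SUCCESS ? 1 : -1;
    }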



2013/9/2 Isaku Yamahata <yamahata@private.email.ne.jp>

> Hi. Can you elaborate why two CQs fix it? Does it depend on
> HCA implementation?
>
> I'm not against two CQs for sending and receiving. In fact I'm for it
> because I use two CQs for postcopy RDMA support.
>
> thanks,
>
> On Fri, Aug 30, 2013 at 08:39:31PM +0800, Frank Yang wrote:
> > When several VMs migrate with RDMA at the same time, the increased
> pressure
> > cause packet loss probabilistically and make source and destination wait
> for
> > each other. There might be some of VMs blocked during the migration.
> >
> > Fix the bug by using two completion queues, for sending and receiving
> > respectively.
> >
> > From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17 00:00:00 2001
> > From: Frank Yang <frank.yangjie@gmail.com>
> > Date: Fri, 30 Aug 2013 17:53:34 +0800
> > Subject: [PATCH] rdma: fix multiple VMs parallel migration
> >
> > Signed-off-by: Frank Yang <frank.yangjie@gmail.com>
> > ---
> >  migration-rdma.c | 57
> ++++++++++++++++++++++++++++++++++++--------------------
> >  1 file changed, 37 insertions(+), 20 deletions(-)
> >
> > diff --git a/migration-rdma.c b/migration-rdma.c
> > index 3d1266f..d0eacbb 100644
> > --- a/migration-rdma.c
> > +++ b/migration-rdma.c
> > @@ -362,7 +362,8 @@ typedef struct RDMAContext {
> >      struct ibv_qp *qp;                      /* queue pair */
> >      struct ibv_comp_channel *comp_channel;  /* completion channel */
> >      struct ibv_pd *pd;                      /* protection domain */
> > -    struct ibv_cq *cq;                      /* completion queue */
> > +    struct ibv_cq *send_cq;                 /* send completion queue */
> > +    struct ibv_cq *recv_cq;                 /* receive completion queue
> */
> >
> >      /*
> >       * If a previous write failed (perhaps because of a failed
> > @@ -1006,9 +1007,12 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext
> *rdma)
> >       * Completion queue can be filled by both read and write work
> requests,
> >       * so must reflect the sum of both possible queue sizes.
> >       */
> > -    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
> > +    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX
> * 2),
> >              NULL, rdma->comp_channel, 0);
> > -    if (!rdma->cq) {
> > +    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX,
> NULL,
> > +            rdma->comp_channel, 0);
> > +
> > +    if (!rdma->send_cq || !rdma->recv_cq) {
> >          fprintf(stderr, "failed to allocate completion queue\n");
> >          goto err_alloc_pd_cq;
> >      }
> > @@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
> >      attr.cap.max_recv_wr = 3;
> >      attr.cap.max_send_sge = 1;
> >      attr.cap.max_recv_sge = 1;
> > -    attr.send_cq = rdma->cq;
> > -    attr.recv_cq = rdma->cq;
> > +    attr.send_cq = rdma->send_cq;
> > +    attr.recv_cq = rdma->recv_cq;
> >      attr.qp_type = IBV_QPT_RC;
> >
> >      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
> > @@ -1361,13 +1365,18 @@ static void
> qemu_rdma_signal_unregister(RDMAContext
> > *rdma, uint64_t index,
> >   * Return the work request ID that completed.
> >   */
> >  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
> > -                               uint32_t *byte_len)
> > +                               uint32_t *byte_len, int wrid_requested)
> >  {
> >      int ret;
> >      struct ibv_wc wc;
> >      uint64_t wr_id;
> >
> > -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
> > +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> > +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
> > +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
> > +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> > +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
> > +    }
> >
> >      if (!ret) {
> >          *wr_id_out = RDMA_WRID_NONE;
> > @@ -1460,12 +1469,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext
> *rdma,
> > int wrid_requested,
> >      void *cq_ctx;
> >      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
> >
> > -    if (ibv_req_notify_cq(rdma->cq, 0)) {
> > -        return -1;
> > -    }
> >      /* poll cq first */
> >      while (wr_id != wrid_requested) {
> > -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> > +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
> >          if (ret < 0) {
> >              return ret;
> >          }
> > @@ -1487,6 +1493,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext
> *rdma,
> > int wrid_requested,
> >      }
> >
> >      while (1) {
> > +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> > +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
> > +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
> > +                return -1;
> > +            }
> > +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> > +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
> > +                return -1;
> > +            }
> > +        }
> > +
> >          /*
> >           * Coroutine doesn't start until process_incoming_migration()
> >           * so don't yield unless we know we're running inside of a
> coroutine.
> > @@ -1502,12 +1519,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext
> *rdma,
> > int wrid_requested,
> >
> >          num_cq_events++;
> >
> > -        if (ibv_req_notify_cq(cq, 0)) {
> > -            goto err_block_for_wrid;
> > -        }
> > -
> >          while (wr_id != wrid_requested) {
> > -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> > +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len,
> wrid_requested);
> >              if (ret < 0) {
> >                  goto err_block_for_wrid;
> >              }
> > @@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
> >          ibv_destroy_qp(rdma->qp);
> >          rdma->qp = NULL;
> >      }
> > -    if (rdma->cq) {
> > -        ibv_destroy_cq(rdma->cq);
> > -        rdma->cq = NULL;
> > +    if (rdma->send_cq) {
> > +        ibv_destroy_cq(rdma->send_cq);
> > +        rdma->send_cq = NULL;
> > +    }
> > +    if (rdma->recv_cq) {
> > +        ibv_destroy_cq(rdma->recv_cq);
> > +        rdma->recv_cq = NULL;
> >      }
> >      if (rdma->comp_channel) {
> >          ibv_destroy_comp_channel(rdma->comp_channel);
> > @@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void
> > *opaque,
> >       */
> >      while (1) {
> >          uint64_t wr_id, wr_id_in;
> > -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
> > +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL,
> RDMA_WRID_RDMA_WRITE);
> >          if (ret < 0) {
> >              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
> >              goto err;
> > --
> > 1.8.3.msysgit.0
> >
> >
>
> --
> yamahata
>
Lei Li Sept. 3, 2013, 5:03 a.m. UTC | #3
Hi Frank,

I failed to apply this patch. Please make sure to use git-send-email, otherwise
it's a little hard to review. :)

On 08/30/2013 08:39 PM, Frank Yang wrote:
> When several VMs migrate with RDMA at the same time, the increased 
> pressure cause packet loss probabilistically and make source and 
> destination wait for each other. There might be some of VMs blocked 
> during the migration.
>
> Fix the bug by using two completion queues, for sending and receiving 
> respectively.

>
> From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17 00:00:00 2001
> From: Frank Yang <frank.yangjie@gmail.com 
> <mailto:frank.yangjie@gmail.com>>
> Date: Fri, 30 Aug 2013 17:53:34 +0800
> Subject: [PATCH] rdma: fix multiple VMs parallel migration

The commit message should be here within the patch. You can use 'git commit --amend'
to add it.
  

>
> Signed-off-by: Frank Yang <frank.yangjie@gmail.com 
> <mailto:frank.yangjie@gmail.com>>
> ---
>  migration-rdma.c | 57 
> ++++++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 37 insertions(+), 20 deletions(-)
>
> diff --git a/migration-rdma.c b/migration-rdma.c
> index 3d1266f..d0eacbb 100644
> --- a/migration-rdma.c
> +++ b/migration-rdma.c
> @@ -362,7 +362,8 @@ typedef struct RDMAContext {
>      struct ibv_qp *qp;                      /* queue pair */
>      struct ibv_comp_channel *comp_channel;  /* completion channel */
>      struct ibv_pd *pd;                      /* protection domain */
> -    struct ibv_cq *cq;                      /* completion queue */
> +    struct ibv_cq *send_cq;                 /* send completion queue */
> +    struct ibv_cq *recv_cq;                 /* receive completion 
> queue */
>      /*
>       * If a previous write failed (perhaps because of a failed
> @@ -1006,9 +1007,12 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
>       * Completion queue can be filled by both read and write work 
> requests,
>       * so must reflect the sum of both possible queue sizes.
>       */
> -    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
> +    rdma->send_cq = ibv_create_cq(rdma->verbs, 
> (RDMA_SIGNALED_SEND_MAX * 2),
>              NULL, rdma->comp_channel, 0);
> -    if (!rdma->cq) {
> +    rdma->recv_cq = ibv_create_cq(rdma->verbs, 
> RDMA_SIGNALED_SEND_MAX, NULL,
> +            rdma->comp_channel, 0);
> +
> +    if (!rdma->send_cq || !rdma->recv_cq) {
>          fprintf(stderr, "failed to allocate completion queue\n");
>          goto err_alloc_pd_cq;
>      }
> @@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
>      attr.cap.max_recv_wr = 3;
>      attr.cap.max_send_sge = 1;
>      attr.cap.max_recv_sge = 1;
> -    attr.send_cq = rdma->cq;
> -    attr.recv_cq = rdma->cq;
> +    attr.send_cq = rdma->send_cq;
> +    attr.recv_cq = rdma->recv_cq;
>      attr.qp_type = IBV_QPT_RC;
>      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
> @@ -1361,13 +1365,18 @@ static void 
> qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
>   * Return the work request ID that completed.
>   */
>  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
> -                               uint32_t *byte_len)
> +                               uint32_t *byte_len, int wrid_requested)
>  {
>      int ret;
>      struct ibv_wc wc;
>      uint64_t wr_id;
> -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
> +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
> +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
> +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
> +    }
>      if (!ret) {
>          *wr_id_out = RDMA_WRID_NONE;
> @@ -1460,12 +1469,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext 
> *rdma, int wrid_requested,
>      void *cq_ctx;
>      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
> -    if (ibv_req_notify_cq(rdma->cq, 0)) {
> -        return -1;
> -    }
>      /* poll cq first */
>      while (wr_id != wrid_requested) {
> -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>          if (ret < 0) {
>              return ret;
>          }
> @@ -1487,6 +1493,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext 
> *rdma, int wrid_requested,
>      }
>      while (1) {
> +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
> +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
> +                return -1;
> +            }
> +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
> +                return -1;
> +            }
> +        }
> +
>          /*
>           * Coroutine doesn't start until process_incoming_migration()
>           * so don't yield unless we know we're running inside of a 
> coroutine.
> @@ -1502,12 +1519,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext 
> *rdma, int wrid_requested,
>          num_cq_events++;
> -        if (ibv_req_notify_cq(cq, 0)) {
> -            goto err_block_for_wrid;
> -        }
> -
>          while (wr_id != wrid_requested) {
> -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, 
> wrid_requested);
>              if (ret < 0) {
>                  goto err_block_for_wrid;
>              }
> @@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>          ibv_destroy_qp(rdma->qp);
>          rdma->qp = NULL;
>      }
> -    if (rdma->cq) {
> -        ibv_destroy_cq(rdma->cq);
> -        rdma->cq = NULL;
> +    if (rdma->send_cq) {
> +        ibv_destroy_cq(rdma->send_cq);
> +        rdma->send_cq = NULL;
> +    }
> +    if (rdma->recv_cq) {
> +        ibv_destroy_cq(rdma->recv_cq);
> +        rdma->recv_cq = NULL;
>      }
>      if (rdma->comp_channel) {
>  ibv_destroy_comp_channel(rdma->comp_channel);
> @@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, 
> void *opaque,
>       */
>      while (1) {
>          uint64_t wr_id, wr_id_in;
> -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
> +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, 
> RDMA_WRID_RDMA_WRITE);
>          if (ret < 0) {
>              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
>              goto err;
> -- 
> 1.8.3.msysgit.0
>
>
Lei Li Sept. 3, 2013, 5:38 a.m. UTC | #4
On 09/03/2013 12:20 PM, Frank Yang wrote:
> Yes, it depends on low-level implementation. During my earlier test,

What do you mean by 'it depends on low-level implementation'? Did you test
it with IB or Ethernet?

> using one CQ to send and receive may cause packet loss with heavy load:
> the destination thinks it send READY message successfully but the source
> still waits for it. This situation always happens when the destination 
> polls
> receive CQE first.
>
> So I think using only one CQ may cause packet conflict or something 
> like that,
> and it should be the driver bug. However, using two CQs fix the problem.

If the receiver may not receive this READY message from the sender under heavy load
because of packet loss, why can two CQs avoid this?

>
>
>
> 2013/9/2 Isaku Yamahata <yamahata@private.email.ne.jp 
> <mailto:yamahata@private.email.ne.jp>>
>
>     Hi. Can you elaborate why two CQs fix it? Does it depend on
>     HCA implementation?
>
>     I'm not against two CQs for sending and receiving. In fact I'm for it
>     because I use two CQs for postcopy RDMA support.
>
>     thanks,
>
>     On Fri, Aug 30, 2013 at 08:39:31PM +0800, Frank Yang wrote:
>     > When several VMs migrate with RDMA at the same time, the
>     increased pressure
>     > cause packet loss probabilistically and make source and
>     destination wait for
>     > each other. There might be some of VMs blocked during the migration.
>     >
>     > Fix the bug by using two completion queues, for sending and
>     receiving
>     > respectively.
>     >
>     > From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17
>     00:00:00 2001
>     > From: Frank Yang <frank.yangjie@gmail.com
>     <mailto:frank.yangjie@gmail.com>>
>     > Date: Fri, 30 Aug 2013 17:53:34 +0800
>     > Subject: [PATCH] rdma: fix multiple VMs parallel migration
>     >
>     > Signed-off-by: Frank Yang <frank.yangjie@gmail.com
>     <mailto:frank.yangjie@gmail.com>>
>     > ---
>     >  migration-rdma.c | 57
>     ++++++++++++++++++++++++++++++++++++--------------------
>     >  1 file changed, 37 insertions(+), 20 deletions(-)
>     >
>     > diff --git a/migration-rdma.c b/migration-rdma.c
>     > index 3d1266f..d0eacbb 100644
>     > --- a/migration-rdma.c
>     > +++ b/migration-rdma.c
>     > @@ -362,7 +362,8 @@ typedef struct RDMAContext {
>     >      struct ibv_qp *qp;                      /* queue pair */
>     >      struct ibv_comp_channel *comp_channel;  /* completion
>     channel */
>     >      struct ibv_pd *pd;                      /* protection domain */
>     > -    struct ibv_cq *cq;                      /* completion queue */
>     > +    struct ibv_cq *send_cq;                 /* send completion
>     queue */
>     > +    struct ibv_cq *recv_cq;                 /* receive
>     completion queue */
>     >
>     >      /*
>     >       * If a previous write failed (perhaps because of a failed
>     > @@ -1006,9 +1007,12 @@ static int
>     qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
>     >       * Completion queue can be filled by both read and write
>     work requests,
>     >       * so must reflect the sum of both possible queue sizes.
>     >       */
>     > -    rdma->cq = ibv_create_cq(rdma->verbs,
>     (RDMA_SIGNALED_SEND_MAX * 3),
>     > +    rdma->send_cq = ibv_create_cq(rdma->verbs,
>     (RDMA_SIGNALED_SEND_MAX * 2),
>     >              NULL, rdma->comp_channel, 0);
>     > -    if (!rdma->cq) {
>     > +    rdma->recv_cq = ibv_create_cq(rdma->verbs,
>     RDMA_SIGNALED_SEND_MAX, NULL,
>     > +            rdma->comp_channel, 0);
>     > +
>     > +    if (!rdma->send_cq || !rdma->recv_cq) {
>     >          fprintf(stderr, "failed to allocate completion queue\n");
>     >          goto err_alloc_pd_cq;
>     >      }
>     > @@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext
>     *rdma)
>     >      attr.cap.max_recv_wr = 3;
>     >      attr.cap.max_send_sge = 1;
>     >      attr.cap.max_recv_sge = 1;
>     > -    attr.send_cq = rdma->cq;
>     > -    attr.recv_cq = rdma->cq;
>     > +    attr.send_cq = rdma->send_cq;
>     > +    attr.recv_cq = rdma->recv_cq;
>     >      attr.qp_type = IBV_QPT_RC;
>     >
>     >      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
>     > @@ -1361,13 +1365,18 @@ static void
>     qemu_rdma_signal_unregister(RDMAContext
>     > *rdma, uint64_t index,
>     >   * Return the work request ID that completed.
>     >   */
>     >  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t
>     *wr_id_out,
>     > -                               uint32_t *byte_len)
>     > +                               uint32_t *byte_len, int
>     wrid_requested)
>     >  {
>     >      int ret;
>     >      struct ibv_wc wc;
>     >      uint64_t wr_id;
>     >
>     > -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
>     > +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
>     > +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
>     > +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
>     > +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
>     > +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
>     > +    }
>     >
>     >      if (!ret) {
>     >          *wr_id_out = RDMA_WRID_NONE;
>     > @@ -1460,12 +1469,9 @@ static int
>     qemu_rdma_block_for_wrid(RDMAContext *rdma,
>     > int wrid_requested,
>     >      void *cq_ctx;
>     >      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
>     >
>     > -    if (ibv_req_notify_cq(rdma->cq, 0)) {
>     > -        return -1;
>     > -    }
>     >      /* poll cq first */
>     >      while (wr_id != wrid_requested) {
>     > -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
>     > +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len,
>     wrid_requested);
>     >          if (ret < 0) {
>     >              return ret;
>     >          }
>     > @@ -1487,6 +1493,17 @@ static int
>     qemu_rdma_block_for_wrid(RDMAContext *rdma,
>     > int wrid_requested,
>     >      }
>     >
>     >      while (1) {
>     > +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
>     > +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
>     > +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
>     > +                return -1;
>     > +            }
>     > +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
>     > +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
>     > +                return -1;
>     > +            }
>     > +        }
>     > +
>     >          /*
>     >           * Coroutine doesn't start until
>     process_incoming_migration()
>     >           * so don't yield unless we know we're running inside
>     of a coroutine.
>     > @@ -1502,12 +1519,8 @@ static int
>     qemu_rdma_block_for_wrid(RDMAContext *rdma,
>     > int wrid_requested,
>     >
>     >          num_cq_events++;
>     >
>     > -        if (ibv_req_notify_cq(cq, 0)) {
>     > -            goto err_block_for_wrid;
>     > -        }
>     > -
>     >          while (wr_id != wrid_requested) {
>     > -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
>     > +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len,
>     wrid_requested);
>     >              if (ret < 0) {
>     >                  goto err_block_for_wrid;
>     >              }
>     > @@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext
>     *rdma)
>     >          ibv_destroy_qp(rdma->qp);
>     >          rdma->qp = NULL;
>     >      }
>     > -    if (rdma->cq) {
>     > -        ibv_destroy_cq(rdma->cq);
>     > -        rdma->cq = NULL;
>     > +    if (rdma->send_cq) {
>     > +        ibv_destroy_cq(rdma->send_cq);
>     > +        rdma->send_cq = NULL;
>     > +    }
>     > +    if (rdma->recv_cq) {
>     > +        ibv_destroy_cq(rdma->recv_cq);
>     > +        rdma->recv_cq = NULL;
>     >      }
>     >      if (rdma->comp_channel) {
>     >  ibv_destroy_comp_channel(rdma->comp_channel);
>     > @@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile
>     *f, void
>     > *opaque,
>     >       */
>     >      while (1) {
>     >          uint64_t wr_id, wr_id_in;
>     > -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
>     > +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL,
>     RDMA_WRID_RDMA_WRITE);
>     >          if (ret < 0) {
>     >              fprintf(stderr, "rdma migration: polling error!
>     %d\n", ret);
>     >              goto err;
>     > --
>     > 1.8.3.msysgit.0
>     >
>     >
>
>     --
>     yamahata
>
>
mrhines@linux.vnet.ibm.com Sept. 3, 2013, 2:13 p.m. UTC | #5
No top-posting, please.

On 09/03/2013 12:20 AM, Frank Yang wrote:
> Yes, it depends on low-level implementation. During my earlier test,
> using one CQ to send and receive may cause packet loss with heavy load:
> the destination thinks it send READY message successfully but the source
> still waits for it. This situation always happens when the destination 
> polls
> receive CQE first.
>
> So I think using only one CQ may cause packet conflict or something 
> like that,
> and it should be the driver bug. However, using two CQs fix the problem.
>
>

This doesn't seem like a very clear answer..... are you sure it's packet loss?

The queue pairs are supposed to be reliable - I've never experienced a situation
where packets were simply "dropped" for no reason without breaking the
connection and putting the QP into an error state.

- Michael
Frank Yang Sept. 4, 2013, 3:23 a.m. UTC | #6
On 2013-9-3 22:13, Michael R. Hines wrote:
>
> No top-posting, please.
>
> On 09/03/2013 12:20 AM, Frank Yang wrote:
>> Yes, it depends on low-level implementation. During my earlier test,
>> using one CQ to send and receive may cause packet loss with heavy load:
>> the destination thinks it send READY message successfully but the source
>> still waits for it. This situation always happens when the destination polls
>> receive CQE first.
>>
>> So I think using only one CQ may cause packet conflict or something like that,
>> and it should be the driver bug. However, using two CQs fix the problem.
>>
>>
>
> This doesn't seem like a very clear answer ..... are you sure its packet loss?
>
> The queue pairs are supposed to be reliable - I've never experienced a situation
> where packets were simply "dropped" for no reason without breaking the
> connection and putting the QP into an error state.
>
> - Michael
>
The facts are:
1. The destination polls the send completion of the READY message successfully.
    Either the READY message is indeed sent successfully and the source does not
    receive it, or the destination does not send the READY message out at all.
2. I've tried to send the READY message again by adding some code during the
    migration. The source can receive that READY message successfully. So the
    connection is not broken and the QP works fine.

The packet loss I'm talking about does not refer only to loss during transmission.
The message may also not actually be sent out successfully: ibv_poll_cq() returns
with no error, but the source doesn't receive the message. From qemu's point of
view, the message it sent is lost.
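
As a related note, ibv_poll_cq() returning a completion only says that a work
request finished, not that it succeeded; the status of each work completion
still has to be checked, and a failed status usually means the QP has moved to
the error state. A small illustrative check, not taken from the patch:

    /* Sketch: count a polled completion as success only if its status agrees. */
    #include <inttypes.h>
    #include <stdio.h>
    #include <infiniband/verbs.h>

    static int check_one_completion(struct ibv_cq *cq)
    {
        struct ibv_wc wc;
        int n = ibv_poll_cq(cq, 1, &wc);

        if (n <= 0) {
            return n;                          /* 0: nothing yet, <0: error */
        }
        if (wc.status != IBV_WC_SUCCESS) {
            fprintf(stderr, "wr_id %" PRIu64 " failed: %s\n",
                    (uint64_t)wc.wr_id, ibv_wc_status_str(wc.status));
            return -1;                         /* QP is likely in error state */
        }
        return 1;                              /* one successful completion */
    }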
Frank Yang Sept. 4, 2013, 3:59 a.m. UTC | #7
On 2013-9-3 13:38, Lei Li wrote:
> On 09/03/2013 12:20 PM, Frank Yang wrote:
>> Yes, it depends on low-level implementation. During my earlier test,
>
> What do you mean by the 'it depends on low-level implementation'?  Do you test
> it with IB or Ethernet?
I've tested both IB (40 GigE) and Ethernet (10 GigE). IB seems better but can still fail.
I don't have IB (10 GigE), so I'm not sure whether it's related to the bandwidth or not.

>> using one CQ to send and receive may cause packet loss with heavy load:
>> the destination thinks it send READY message successfully but the source
>> still waits for it. This situation always happens when the destination polls
>> receive CQE first.
>>
>> So I think using only one CQ may cause packet conflict or something like that,
>> and it should be the driver bug. However, using two CQs fix the problem.
>
> If the receiver may not receive this READY message from sender under heavy load caused by
> packet loss, why two CQs can avoid this?
I haven't gone deeply into the kernel to see the implementation. But two CQs make
sure that qemu will not poll a receive CQE when it expects to poll a send CQE, and
that truly avoids the parallel migration failure. I've tested IB and Ethernet dozens
of times; all have succeeded so far.
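
A rough sketch of what the blocking path looks like after the change: the
completion notification is re-armed on whichever CQ matches the awaited work
request before sleeping on the shared completion channel. The helper below is
illustrative only and trims the error handling and wr_id matching the real
code does:

    /* Sketch: arm the CQ we actually wait on, sleep on the completion
     * channel, then drain completions from that same CQ. */
    #include <infiniband/verbs.h>

    static int wait_on_cq(struct ibv_comp_channel *comp_channel,
                          struct ibv_cq *cq)
    {
        struct ibv_cq *ev_cq;
        void *ev_ctx;
        struct ibv_wc wc;

        if (ibv_req_notify_cq(cq, 0)) {        /* arm before sleeping */
            return -1;
        }
        if (ibv_get_cq_event(comp_channel, &ev_cq, &ev_ctx)) {
            return -1;
        }
        ibv_ack_cq_events(ev_cq, 1);

        while (ibv_poll_cq(cq, 1, &wc) > 0) {  /* drain what completed */
            if (wc.status != IBV_WC_SUCCESS) {
                return -1;
            }
            /* ... compare wc.wr_id with the request being blocked on ... */
        }
        return 0;
    }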
>>
>>
>>
>> 2013/9/2 Isaku Yamahata <yamahata@private.email.ne.jp <mailto:yamahata@private.email.ne.jp>>
>>
>>     Hi. Can you elaborate why two CQs fix it? Does it depend on
>>     HCA implementation?
>>
>>     I'm not against two CQs for sending and receiving. In fact I'm for it
>>     because I use two CQs for postcopy RDMA support.
>>
>>     thanks,
>>
>>     On Fri, Aug 30, 2013 at 08:39:31PM +0800, Frank Yang wrote:
>>     > When several VMs migrate with RDMA at the same time, the
>>     increased pressure
>>     > cause packet loss probabilistically and make source and
>>     destination wait for
>>     > each other. There might be some of VMs blocked during the migration.
>>     >
>>     > Fix the bug by using two completion queues, for sending and
>>     receiving
>>     > respectively.
>>     >
>>     > From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17
>>     00:00:00 2001
>>     > From: Frank Yang <frank.yangjie@gmail.com
>>     <mailto:frank.yangjie@gmail.com>>
>>     > Date: Fri, 30 Aug 2013 17:53:34 +0800
>>     > Subject: [PATCH] rdma: fix multiple VMs parallel migration
>>     >
>>     > Signed-off-by: Frank Yang <frank.yangjie@gmail.com
>>     <mailto:frank.yangjie@gmail.com>>
>>     > ---
>>     >  migration-rdma.c | 57
>>     ++++++++++++++++++++++++++++++++++++--------------------
>>     >  1 file changed, 37 insertions(+), 20 deletions(-)
>>     >
>>     > diff --git a/migration-rdma.c b/migration-rdma.c
>>     > index 3d1266f..d0eacbb 100644
>>     > --- a/migration-rdma.c
>>     > +++ b/migration-rdma.c
>>     > @@ -362,7 +362,8 @@ typedef struct RDMAContext {
>>     >      struct ibv_qp *qp;                      /* queue pair */
>>     >      struct ibv_comp_channel *comp_channel;  /* completion
>>     channel */
>>     >      struct ibv_pd *pd;                      /* protection domain */
>>     > -    struct ibv_cq *cq;                      /* completion queue */
>>     > +    struct ibv_cq *send_cq;                 /* send completion
>>     queue */
>>     > +    struct ibv_cq *recv_cq;                 /* receive
>>     completion queue */
>>     >
>>     >      /*
>>     >       * If a previous write failed (perhaps because of a failed
>>     > @@ -1006,9 +1007,12 @@ static int
>>     qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
>>     >       * Completion queue can be filled by both read and write
>>     work requests,
>>     >       * so must reflect the sum of both possible queue sizes.
>>     >       */
>>     > -    rdma->cq = ibv_create_cq(rdma->verbs,
>>     (RDMA_SIGNALED_SEND_MAX * 3),
>>     > +    rdma->send_cq = ibv_create_cq(rdma->verbs,
>>     (RDMA_SIGNALED_SEND_MAX * 2),
>>     >              NULL, rdma->comp_channel, 0);
>>     > -    if (!rdma->cq) {
>>     > +    rdma->recv_cq = ibv_create_cq(rdma->verbs,
>>     RDMA_SIGNALED_SEND_MAX, NULL,
>>     > +            rdma->comp_channel, 0);
>>     > +
>>     > +    if (!rdma->send_cq || !rdma->recv_cq) {
>>     >          fprintf(stderr, "failed to allocate completion queue\n");
>>     >          goto err_alloc_pd_cq;
>>     >      }
>>     > @@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext
>>     *rdma)
>>     >      attr.cap.max_recv_wr = 3;
>>     >      attr.cap.max_send_sge = 1;
>>     >      attr.cap.max_recv_sge = 1;
>>     > -    attr.send_cq = rdma->cq;
>>     > -    attr.recv_cq = rdma->cq;
>>     > +    attr.send_cq = rdma->send_cq;
>>     > +    attr.recv_cq = rdma->recv_cq;
>>     >      attr.qp_type = IBV_QPT_RC;
>>     >
>>     >      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
>>     > @@ -1361,13 +1365,18 @@ static void
>>     qemu_rdma_signal_unregister(RDMAContext
>>     > *rdma, uint64_t index,
>>     >   * Return the work request ID that completed.
>>     >   */
>>     >  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t
>>     *wr_id_out,
>>     > -                               uint32_t *byte_len)
>>     > +                               uint32_t *byte_len, int
>>     wrid_requested)
>>     >  {
>>     >      int ret;
>>     >      struct ibv_wc wc;
>>     >      uint64_t wr_id;
>>     >
>>     > -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
>>     > +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
>>     > +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
>>     > +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
>>     > +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
>>     > +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
>>     > +    }
>>     >
>>     >      if (!ret) {
>>     >          *wr_id_out = RDMA_WRID_NONE;
>>     > @@ -1460,12 +1469,9 @@ static int
>>     qemu_rdma_block_for_wrid(RDMAContext *rdma,
>>     > int wrid_requested,
>>     >      void *cq_ctx;
>>     >      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
>>     >
>>     > -    if (ibv_req_notify_cq(rdma->cq, 0)) {
>>     > -        return -1;
>>     > -    }
>>     >      /* poll cq first */
>>     >      while (wr_id != wrid_requested) {
>>     > -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
>>     > +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len,
>>     wrid_requested);
>>     >          if (ret < 0) {
>>     >              return ret;
>>     >          }
>>     > @@ -1487,6 +1493,17 @@ static int
>>     qemu_rdma_block_for_wrid(RDMAContext *rdma,
>>     > int wrid_requested,
>>     >      }
>>     >
>>     >      while (1) {
>>     > +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
>>     > +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
>>     > +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
>>     > +                return -1;
>>     > +            }
>>     > +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
>>     > +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
>>     > +                return -1;
>>     > +            }
>>     > +        }
>>     > +
>>     >          /*
>>     >           * Coroutine doesn't start until
>>     process_incoming_migration()
>>     >           * so don't yield unless we know we're running inside
>>     of a coroutine.
>>     > @@ -1502,12 +1519,8 @@ static int
>>     qemu_rdma_block_for_wrid(RDMAContext *rdma,
>>     > int wrid_requested,
>>     >
>>     >          num_cq_events++;
>>     >
>>     > -        if (ibv_req_notify_cq(cq, 0)) {
>>     > -            goto err_block_for_wrid;
>>     > -        }
>>     > -
>>     >          while (wr_id != wrid_requested) {
>>     > -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
>>     > +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len,
>>     wrid_requested);
>>     >              if (ret < 0) {
>>     >                  goto err_block_for_wrid;
>>     >              }
>>     > @@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext
>>     *rdma)
>>     >          ibv_destroy_qp(rdma->qp);
>>     >          rdma->qp = NULL;
>>     >      }
>>     > -    if (rdma->cq) {
>>     > -        ibv_destroy_cq(rdma->cq);
>>     > -        rdma->cq = NULL;
>>     > +    if (rdma->send_cq) {
>>     > +        ibv_destroy_cq(rdma->send_cq);
>>     > +        rdma->send_cq = NULL;
>>     > +    }
>>     > +    if (rdma->recv_cq) {
>>     > +        ibv_destroy_cq(rdma->recv_cq);
>>     > +        rdma->recv_cq = NULL;
>>     >      }
>>     >      if (rdma->comp_channel) {
>>     >  ibv_destroy_comp_channel(rdma->comp_channel);
>>     > @@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile
>>     *f, void
>>     > *opaque,
>>     >       */
>>     >      while (1) {
>>     >          uint64_t wr_id, wr_id_in;
>>     > -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
>>     > +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL,
>>     RDMA_WRID_RDMA_WRITE);
>>     >          if (ret < 0) {
>>     >              fprintf(stderr, "rdma migration: polling error!
>>     %d\n", ret);
>>     >              goto err;
>>     > --
>>     > 1.8.3.msysgit.0
>>     >
>>     >
>>
>>     --
>>     yamahata
>>
>>
>
>

Patch

diff --git a/migration-rdma.c b/migration-rdma.c
index 3d1266f..d0eacbb 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -362,7 +362,8 @@  typedef struct RDMAContext {
     struct ibv_qp *qp;                      /* queue pair */
     struct ibv_comp_channel *comp_channel;  /* completion channel */
     struct ibv_pd *pd;                      /* protection domain */
-    struct ibv_cq *cq;                      /* completion queue */
+    struct ibv_cq *send_cq;                 /* send completion queue */
+    struct ibv_cq *recv_cq;                 /* receive completion queue */

     /*
      * If a previous write failed (perhaps because of a failed
@@ -1006,9 +1007,12 @@  static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
      * Completion queue can be filled by both read and write work requests,
      * so must reflect the sum of both possible queue sizes.
      */
-    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
             NULL, rdma->comp_channel, 0);
-    if (!rdma->cq) {
+    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
+            rdma->comp_channel, 0);
+
+    if (!rdma->send_cq || !rdma->recv_cq) {
         fprintf(stderr, "failed to allocate completion queue\n");