diff mbox

rdma: Fix cleanup in error paths

Message ID 4bc35b66-3d20-411a-a6d1-6f3263a13758@CMEXHTCAS2.ad.emulex.com
State New
Headers show

Commit Message

Padmanabh Ratnakar March 26, 2015, 2:08 a.m. UTC
As part of commit e325b49a320b493cc5d69e263751ff716dc458fe,
order in which resources are destroyed was changed for fixing
a seg fault. Due to this change, CQ will never get destroyed as
CQ should be destroyed after QP destruction. Seg fault is caused
improper cleanup when connection fails. Fixing cleanup after
connection failure and order in which resources are destroyed
in qemu_rdma_cleanup() routine.

Signed-off-by: Meghana Cheripady <meghana.cheripady@emulex.com>
Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>
---
 migration/rdma.c |   22 ++++++++--------------
 1 files changed, 8 insertions(+), 14 deletions(-)

Comments

Dr. David Alan Gilbert March 25, 2015, 10:39 a.m. UTC | #1
> As part of commit e325b49a320b493cc5d69e263751ff716dc458fe,
> order in which resources are destroyed was changed for fixing
> a seg fault. Due to this change, CQ will never get destroyed as
> CQ should be destroyed after QP destruction. Seg fault is caused
> improper cleanup when connection fails. Fixing cleanup after
> connection failure and order in which resources are destroyed
> in qemu_rdma_cleanup() routine.

Do you have a test case that triggers the seg fault?

> Signed-off-by: Meghana Cheripady <meghana.cheripady@emulex.com>
> Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>
> ---
>  migration/rdma.c |   22 ++++++++--------------
>  1 files changed, 8 insertions(+), 14 deletions(-)
> 
> diff --git a/migration/rdma.c b/migration/rdma.c
> index e6c3a67..77e3444 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -2194,6 +2194,10 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>          }
>      }
>  
> +    if (rdma->qp) {
> +        rdma_destroy_qp(rdma->cm_id);
> +        rdma->qp = NULL;
> +    }

OK, that makes sense, the manpage for rdma_destroy_qp says
'Users must destroy any QP associated with an rdma_cm_id before destroying the ID.'
so it's probably good to have this early.

>      if (rdma->cq) {
>          ibv_destroy_cq(rdma->cq);
>          rdma->cq = NULL;
> @@ -2206,18 +2210,14 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>          ibv_dealloc_pd(rdma->pd);
>          rdma->pd = NULL;
>      }
> -    if (rdma->listen_id) {
> -        rdma_destroy_id(rdma->listen_id);
> -        rdma->listen_id = NULL;
> -    }
>      if (rdma->cm_id) {
> -        if (rdma->qp) {
> -            rdma_destroy_qp(rdma->cm_id);
> -            rdma->qp = NULL;
> -        }
>          rdma_destroy_id(rdma->cm_id);
>          rdma->cm_id = NULL;
>      }
> +    if (rdma->listen_id) {
> +        rdma_destroy_id(rdma->listen_id);
> +        rdma->listen_id = NULL;
> +    }

Can you explain this reorder - why is the order of listen_id and cm_id important?
is that a failure on the destination?

>      if (rdma->channel) {
>          rdma_destroy_event_channel(rdma->channel);
>          rdma->channel = NULL;
> @@ -2309,8 +2309,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
>      if (ret) {
>          perror("rdma_connect");
>          ERROR(errp, "connecting to destination!");
> -        rdma_destroy_id(rdma->cm_id);
> -        rdma->cm_id = NULL;
>          goto err_rdma_source_connect;
>      }
>  
> @@ -2319,8 +2317,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
>          perror("rdma_get_cm_event after rdma_connect");
>          ERROR(errp, "connecting to destination!");
>          rdma_ack_cm_event(cm_event);
> -        rdma_destroy_id(rdma->cm_id);
> -        rdma->cm_id = NULL;
>          goto err_rdma_source_connect;
>      }
>  
> @@ -2328,8 +2324,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
>          perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
>          ERROR(errp, "connecting to destination!");
>          rdma_ack_cm_event(cm_event);
> -        rdma_destroy_id(rdma->cm_id);
> -        rdma->cm_id = NULL;
>          goto err_rdma_source_connect;
>      }
>      rdma->connected = true;
> -- 
> 1.7.1

Dave
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Juan Quintela March 25, 2015, 11:20 a.m. UTC | #2
Padmanabh Ratnakar <padmanabh.ratnakar@Emulex.Com> wrote:
> As part of commit e325b49a320b493cc5d69e263751ff716dc458fe,
> order in which resources are destroyed was changed for fixing
> a seg fault. Due to this change, CQ will never get destroyed as
> CQ should be destroyed after QP destruction. Seg fault is caused
> improper cleanup when connection fails. Fixing cleanup after
> connection failure and order in which resources are destroyed
> in qemu_rdma_cleanup() routine.
>
> Signed-off-by: Meghana Cheripady <meghana.cheripady@emulex.com>
> Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>
> ---
>  migration/rdma.c |   22 ++++++++--------------
>  1 files changed, 8 insertions(+), 14 deletions(-)
>
> diff --git a/migration/rdma.c b/migration/rdma.c
> index e6c3a67..77e3444 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -2194,6 +2194,10 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>          }
>      }
>  
> +    if (rdma->qp) {
> +        rdma_destroy_qp(rdma->cm_id);
> +        rdma->qp = NULL;
> +    }

Agreed with this change.


>      if (rdma->cq) {
>          ibv_destroy_cq(rdma->cq);
>          rdma->cq = NULL;
> @@ -2206,18 +2210,14 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>          ibv_dealloc_pd(rdma->pd);
>          rdma->pd = NULL;
>      }
> -    if (rdma->listen_id) {
> -        rdma_destroy_id(rdma->listen_id);
> -        rdma->listen_id = NULL;
> -    }

I am not sure about this one.  We have (receiving side)

create listen_id
cm_id = rdma_accept(listen_id)

So, it looks better to do the cleanup in the other order, but notice
that with current code, I think this don't matter (i.e. we can't call
qemu_rdma_cleanup() from other place that accept).


>      if (rdma->cm_id) {
> -        if (rdma->qp) {
> -            rdma_destroy_qp(rdma->cm_id);
> -            rdma->qp = NULL;
> -        }

This is the "companion" of the 1st chunk, and I agree with the c"

>          rdma_destroy_id(rdma->cm_id);
>          rdma->cm_id = NULL;
>      }
> +    if (rdma->listen_id) {
> +        rdma_destroy_id(rdma->listen_id);
> +        rdma->listen_id = NULL;
> +    }

Companion of the second chunk that I can't understand why it is moved.

>      if (rdma->channel) {
>          rdma_destroy_event_channel(rdma->channel);
>          rdma->channel = NULL;
> @@ -2309,8 +2309,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
>      if (ret) {
>          perror("rdma_connect");
>          ERROR(errp, "connecting to destination!");
> -        rdma_destroy_id(rdma->cm_id);
> -        rdma->cm_id = NULL;
>          goto err_rdma_source_connect;
>      }

This other three are nice, remove code, and make it correct with the
case that qp has to be removed first.

So, should we drop the listen_id part, or there is a reason for it?

Later, Juan.


> @@ -2319,8 +2317,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
>          perror("rdma_get_cm_event after rdma_connect");
>          ERROR(errp, "connecting to destination!");
>          rdma_ack_cm_event(cm_event);
> -        rdma_destroy_id(rdma->cm_id);
> -        rdma->cm_id = NULL;
>          goto err_rdma_source_connect;
>      }
>  
> @@ -2328,8 +2324,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
>          perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
>          ERROR(errp, "connecting to destination!");
>          rdma_ack_cm_event(cm_event);
> -        rdma_destroy_id(rdma->cm_id);
> -        rdma->cm_id = NULL;
>          goto err_rdma_source_connect;
>      }
>      rdma->connected = true;
Padmanabh Ratnakar March 25, 2015, 1:50 p.m. UTC | #3
> -----Original Message-----
> From: Dr. David Alan Gilbert [mailto:dgilbert@redhat.com]
> Sent: Wednesday, March 25, 2015 4:10 PM
> To: Padmanabh Ratnakar
> Cc: quintela@redhat.com; amit.shah@redhat.com; mrhines@us.ibm.com;
> Padmanabh Ratnakar; Meghana Cheripady; qemu-devel@nongnu.org
> Subject: Re: [PATCH] rdma: Fix cleanup in error paths
> 
> > As part of commit e325b49a320b493cc5d69e263751ff716dc458fe,
> > order in which resources are destroyed was changed for fixing a seg
> > fault. Due to this change, CQ will never get destroyed as CQ should be
> > destroyed after QP destruction. Seg fault is caused improper cleanup
> > when connection fails. Fixing cleanup after connection failure and
> > order in which resources are destroyed in qemu_rdma_cleanup() routine.
> 
> Do you have a test case that triggers the seg fault?

We reverted the change of destroy CQ before destroy QP as it was preventing CQ
from getting destroyed.
Then we connected RoCE adapter to a regular NIC adapter.
We tried to establish connection using qemu  from RoCE adapter to other side.
Connection establishment times out and qemu hits seg fault while QP is getting destroyed. 

> 
> > Signed-off-by: Meghana Cheripady <meghana.cheripady@emulex.com>
> > Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>
> > ---
> >  migration/rdma.c |   22 ++++++++--------------
> >  1 files changed, 8 insertions(+), 14 deletions(-)
> >
> > diff --git a/migration/rdma.c b/migration/rdma.c index
> > e6c3a67..77e3444 100644
> > --- a/migration/rdma.c
> > +++ b/migration/rdma.c
> > @@ -2194,6 +2194,10 @@ static void qemu_rdma_cleanup(RDMAContext
> *rdma)
> >          }
> >      }
> >
> > +    if (rdma->qp) {
> > +        rdma_destroy_qp(rdma->cm_id);
> > +        rdma->qp = NULL;
> > +    }
> 
> OK, that makes sense, the manpage for rdma_destroy_qp says 'Users must
> destroy any QP associated with an rdma_cm_id before destroying the ID.'
> so it's probably good to have this early.
> 
> >      if (rdma->cq) {
> >          ibv_destroy_cq(rdma->cq);
> >          rdma->cq = NULL;
> > @@ -2206,18 +2210,14 @@ static void qemu_rdma_cleanup(RDMAContext
> *rdma)
> >          ibv_dealloc_pd(rdma->pd);
> >          rdma->pd = NULL;
> >      }
> > -    if (rdma->listen_id) {
> > -        rdma_destroy_id(rdma->listen_id);
> > -        rdma->listen_id = NULL;
> > -    }
> >      if (rdma->cm_id) {
> > -        if (rdma->qp) {
> > -            rdma_destroy_qp(rdma->cm_id);
> > -            rdma->qp = NULL;
> > -        }
> >          rdma_destroy_id(rdma->cm_id);
> >          rdma->cm_id = NULL;
> >      }
> > +    if (rdma->listen_id) {
> > +        rdma_destroy_id(rdma->listen_id);
> > +        rdma->listen_id = NULL;
> > +    }
> 
> Can you explain this reorder - why is the order of listen_id and cm_id
> important?
> is that a failure on the destination?

listen_id is used only in server side.
Saw that listen_id gets created before cm_id.
So for symmetry destroyed cm_id before listen_id.
Checked the status of rdma_destroy_id() and both connection
identifier were getting destroyed successfully.
Please let me know if I should revert this change.  
 
> 
> >      if (rdma->channel) {
> >          rdma_destroy_event_channel(rdma->channel);
> >          rdma->channel = NULL;
> > @@ -2309,8 +2309,6 @@ static int qemu_rdma_connect(RDMAContext
> *rdma, Error **errp)
> >      if (ret) {
> >          perror("rdma_connect");
> >          ERROR(errp, "connecting to destination!");
> > -        rdma_destroy_id(rdma->cm_id);
> > -        rdma->cm_id = NULL;
> >          goto err_rdma_source_connect;
> >      }
> >
> > @@ -2319,8 +2317,6 @@ static int qemu_rdma_connect(RDMAContext
> *rdma, Error **errp)
> >          perror("rdma_get_cm_event after rdma_connect");
> >          ERROR(errp, "connecting to destination!");
> >          rdma_ack_cm_event(cm_event);
> > -        rdma_destroy_id(rdma->cm_id);
> > -        rdma->cm_id = NULL;
> >          goto err_rdma_source_connect;
> >      }
> >
> > @@ -2328,8 +2324,6 @@ static int qemu_rdma_connect(RDMAContext
> *rdma, Error **errp)
> >          perror("rdma_get_cm_event != EVENT_ESTABLISHED after
> rdma_connect");
> >          ERROR(errp, "connecting to destination!");
> >          rdma_ack_cm_event(cm_event);
> > -        rdma_destroy_id(rdma->cm_id);
> > -        rdma->cm_id = NULL;
> >          goto err_rdma_source_connect;
> >      }
> >      rdma->connected = true;
> > --
> > 1.7.1
> 
> Dave
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

Thanks,
Padmanabh
Padmanabh Ratnakar March 25, 2015, 1:54 p.m. UTC | #4
> 
> This other three are nice, remove code, and make it correct with the case
> that qp has to be removed first.
> 
> So, should we drop the listen_id part, or there is a reason for it?
> 
> Later, Juan.
> 

listen_id is used only in server side.
Saw that listen_id gets created before cm_id.
So for symmetry destroyed cm_id before listen_id.
Checked the status of rdma_destroy_id() and both connection identifier were getting destroyed successfully.
Please see if I am missing something and should I revert this change.  

Thanks,
Padmanabh
Juan Quintela March 25, 2015, 2:12 p.m. UTC | #5
Padmanabh Ratnakar <padmanabh.ratnakar@Emulex.Com> wrote:
> As part of commit e325b49a320b493cc5d69e263751ff716dc458fe,
> order in which resources are destroyed was changed for fixing
> a seg fault. Due to this change, CQ will never get destroyed as
> CQ should be destroyed after QP destruction. Seg fault is caused
> improper cleanup when connection fails. Fixing cleanup after
> connection failure and order in which resources are destroyed
> in qemu_rdma_cleanup() routine.
>
> Signed-off-by: Meghana Cheripady <meghana.cheripady@emulex.com>
> Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>

Applied, thanks.
diff mbox

Patch

diff --git a/migration/rdma.c b/migration/rdma.c
index e6c3a67..77e3444 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2194,6 +2194,10 @@  static void qemu_rdma_cleanup(RDMAContext *rdma)
         }
     }
 
+    if (rdma->qp) {
+        rdma_destroy_qp(rdma->cm_id);
+        rdma->qp = NULL;
+    }
     if (rdma->cq) {
         ibv_destroy_cq(rdma->cq);
         rdma->cq = NULL;
@@ -2206,18 +2210,14 @@  static void qemu_rdma_cleanup(RDMAContext *rdma)
         ibv_dealloc_pd(rdma->pd);
         rdma->pd = NULL;
     }
-    if (rdma->listen_id) {
-        rdma_destroy_id(rdma->listen_id);
-        rdma->listen_id = NULL;
-    }
     if (rdma->cm_id) {
-        if (rdma->qp) {
-            rdma_destroy_qp(rdma->cm_id);
-            rdma->qp = NULL;
-        }
         rdma_destroy_id(rdma->cm_id);
         rdma->cm_id = NULL;
     }
+    if (rdma->listen_id) {
+        rdma_destroy_id(rdma->listen_id);
+        rdma->listen_id = NULL;
+    }
     if (rdma->channel) {
         rdma_destroy_event_channel(rdma->channel);
         rdma->channel = NULL;
@@ -2309,8 +2309,6 @@  static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
     if (ret) {
         perror("rdma_connect");
         ERROR(errp, "connecting to destination!");
-        rdma_destroy_id(rdma->cm_id);
-        rdma->cm_id = NULL;
         goto err_rdma_source_connect;
     }
 
@@ -2319,8 +2317,6 @@  static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
         perror("rdma_get_cm_event after rdma_connect");
         ERROR(errp, "connecting to destination!");
         rdma_ack_cm_event(cm_event);
-        rdma_destroy_id(rdma->cm_id);
-        rdma->cm_id = NULL;
         goto err_rdma_source_connect;
     }
 
@@ -2328,8 +2324,6 @@  static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
         perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
         ERROR(errp, "connecting to destination!");
         rdma_ack_cm_event(cm_event);
-        rdma_destroy_id(rdma->cm_id);
-        rdma->cm_id = NULL;
         goto err_rdma_source_connect;
     }
     rdma->connected = true;