diff mbox

iscsi: fix deadlock during login

Message ID 50A50150.8010201@dlhnet.de
State New
Headers show

Commit Message

Peter Lieven Nov. 15, 2012, 2:50 p.m. UTC
If the connection is interrupted before the first login is successfully
completed qemu-kvm is waiting forever in qemu_aio_wait().

This is fixed by performing an sync login to the target. If the
connection breaks after the first successful login errors are
handled internally by libiscsi.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
  block/iscsi.c |   56 
+++++++++++++++++++++-----------------------------------
  1 file changed, 21 insertions(+), 35 deletions(-)

  {
      QemuOptsList *list;
@@ -934,7 +910,8 @@ static int iscsi_open(BlockDriverState *bs, const 
char *filename, int flags)
      IscsiLun *iscsilun = bs->opaque;
      struct iscsi_context *iscsi = NULL;
      struct iscsi_url *iscsi_url = NULL;
-    struct IscsiTask task;
+    struct IscsiTask itask;
+    struct scsi_task *task;
      char *initiator_name = NULL;
      int ret;

@@ -997,27 +974,36 @@ static int iscsi_open(BlockDriverState *bs, const 
char *filename, int flags)
      /* check if we got HEADER_DIGEST via the options */
      parse_header_digest(iscsi, iscsi_url->target);

-    task.iscsilun = iscsilun;
-    task.status = 0;
-    task.complete = 0;
-    task.bs = bs;
+    if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, 
iscsi_url->lun) != 0) {
+        error_report("iSCSI: Failed to connect to LUN : %s",
+            iscsi_get_error(iscsi));
+        ret = -EINVAL;
+        goto out;
+    }
+
+    itask.iscsilun = iscsilun;
+    itask.status = 0;
+    itask.complete = 0;
+    itask.bs = bs;

      iscsilun->iscsi = iscsi;
      iscsilun->lun   = iscsi_url->lun;

-    if (iscsi_full_connect_async(iscsi, iscsi_url->portal, iscsi_url->lun,
-                                 iscsi_connect_cb, &task)
-        != 0) {
-        error_report("iSCSI: Failed to start async connect.");
+    task = iscsi_inquiry_task(iscsi, iscsilun->lun,
+                              0, 0, 36,
+                              iscsi_inquiry_cb, &itask);
+    if (task == NULL) {
+        error_report("iSCSI: failed to send inquiry command.");
          ret = -EINVAL;
          goto out;
      }

-    while (!task.complete) {
+    while (!itask.complete) {
          iscsi_set_events(iscsilun);
          qemu_aio_wait();
      }
-    if (task.status != 0) {
+
+    if (itask.status != 0) {
          error_report("iSCSI: Failed to connect to LUN : %s",
                       iscsi_get_error(iscsi));
          ret = -EINVAL;

Comments

ronnie sahlberg Nov. 15, 2012, 2:57 p.m. UTC | #1
I dont know if we should switch to use synchronous code here.
It is much nicer if all code is async.

Is it possible to add a timeout instead that would break out if the
connect/login has not completed within a certain amount of time?


regards
ronnie sahlberg

On Thu, Nov 15, 2012 at 6:50 AM, Peter Lieven <pl@dlhnet.de> wrote:
> If the connection is interrupted before the first login is successfully
> completed qemu-kvm is waiting forever in qemu_aio_wait().
>
> This is fixed by performing an sync login to the target. If the
> connection breaks after the first successful login errors are
> handled internally by libiscsi.
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  block/iscsi.c |   56
> +++++++++++++++++++++-----------------------------------
>  1 file changed, 21 insertions(+), 35 deletions(-)
>
> diff --git a/block/iscsi.c b/block/iscsi.c
> index b5c3161..f44bb57 100644
> --- a/block/iscsi.c
> +++ b/block/iscsi.c
> @@ -798,30 +798,6 @@ iscsi_inquiry_cb(struct iscsi_context *iscsi, int
> status, void *command_data,
>      }
>  }
>
> -static void
> -iscsi_connect_cb(struct iscsi_context *iscsi, int status, void
> *command_data,
> -                 void *opaque)
> -{
> -    struct IscsiTask *itask = opaque;
> -    struct scsi_task *task;
> -
> -    if (status != 0) {
> -        itask->status   = 1;
> -        itask->complete = 1;
> -        return;
> -    }
> -
> -    task = iscsi_inquiry_task(iscsi, itask->iscsilun->lun,
> -                              0, 0, 36,
> -                              iscsi_inquiry_cb, opaque);
> -    if (task == NULL) {
> -        error_report("iSCSI: failed to send inquiry command.");
> -        itask->status   = 1;
> -        itask->complete = 1;
> -        return;
> -    }
> -}
> -
>  static int parse_chap(struct iscsi_context *iscsi, const char *target)
>  {
>      QemuOptsList *list;
> @@ -934,7 +910,8 @@ static int iscsi_open(BlockDriverState *bs, const char
> *filename, int flags)
>      IscsiLun *iscsilun = bs->opaque;
>      struct iscsi_context *iscsi = NULL;
>      struct iscsi_url *iscsi_url = NULL;
> -    struct IscsiTask task;
> +    struct IscsiTask itask;
> +    struct scsi_task *task;
>      char *initiator_name = NULL;
>      int ret;
>
> @@ -997,27 +974,36 @@ static int iscsi_open(BlockDriverState *bs, const char
> *filename, int flags)
>      /* check if we got HEADER_DIGEST via the options */
>      parse_header_digest(iscsi, iscsi_url->target);
>
> -    task.iscsilun = iscsilun;
> -    task.status = 0;
> -    task.complete = 0;
> -    task.bs = bs;
> +    if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun)
> != 0) {
> +        error_report("iSCSI: Failed to connect to LUN : %s",
> +            iscsi_get_error(iscsi));
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
> +    itask.iscsilun = iscsilun;
> +    itask.status = 0;
> +    itask.complete = 0;
> +    itask.bs = bs;
>
>      iscsilun->iscsi = iscsi;
>      iscsilun->lun   = iscsi_url->lun;
>
> -    if (iscsi_full_connect_async(iscsi, iscsi_url->portal, iscsi_url->lun,
> -                                 iscsi_connect_cb, &task)
> -        != 0) {
> -        error_report("iSCSI: Failed to start async connect.");
> +    task = iscsi_inquiry_task(iscsi, iscsilun->lun,
> +                              0, 0, 36,
> +                              iscsi_inquiry_cb, &itask);
> +    if (task == NULL) {
> +        error_report("iSCSI: failed to send inquiry command.");
>          ret = -EINVAL;
>          goto out;
>      }
>
> -    while (!task.complete) {
> +    while (!itask.complete) {
>          iscsi_set_events(iscsilun);
>          qemu_aio_wait();
>      }
> -    if (task.status != 0) {
> +
> +    if (itask.status != 0) {
>          error_report("iSCSI: Failed to connect to LUN : %s",
>                       iscsi_get_error(iscsi));
>          ret = -EINVAL;
> --
> 1.7.9.5
Peter Lieven Nov. 15, 2012, 3:05 p.m. UTC | #2
Am 15.11.2012 um 15:57 schrieb ronnie sahlberg <ronniesahlberg@gmail.com>:

> I dont know if we should switch to use synchronous code here.
> It is much nicer if all code is async.

Of course, but its just the initial login after which qemu should exit if it fails.

> 
> Is it possible to add a timeout instead that would break out if the
> connect/login has not completed within a certain amount of time?

I think it is, but this has to be done by qemu or not?

I found switching to synchronous code fixed 2 issues at once, the issue
that it hangs if the connection is interrupted during login and another
issue that i submitted a patch for earlier:

https://github.com/sahlberg/libiscsi/commit/a9257d52a7577b607e237e209b9868c5ce78a927

it might be that this fix introduced the new issue.

Peter


> 
> 
> regards
> ronnie sahlberg
> 
> On Thu, Nov 15, 2012 at 6:50 AM, Peter Lieven <pl@dlhnet.de> wrote:
>> If the connection is interrupted before the first login is successfully
>> completed qemu-kvm is waiting forever in qemu_aio_wait().
>> 
>> This is fixed by performing an sync login to the target. If the
>> connection breaks after the first successful login errors are
>> handled internally by libiscsi.
>> 
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>> block/iscsi.c |   56
>> +++++++++++++++++++++-----------------------------------
>> 1 file changed, 21 insertions(+), 35 deletions(-)
>> 
>> diff --git a/block/iscsi.c b/block/iscsi.c
>> index b5c3161..f44bb57 100644
>> --- a/block/iscsi.c
>> +++ b/block/iscsi.c
>> @@ -798,30 +798,6 @@ iscsi_inquiry_cb(struct iscsi_context *iscsi, int
>> status, void *command_data,
>>     }
>> }
>> 
>> -static void
>> -iscsi_connect_cb(struct iscsi_context *iscsi, int status, void
>> *command_data,
>> -                 void *opaque)
>> -{
>> -    struct IscsiTask *itask = opaque;
>> -    struct scsi_task *task;
>> -
>> -    if (status != 0) {
>> -        itask->status   = 1;
>> -        itask->complete = 1;
>> -        return;
>> -    }
>> -
>> -    task = iscsi_inquiry_task(iscsi, itask->iscsilun->lun,
>> -                              0, 0, 36,
>> -                              iscsi_inquiry_cb, opaque);
>> -    if (task == NULL) {
>> -        error_report("iSCSI: failed to send inquiry command.");
>> -        itask->status   = 1;
>> -        itask->complete = 1;
>> -        return;
>> -    }
>> -}
>> -
>> static int parse_chap(struct iscsi_context *iscsi, const char *target)
>> {
>>     QemuOptsList *list;
>> @@ -934,7 +910,8 @@ static int iscsi_open(BlockDriverState *bs, const char
>> *filename, int flags)
>>     IscsiLun *iscsilun = bs->opaque;
>>     struct iscsi_context *iscsi = NULL;
>>     struct iscsi_url *iscsi_url = NULL;
>> -    struct IscsiTask task;
>> +    struct IscsiTask itask;
>> +    struct scsi_task *task;
>>     char *initiator_name = NULL;
>>     int ret;
>> 
>> @@ -997,27 +974,36 @@ static int iscsi_open(BlockDriverState *bs, const char
>> *filename, int flags)
>>     /* check if we got HEADER_DIGEST via the options */
>>     parse_header_digest(iscsi, iscsi_url->target);
>> 
>> -    task.iscsilun = iscsilun;
>> -    task.status = 0;
>> -    task.complete = 0;
>> -    task.bs = bs;
>> +    if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun)
>> != 0) {
>> +        error_report("iSCSI: Failed to connect to LUN : %s",
>> +            iscsi_get_error(iscsi));
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>> +    itask.iscsilun = iscsilun;
>> +    itask.status = 0;
>> +    itask.complete = 0;
>> +    itask.bs = bs;
>> 
>>     iscsilun->iscsi = iscsi;
>>     iscsilun->lun   = iscsi_url->lun;
>> 
>> -    if (iscsi_full_connect_async(iscsi, iscsi_url->portal, iscsi_url->lun,
>> -                                 iscsi_connect_cb, &task)
>> -        != 0) {
>> -        error_report("iSCSI: Failed to start async connect.");
>> +    task = iscsi_inquiry_task(iscsi, iscsilun->lun,
>> +                              0, 0, 36,
>> +                              iscsi_inquiry_cb, &itask);
>> +    if (task == NULL) {
>> +        error_report("iSCSI: failed to send inquiry command.");
>>         ret = -EINVAL;
>>         goto out;
>>     }
>> 
>> -    while (!task.complete) {
>> +    while (!itask.complete) {
>>         iscsi_set_events(iscsilun);
>>         qemu_aio_wait();
>>     }
>> -    if (task.status != 0) {
>> +
>> +    if (itask.status != 0) {
>>         error_report("iSCSI: Failed to connect to LUN : %s",
>>                      iscsi_get_error(iscsi));
>>         ret = -EINVAL;
>> --
>> 1.7.9.5
Paolo Bonzini Nov. 15, 2012, 3:54 p.m. UTC | #3
Il 15/11/2012 15:57, ronnie sahlberg ha scritto:
> I dont know if we should switch to use synchronous code here.
> It is much nicer if all code is async.

bdrv_open is generally synchronous, so I think Peter's patch is ok.

Paolo

> Is it possible to add a timeout instead that would break out if the
> connect/login has not completed within a certain amount of time?
Peter Lieven Nov. 15, 2012, 4:11 p.m. UTC | #4
Am 15.11.2012 um 16:54 schrieb Paolo Bonzini <pbonzini@redhat.com>:

> Il 15/11/2012 15:57, ronnie sahlberg ha scritto:
>> I dont know if we should switch to use synchronous code here.
>> It is much nicer if all code is async.
> 
> bdrv_open is generally synchronous, so I think Peter's patch is ok.

would`t it be better to do the iscsi_inquiry commands and iscsi_readcapacity
also sync?

peter

> 
> Paolo
> 
>> Is it possible to add a timeout instead that would break out if the
>> connect/login has not completed within a certain amount of time?
>
ronnie sahlberg Nov. 15, 2012, 4:13 p.m. UTC | #5
On Thu, Nov 15, 2012 at 7:54 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 15/11/2012 15:57, ronnie sahlberg ha scritto:
>> I dont know if we should switch to use synchronous code here.
>> It is much nicer if all code is async.
>
> bdrv_open is generally synchronous, so I think Peter's patch is ok.

I was thinking about the case where you disconnect/reconnect a device
at runtime. Like swapping the medium in a CDROM.
If bdrv_open() is synchronous and blocks for a long time, would that
not impact the rest of QEMU?

Otherwise:
Acked-by:  ronniesahlberg@gmail.com

>
> Paolo
>
>> Is it possible to add a timeout instead that would break out if the
>> connect/login has not completed within a certain amount of time?
>
Paolo Bonzini Nov. 15, 2012, 4:37 p.m. UTC | #6
Il 15/11/2012 17:13, ronnie sahlberg ha scritto:
> On Thu, Nov 15, 2012 at 7:54 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> Il 15/11/2012 15:57, ronnie sahlberg ha scritto:
>>> I dont know if we should switch to use synchronous code here.
>>> It is much nicer if all code is async.
>>
>> bdrv_open is generally synchronous, so I think Peter's patch is ok.
> 
> I was thinking about the case where you disconnect/reconnect a device
> at runtime. Like swapping the medium in a CDROM.
> If bdrv_open() is synchronous and blocks for a long time, would that
> not impact the rest of QEMU?

Yes, it's not optimal, but VCPUs would still run until they request I/O.
 But usually iscsi devices should be non-removable, no?  That leaves
hotplug as the only problematic case.

Paolo

> 
> Otherwise:
> Acked-by:  ronniesahlberg@gmail.com
> 
>>
>> Paolo
>>
>>> Is it possible to add a timeout instead that would break out if the
>>> connect/login has not completed within a certain amount of time?
>>
Peter Lieven Nov. 15, 2012, 6:28 p.m. UTC | #7
Paolo Bonzini wrote:
> Il 15/11/2012 15:57, ronnie sahlberg ha scritto:
>> I dont know if we should switch to use synchronous code here.
>> It is much nicer if all code is async.
>
> bdrv_open is generally synchronous, so I think Peter's patch is ok.

if all is sync wouldn't it be best to have all code in iscsi_open sync
then and convert the iscsi_inquiry and iscsi_readcapacity commands also to
sync?

Peter

>
> Paolo
>
>> Is it possible to add a timeout instead that would break out if the
>> connect/login has not completed within a certain amount of time?
>
>
>
Paolo Bonzini Nov. 16, 2012, 7:44 a.m. UTC | #8
Il 15/11/2012 19:28, Peter Lieven ha scritto:
>>> >> I dont know if we should switch to use synchronous code here.
>>> >> It is much nicer if all code is async.
>> >
>> > bdrv_open is generally synchronous, so I think Peter's patch is ok.
> if all is sync wouldn't it be best to have all code in iscsi_open sync
> then and convert the iscsi_inquiry and iscsi_readcapacity commands also to
> sync?

Indeed, there is no real advantage in using qemu_aio_wait().  I didn't
know libiscsi also had sync APIs. :)

Paolo
Peter Lieven Nov. 16, 2012, 9:26 a.m. UTC | #9
Am 16.11.2012 08:44, schrieb Paolo Bonzini:
> Il 15/11/2012 19:28, Peter Lieven ha scritto:
>>>>>> I dont know if we should switch to use synchronous code here.
>>>>>> It is much nicer if all code is async.
>>>> bdrv_open is generally synchronous, so I think Peter's patch is ok.
>> if all is sync wouldn't it be best to have all code in iscsi_open sync
>> then and convert the iscsi_inquiry and iscsi_readcapacity commands also to
>> sync?
> Indeed, there is no real advantage in using qemu_aio_wait().  I didn't
> know libiscsi also had sync APIs. :)

ok, give me some days to rewrite this patch. i think it should be ready 
by monday.
i will also add the missing iscsi_create which is a few lines only then.

peter

>
> Paolo
Kevin Wolf Nov. 16, 2012, 10:38 a.m. UTC | #10
Am 15.11.2012 17:37, schrieb Paolo Bonzini:
> Il 15/11/2012 17:13, ronnie sahlberg ha scritto:
>> On Thu, Nov 15, 2012 at 7:54 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> Il 15/11/2012 15:57, ronnie sahlberg ha scritto:
>>>> I dont know if we should switch to use synchronous code here.
>>>> It is much nicer if all code is async.
>>>
>>> bdrv_open is generally synchronous, so I think Peter's patch is ok.
>>
>> I was thinking about the case where you disconnect/reconnect a device
>> at runtime. Like swapping the medium in a CDROM.
>> If bdrv_open() is synchronous and blocks for a long time, would that
>> not impact the rest of QEMU?
> 
> Yes, it's not optimal, but VCPUs would still run until they request I/O.
>  But usually iscsi devices should be non-removable, no?  That leaves
> hotplug as the only problematic case.

I guess we need a bdrv_co_open() for the long term.

Kevin
Peter Lieven Nov. 16, 2012, 5:38 p.m. UTC | #11
Am 16.11.2012 11:38, schrieb Kevin Wolf:
> Am 15.11.2012 17:37, schrieb Paolo Bonzini:
>> Il 15/11/2012 17:13, ronnie sahlberg ha scritto:
>>> On Thu, Nov 15, 2012 at 7:54 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>> Il 15/11/2012 15:57, ronnie sahlberg ha scritto:
>>>>> I dont know if we should switch to use synchronous code here.
>>>>> It is much nicer if all code is async.
>>>> bdrv_open is generally synchronous, so I think Peter's patch is ok.
>>> I was thinking about the case where you disconnect/reconnect a device
>>> at runtime. Like swapping the medium in a CDROM.
>>> If bdrv_open() is synchronous and blocks for a long time, would that
>>> not impact the rest of QEMU?
>> Yes, it's not optimal, but VCPUs would still run until they request I/O.
>>   But usually iscsi devices should be non-removable, no?  That leaves
>> hotplug as the only problematic case.
> I guess we need a bdrv_co_open() for the long term.
but for now its save to implement iscsi_open (and iscsi_create) completely
sync?

Peter
>
> Kevin
Paolo Bonzini Nov. 16, 2012, 5:55 p.m. UTC | #12
Il 16/11/2012 18:38, Peter Lieven ha scritto:
> Am 16.11.2012 11:38, schrieb Kevin Wolf:
>> Am 15.11.2012 17:37, schrieb Paolo Bonzini:
>>> Il 15/11/2012 17:13, ronnie sahlberg ha scritto:
>>>> On Thu, Nov 15, 2012 at 7:54 AM, Paolo Bonzini <pbonzini@redhat.com>
>>>> wrote:
>>>>> Il 15/11/2012 15:57, ronnie sahlberg ha scritto:
>>>>>> I dont know if we should switch to use synchronous code here.
>>>>>> It is much nicer if all code is async.
>>>>> bdrv_open is generally synchronous, so I think Peter's patch is ok.
>>>> I was thinking about the case where you disconnect/reconnect a device
>>>> at runtime. Like swapping the medium in a CDROM.
>>>> If bdrv_open() is synchronous and blocks for a long time, would that
>>>> not impact the rest of QEMU?
>>> Yes, it's not optimal, but VCPUs would still run until they request I/O.
>>>   But usually iscsi devices should be non-removable, no?  That leaves
>>> hotplug as the only problematic case.
>> I guess we need a bdrv_co_open() for the long term.
> but for now its save to implement iscsi_open (and iscsi_create) completely
> sync?

Yes.

Paolo
diff mbox

Patch

diff --git a/block/iscsi.c b/block/iscsi.c
index b5c3161..f44bb57 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -798,30 +798,6 @@  iscsi_inquiry_cb(struct iscsi_context *iscsi, int 
status, void *command_data,
      }
  }

-static void
-iscsi_connect_cb(struct iscsi_context *iscsi, int status, void 
*command_data,
-                 void *opaque)
-{
-    struct IscsiTask *itask = opaque;
-    struct scsi_task *task;
-
-    if (status != 0) {
-        itask->status   = 1;
-        itask->complete = 1;
-        return;
-    }
-
-    task = iscsi_inquiry_task(iscsi, itask->iscsilun->lun,
-                              0, 0, 36,
-                              iscsi_inquiry_cb, opaque);
-    if (task == NULL) {
-        error_report("iSCSI: failed to send inquiry command.");
-        itask->status   = 1;
-        itask->complete = 1;
-        return;
-    }
-}
-
  static int parse_chap(struct iscsi_context *iscsi, const char *target)