Patchwork This patch adds a new block driver : iSCSI

login
register
mail settings
Submitter ronniesahlberg@gmail.com
Date Sept. 10, 2011, 4:23 a.m.
Message ID <1315628610-28222-2-git-send-email-ronniesahlberg@gmail.com>
Download mbox | patch
Permalink /patch/114148/
State New
Headers show

Comments

ronniesahlberg@gmail.com - Sept. 10, 2011, 4:23 a.m.
This provides built-in support for iSCSI to QEMU.
This has the advantage that the iSCSI devices need not be made visible to the host, which is useful if you have very many virtual machines and very many iscsi devices.
It also has the benefit that non-root users of QEMU can access iSCSI devices across the network without requiring root privilege on the host.

This driver interfaces with the multiplatform posix library
for iscsi initiator/client access to iscsi devices hosted at
git://github.com/sahlberg/libiscsi.git

The patch adds the driver to interface with the iscsi library.
It also updates the configure script to
* by default, probe whether libiscsi is available and, if so, build
  qemu against libiscsi.
* --enable-libiscsi
  Force a build against libiscsi. If libiscsi is not available
  the build will fail.
* --disable-libiscsi
  Do not link against libiscsi, even if it is available.

When linked with libiscsi, qemu gains support to access iscsi resources
such as disks and cdrom directly, without having to make the devices visible
to the host.

You can specify devices using an iSCSI URL of the form:
iscsi://[<username>[:<password>]@]<host>[:<port>]/<target-iqn-name>/<lun>
When using authentication, the password can optionally be set with
LIBISCSI_CHAP_PASSWORD="password" to avoid it showing up in the process list

Example:
./x86_64-softmmu/qemu-system-x86_64 -m 1024 -cdrom iscsi://127.0.0.1/iqn.ronnie.test/2 --drive file=iscsi://127.0.0.1/iqn.ronnie.test/1 -boot d -enable-kvm

Signed-off-by: Ronnie Sahlberg <ronniesahlberg@gmail.com>
---
 Makefile.objs |    1 +
 block/iscsi.c |  594 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 configure     |   31 +++
 trace-events  |    7 +
 4 files changed, 633 insertions(+), 0 deletions(-)
 create mode 100644 block/iscsi.c
Stefan Hajnoczi - Sept. 12, 2011, 9:14 a.m.
On Sat, Sep 10, 2011 at 02:23:30PM +1000, Ronnie Sahlberg wrote:

Looking good.  I think this is worth merging because it does offer
benefits over host iSCSI.

> +static void
> +iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
> +                    void *private_data)
> +{
> +}
> +
> +static void
> +iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
> +{
> +    IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
> +    IscsiLun *iscsilun = acb->iscsilun;
> +
> +    acb->status = -ECANCELED;
> +    acb->common.cb(acb->common.opaque, acb->status);
> +    acb->canceled = 1;
> +
> +    iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
> +                                     iscsi_abort_task_cb, NULL);
> +}

The asynchronous abort task call looks odd.  If a caller allocates a
buffer and issues a read request, then we need to make sure that the
request is really aborted by the time .bdrv_aio_cancel() returns.

If I understand the code correctly, iscsi_aio_cancel() returns
immediately but the read request will still be in progress.  That means
the caller could now free their data buffer and the read request will
overwrite that unallocated memory.

> +static void
> +iscsi_aio_write10_cb(struct iscsi_context *iscsi, int status,
> +                     void *command_data, void *opaque)
> +{
> +    IscsiAIOCB *acb = opaque;
> +
> +    trace_iscsi_aio_write10_cb(iscsi, status, acb, acb->canceled);
> +
> +    if (acb->buf != NULL) {
> +        free(acb->buf);
> +    }

Please just use g_free(acb->buf).  g_free(NULL) is defined as a nop so
the check isn't necessary.  Also, this code uses free(3) when it should
use g_free(3).

> +
> +    if (acb->canceled != 0) {
> +        qemu_aio_release(acb);
> +        scsi_free_scsi_task(acb->task);
> +        acb->task = NULL;
> +        return;
> +    }
> +
> +    acb->status = 0;
> +    if (status < 0) {
> +        error_report("Failed to write10 data to iSCSI lun. %s",
> +                     iscsi_get_error(iscsi));
> +        acb->status = -EIO;
> +    }
> +
> +    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
> +    scsi_free_scsi_task(acb->task);
> +    acb->task = NULL;
> +}
> +
> +static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
> +{
> +    return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
> +}
> +
> +static BlockDriverAIOCB *
> +iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
> +                 QEMUIOVector *qiov, int nb_sectors,
> +                 BlockDriverCompletionFunc *cb,
> +                 void *opaque)
> +{
> +    IscsiLun *iscsilun = bs->opaque;
> +    struct iscsi_context *iscsi = iscsilun->iscsi;
> +    IscsiAIOCB *acb;
> +    size_t size;
> +    int fua = 0;
> +
> +    /* set FUA on writes when cache mode is write through */
> +    if (!(bs->open_flags & (BDRV_O_CACHE_WB|BDRV_O_NOCACHE))) {
> +        fua = 1;
> +    }

FUA needs to reflect the guest semantics - does this disk have an
enabled write cache?  When bs->open_flags has BDRV_O_CACHE_WB, then the
guest knows it needs to send flushes because there is a write cache:

if (!(bs->open_flags & BDRV_O_CACHE_WB)) {
    fua = 1;
}

BDRV_O_NOCACHE is just for local files and sets the O_DIRECT hint.  It
doesn't affect the cache semantics that the guest sees.

> +/*
> + * We support iscsi url's on the form
> + * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
> + */
> +static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
> +{
> +    IscsiLun *iscsilun = bs->opaque;
> +    struct iscsi_context *iscsi = NULL;
> +    struct iscsi_url *iscsi_url = NULL;
> +    struct IscsiTask task;
> +    int ret;
> +
> +    if ((BDRV_SECTOR_SIZE % 512) != 0) {
> +        error_report("iSCSI: Invalid BDRV_SECTOR_SIZE. "
> +                     "BDRV_SECTOR_SIZE(%lld) is not a multiple "
> +                     "of 512", BDRV_SECTOR_SIZE);
> +        return -EINVAL;
> +    }

Another way of saying this is:
QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE % 512 != 0);

The advantage is that the build fails instead of waiting until iscsi is
used at runtime until the failure is detected.

What will happen if BDRV_SECTOR_SIZE is not a multiple of 512?

> +
> +    memset(iscsilun, 0, sizeof(IscsiLun));
> +
> +    /* Should really append the KVM name after the ':' here */
> +    iscsi = iscsi_create_context("iqn.2008-11.org.linux-kvm:");
> +    if (iscsi == NULL) {
> +        error_report("iSCSI: Failed to create iSCSI context.");
> +        ret = -ENOMEM;
> +        goto failed;
> +    }
> +
> +    iscsi_url = iscsi_parse_full_url(iscsi, filename);
> +    if (iscsi_url == NULL) {
> +        error_report("Failed to parse URL : %s %s", filename,
> +                     iscsi_get_error(iscsi));
> +        ret = -ENOMEM;

-EINVAL?

> +static BlockDriver bdrv_iscsi = {
> +    .format_name     = "iscsi",
> +    .protocol_name   = "iscsi",
> +
> +    .instance_size   = sizeof(IscsiLun),
> +    .bdrv_file_open  = iscsi_open,
> +    .bdrv_close      = iscsi_close,
> +
> +    .bdrv_getlength  = iscsi_getlength,
> +
> +    .bdrv_aio_readv  = iscsi_aio_readv,
> +    .bdrv_aio_writev = iscsi_aio_writev,
> +    .bdrv_aio_flush  = iscsi_aio_flush,

block.c does not emulate .bdrv_flush() using your .bdrv_aio_flush()
implementation.  I have sent a patch to add this emulation.  This will
turn bdrv_flush(iscsi_bs) into an actual operation instead of a nop :).

> diff --git a/trace-events b/trace-events
> index 3fdd60f..4e461e5 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -494,3 +494,10 @@ escc_sunkbd_event_in(int ch) "Untranslated keycode %2.2x"
>  escc_sunkbd_event_out(int ch) "Translated keycode %2.2x"
>  escc_kbd_command(int val) "Command %d"
>  escc_sunmouse_event(int dx, int dy, int buttons_state) "dx=%d dy=%d buttons=%01x"
> +
> +# block/iscsi.c
> +disable iscsi_aio_write10_cb(void *iscsi, int status, void *acb, int canceled) "iscsi %p status %d acb %p canceled %d"
> +disable iscsi_aio_writev(void *iscsi, int64_t sector_num, int nb_sectors, void *opaque, void *acb) "iscsi %p sector_num %"PRId64" nb_sectors %d opaque %p acb %p"
> +disable iscsi_aio_read10_cb(void *iscsi, int status, void *acb, int canceled) "iscsi %p status %d acb %p canceled %d"
> +disable iscsi_aio_readv(void *iscsi, int64_t sector_num, int nb_sectors, void *opaque, void *acb) "iscsi %p sector_num %"PRId64" nb_sectors %d opaque %p acb %p"

It is no longer necessary to prefix trace event definitions with
"disable".

The stderr and simple backends now start up with all events disabled by
default.  The set of events can be set using the -trace
events=<events-file> option or on the QEMU monitor using set trace-event
<name> on.

Stefan
Christoph Hellwig - Sept. 14, 2011, 2:36 p.m.
On Mon, Sep 12, 2011 at 10:14:08AM +0100, Stefan Hajnoczi wrote:
> > +static void
> > +iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
> > +{
> > +    IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
> > +    IscsiLun *iscsilun = acb->iscsilun;
> > +
> > +    acb->status = -ECANCELED;
> > +    acb->common.cb(acb->common.opaque, acb->status);
> > +    acb->canceled = 1;
> > +
> > +    iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
> > +                                     iscsi_abort_task_cb, NULL);
> > +}
> 
> The asynchronous abort task call looks odd.  If a caller allocates a
> buffer and issues a read request, then we need to make sure that the
> request is really aborted by the time .bdrv_aio_cancel() returns.

Shouldn't new drivers use coroutines instead of aio instead?

(just poking around, I don't fully understand the new scheme myself yet
 either)

> BDRV_O_NOCACHE is just for local files and sets the O_DIRECT hint.  It
> doesn't affect the cache semantics that the guest sees.

Now that I fixed it it doesn't.  But that's been a fairly recent change,
which isn't always easy to pick up people having external patches.

> 
> > +/*
> > + * We support iscsi url's on the form
> > + * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
> > + */

Is having username + password on the command line really a that good idea?
Also what about the more complicated iSCSI authentification schemes?

> What will happen if BDRV_SECTOR_SIZE is not a multiple of 512?

hell will break lose for all of qemu.
Stefan Hajnoczi - Sept. 14, 2011, 3:50 p.m.
On Wed, Sep 14, 2011 at 3:36 PM, Christoph Hellwig <hch@lst.de> wrote:
> On Mon, Sep 12, 2011 at 10:14:08AM +0100, Stefan Hajnoczi wrote:
>> > +static void
>> > +iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
>> > +{
>> > +    IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
>> > +    IscsiLun *iscsilun = acb->iscsilun;
>> > +
>> > +    acb->status = -ECANCELED;
>> > +    acb->common.cb(acb->common.opaque, acb->status);
>> > +    acb->canceled = 1;
>> > +
>> > +    iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
>> > +                                     iscsi_abort_task_cb, NULL);
>> > +}
>>
>> The asynchronous abort task call looks odd.  If a caller allocates a
>> buffer and issues a read request, then we need to make sure that the
>> request is really aborted by the time .bdrv_aio_cancel() returns.
>
> Shouldn't new drivers use coroutines instead of aio instead?
>
> (just poking around, I don't fully understand the new scheme myself yet
>  either)

I think in this case it will not make the code nicer.  Since the
external iSCSI library is based on callbacks it would be necessary to
write the coroutines<->callbacks adapter functions.  So for example,
the READ10 command would need a function that can be called in
coroutine context and yields while libiscsi does the I/O.  When the
callback is invoked it will re-enter the coroutine.

The area where coroutines are useful in the block layer is for image
formats.  We already have common coroutines<->callback adapter
functions in block.c so it's possible to write sequential code for
image formats.  They only need access to block layer functions which
have already been adapted.  But as soon as you interact with a
callback-based API from the coroutine, then you need to write an
adapter yourself.

Stefan
ronniesahlberg@gmail.com - Sept. 14, 2011, 10:51 p.m.
On Thu, Sep 15, 2011 at 12:36 AM, Christoph Hellwig <hch@lst.de> wrote:
...
>> > +/*
>> > + * We support iscsi url's on the form
>> > + * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
>> > + */
>
> Is having username + password on the command line really a that good idea?
> Also what about the more complicated iSCSI authentification schemes?

In general it is a very bad idea. For local use on a private box it is
convenient to be able to use "<username>%<password>@" syntax.
For use on a shared box, libiscsi supports an alternative method too
by setting the username and/or password via environment variables :
LIBISCSI_CHAP_USERNAME=...  LIBISCSI_CHAP_PASSWORD=...

I will document this better in the next patch.


Libiscsi only support CHAP at this stage. Which other authentication
schemes do you have in mind? Perhaps I can add them.


regards
ronnie sahlberg
ronniesahlberg@gmail.com - Sept. 14, 2011, 11:08 p.m.
Stefan,

Thanks for your review.
I will do the changes you recommend and send an updated patch over the weekend.

Could you advise on the best path for handling the .bdrv_flush and the
blocksize questions?


I think it is reasonable to just not support iscsi at all for
blocksize that is not multiple of 512 bytes
since a read-modify-write cycle across a network would probably be
prohibitively expensive.

.bdrv_flush() I can easily add a synchronous implementation of this
unless your patch is expected to be merged
in the near future.


regards
ronnie sahlberg

On Mon, Sep 12, 2011 at 7:14 PM, Stefan Hajnoczi
<stefanha@linux.vnet.ibm.com> wrote:
> On Sat, Sep 10, 2011 at 02:23:30PM +1000, Ronnie Sahlberg wrote:
>
> Looking good.  I think this is worth merging because it does offer
> benefits over host iSCSI.
>
>> +static void
>> +iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
>> +                    void *private_data)
>> +{
>> +}
>> +
>> +static void
>> +iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
>> +{
>> +    IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
>> +    IscsiLun *iscsilun = acb->iscsilun;
>> +
>> +    acb->status = -ECANCELED;
>> +    acb->common.cb(acb->common.opaque, acb->status);
>> +    acb->canceled = 1;
>> +
>> +    iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
>> +                                     iscsi_abort_task_cb, NULL);
>> +}
>
> The asynchronous abort task call looks odd.  If a caller allocates a
> buffer and issues a read request, then we need to make sure that the
> request is really aborted by the time .bdrv_aio_cancel() returns.
>
> If I understand the code correctly, iscsi_aio_cancel() returns
> immediately but the read request will still be in progress.  That means
> the caller could now free their data buffer and the read request will
> overwrite that unallocated memory.
>

Right.
I will need to remove the read pdu from the local initiator side too
so that if the read is in flight and we receive data for it, that this
data is discarded
and not written into the possibly no longer valid buffers.
I will do this change in the next version of the patch.

>> +static void
>> +iscsi_aio_write10_cb(struct iscsi_context *iscsi, int status,
>> +                     void *command_data, void *opaque)
>> +{
>> +    IscsiAIOCB *acb = opaque;
>> +
>> +    trace_iscsi_aio_write10_cb(iscsi, status, acb, acb->canceled);
>> +
>> +    if (acb->buf != NULL) {
>> +        free(acb->buf);
>> +    }
>
> Please just use g_free(acb->buf).  g_free(NULL) is defined as a nop so
> the check isn't necessary.  Also, this code uses free(3) when it should
> use g_free(3).

Will do.

>
>> +
>> +    if (acb->canceled != 0) {
>> +        qemu_aio_release(acb);
>> +        scsi_free_scsi_task(acb->task);
>> +        acb->task = NULL;
>> +        return;
>> +    }
>> +
>> +    acb->status = 0;
>> +    if (status < 0) {
>> +        error_report("Failed to write10 data to iSCSI lun. %s",
>> +                     iscsi_get_error(iscsi));
>> +        acb->status = -EIO;
>> +    }
>> +
>> +    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
>> +    scsi_free_scsi_task(acb->task);
>> +    acb->task = NULL;
>> +}
>> +
>> +static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
>> +{
>> +    return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
>> +}
>> +
>> +static BlockDriverAIOCB *
>> +iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
>> +                 QEMUIOVector *qiov, int nb_sectors,
>> +                 BlockDriverCompletionFunc *cb,
>> +                 void *opaque)
>> +{
>> +    IscsiLun *iscsilun = bs->opaque;
>> +    struct iscsi_context *iscsi = iscsilun->iscsi;
>> +    IscsiAIOCB *acb;
>> +    size_t size;
>> +    int fua = 0;
>> +
>> +    /* set FUA on writes when cache mode is write through */
>> +    if (!(bs->open_flags & (BDRV_O_CACHE_WB|BDRV_O_NOCACHE))) {
>> +        fua = 1;
>> +    }
>
> FUA needs to reflect the guest semantics - does this disk have an
> enabled write cache?  When bs->open_flags has BDRV_O_CACHE_WB, then the
> guest knows it needs to send flushes because there is a write cache:
>
> if (!(bs->open_flags & BDRV_O_CACHE_WB)) {
>    fua = 1;
> }
>
> BDRV_O_NOCACHE is just for local files and sets the O_DIRECT hint.  It
> doesn't affect the cache semantics that the guest sees.

Thanks.  I'll do this change.

>
>> +/*
>> + * We support iscsi url's on the form
>> + * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
>> + */
>> +static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
>> +{
>> +    IscsiLun *iscsilun = bs->opaque;
>> +    struct iscsi_context *iscsi = NULL;
>> +    struct iscsi_url *iscsi_url = NULL;
>> +    struct IscsiTask task;
>> +    int ret;
>> +
>> +    if ((BDRV_SECTOR_SIZE % 512) != 0) {
>> +        error_report("iSCSI: Invalid BDRV_SECTOR_SIZE. "
>> +                     "BDRV_SECTOR_SIZE(%lld) is not a multiple "
>> +                     "of 512", BDRV_SECTOR_SIZE);
>> +        return -EINVAL;
>> +    }
>
> Another way of saying this is:
> QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE % 512 != 0);
>
> The advantage is that the build fails instead of waiting until iscsi is
> used at runtime until the failure is detected.

I can add this. But is it better to fail the whole build than to
return an error at runtime if/when iscsi is used?

>
> What will happen if BDRV_SECTOR_SIZE is not a multiple of 512?

iscsi will not work.
I dont think it is worth implementing a read-modify-write cycle for iscsi.
Performance would be so poor for this case that I think it is better
to just not support it at all.

>
>> +
>> +    memset(iscsilun, 0, sizeof(IscsiLun));
>> +
>> +    /* Should really append the KVM name after the ':' here */
>> +    iscsi = iscsi_create_context("iqn.2008-11.org.linux-kvm:");
>> +    if (iscsi == NULL) {
>> +        error_report("iSCSI: Failed to create iSCSI context.");
>> +        ret = -ENOMEM;
>> +        goto failed;
>> +    }
>> +
>> +    iscsi_url = iscsi_parse_full_url(iscsi, filename);
>> +    if (iscsi_url == NULL) {
>> +        error_report("Failed to parse URL : %s %s", filename,
>> +                     iscsi_get_error(iscsi));
>> +        ret = -ENOMEM;
>
> -EINVAL?

Will do.

>
>> +static BlockDriver bdrv_iscsi = {
>> +    .format_name     = "iscsi",
>> +    .protocol_name   = "iscsi",
>> +
>> +    .instance_size   = sizeof(IscsiLun),
>> +    .bdrv_file_open  = iscsi_open,
>> +    .bdrv_close      = iscsi_close,
>> +
>> +    .bdrv_getlength  = iscsi_getlength,
>> +
>> +    .bdrv_aio_readv  = iscsi_aio_readv,
>> +    .bdrv_aio_writev = iscsi_aio_writev,
>> +    .bdrv_aio_flush  = iscsi_aio_flush,
>
> block.c does not emulate .bdrv_flush() using your .bdrv_aio_flush()
> implementation.  I have sent a patch to add this emulation.  This will
> turn bdrv_flush(iscsi_bs) into an actual operation instead of a nop :).

Should I add a synchronous .bdrv_flush ? That is very easy for me to add.
Or should I just wait for your patch to be merged ?


>
>> diff --git a/trace-events b/trace-events
>> index 3fdd60f..4e461e5 100644
>> --- a/trace-events
>> +++ b/trace-events
>> @@ -494,3 +494,10 @@ escc_sunkbd_event_in(int ch) "Untranslated keycode %2.2x"
>>  escc_sunkbd_event_out(int ch) "Translated keycode %2.2x"
>>  escc_kbd_command(int val) "Command %d"
>>  escc_sunmouse_event(int dx, int dy, int buttons_state) "dx=%d dy=%d buttons=%01x"
>> +
>> +# block/iscsi.c
>> +disable iscsi_aio_write10_cb(void *iscsi, int status, void *acb, int canceled) "iscsi %p status %d acb %p canceled %d"
>> +disable iscsi_aio_writev(void *iscsi, int64_t sector_num, int nb_sectors, void *opaque, void *acb) "iscsi %p sector_num %"PRId64" nb_sectors %d opaque %p acb %p"
>> +disable iscsi_aio_read10_cb(void *iscsi, int status, void *acb, int canceled) "iscsi %p status %d acb %p canceled %d"
>> +disable iscsi_aio_readv(void *iscsi, int64_t sector_num, int nb_sectors, void *opaque, void *acb) "iscsi %p sector_num %"PRId64" nb_sectors %d opaque %p acb %p"
>
> It is no longer necessary to prefix trace event definitions with
> "disable".

Will fix this.

>
> The stderr and simple backends now start up with all events disabled by
> default.  The set of events can be set using the -trace
> events=<events-file> option or on the QEMU monitor using set trace-event
> <name> on.
>
> Stefan
>
Paolo Bonzini - Sept. 15, 2011, 6:04 a.m.
On 09/15/2011 01:08 AM, ronnie sahlberg wrote:
> I think it is reasonable to just not support iscsi at all for
> blocksize that is not multiple of 512 bytes
> since a read-modify-write cycle across a network would probably be
> prohibitively expensive.

Agreed.

> .bdrv_flush() I can easily add a synchronous implementation of this
> unless your patch is expected to be merged
> in the near future.

We need the same patch for NBD, so I wouldn't bother with the 
synchronous flush.

Paolo
Daniel P. Berrange - Sept. 15, 2011, 8:02 a.m.
On Thu, Sep 15, 2011 at 08:51:00AM +1000, ronnie sahlberg wrote:
> On Thu, Sep 15, 2011 at 12:36 AM, Christoph Hellwig <hch@lst.de> wrote:
> ...
> >> > +/*
> >> > + * We support iscsi url's on the form
> >> > + * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
> >> > + */
> >
> > Is having username + password on the command line really a that good idea?
> > Also what about the more complicated iSCSI authentification schemes?
> 
> In general it is a very bad idea. For local use on a private box it is
> convenient to be able to use "<username>%<password>@" syntax.
> For use on a shared box, libiscsi supports an alternative method too
> by setting the username and/or password via environment variables :
> LIBISCSI_CHAP_USERNAME=...  LIBISCSI_CHAP_PASSWORD=...

Environment variables are only a tiny bit better, since this still allows
the password to leak to any processes which can read /proc/$PID/environ.
It is also undesirable wrt many distro trouble shooting tools (eg Fedora/
RHEL's sosreport) which capture the contents of /proc/$PID/environ as part
of their data collection process. This means your passwords will end up
in attachments to bugzilla / issue tracker tickets.

For block devs with encrypted QCow2 disks (and VNC/SPICE) QEMU requires the
password to be set via the monitor. Since this iscsi: protocol is part of
the block layer, IMHO, the password should be settable the same way via the
monitor

Regards,
Daniel
Dor Laor - Sept. 15, 2011, 8:48 a.m.
On 09/15/2011 09:04 AM, Paolo Bonzini wrote:
> On 09/15/2011 01:08 AM, ronnie sahlberg wrote:
>> I think it is reasonable to just not support iscsi at all for
>> blocksize that is not multiple of 512 bytes
>> since a read-modify-write cycle across a network would probably be
>> prohibitively expensive.
>
> Agreed.
>
>> .bdrv_flush() I can easily add a synchronous implementation of this
>> unless your patch is expected to be merged
>> in the near future.
>
> We need the same patch for NBD, so I wouldn't bother with the
> synchronous flush.
>
> Paolo
>
>

It seems to me that using a qemu external initiator/target pairs like 
Orit's original design in 
http://wiki.qemu.org/Features/LiveBlockMigration#ISCSI_for_non_shared_storage 
would be a simpler (in terms of qemu code..) and flexible solution.

I agree that embedding the iscsi initiation in qemu can simplify the end 
user life but since this scenario is expected to be used by higher level 
software it's not relevant here. The risk is to have to maintain more 
code that won't be as general as the tgtd/lio solutions out there.
Kevin Wolf - Sept. 15, 2011, 9:03 a.m.
Am 15.09.2011 00:51, schrieb ronnie sahlberg:
> On Thu, Sep 15, 2011 at 12:36 AM, Christoph Hellwig <hch@lst.de> wrote:
> ...
>>>> +/*
>>>> + * We support iscsi url's on the form
>>>> + * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
>>>> + */
>>
>> Is having username + password on the command line really a that good idea?
>> Also what about the more complicated iSCSI authentification schemes?
> 
> In general it is a very bad idea. For local use on a private box it is
> convenient to be able to use "<username>%<password>@" syntax.
> For use on a shared box, libiscsi supports an alternative method too
> by setting the username and/or password via environment variables :
> LIBISCSI_CHAP_USERNAME=...  LIBISCSI_CHAP_PASSWORD=...

I wonder if we could make it look like an encrypted image. qemu already
has functionality to deal with passwords for block devices, so it seems
to make sense to reuse that for iscsi passwords.

Kevin
Kevin Wolf - Sept. 15, 2011, 9:10 a.m.
Am 15.09.2011 08:04, schrieb Paolo Bonzini:
> On 09/15/2011 01:08 AM, ronnie sahlberg wrote:
>> I think it is reasonable to just not support iscsi at all for
>> blocksize that is not multiple of 512 bytes
>> since a read-modify-write cycle across a network would probably be
>> prohibitively expensive.
> 
> Agreed.
> 
>> .bdrv_flush() I can easily add a synchronous implementation of this
>> unless your patch is expected to be merged
>> in the near future.
> 
> We need the same patch for NBD, so I wouldn't bother with the 
> synchronous flush.

Doesn't Stefan's patch conflict with your bdrv_co_flush patch?

Long term I think we should get rid of bdrv_flush and bdrv_aio_flush and
make all drivers use bdrv_co_flush. Having three different interfaces
for flush and emulation functions that do mappings for all missing
implementations is just insane.

Kevin
Paolo Bonzini - Sept. 15, 2011, 9:11 a.m.
On 09/15/2011 10:48 AM, Dor Laor wrote:
>> We need the same patch for NBD, so I wouldn't bother with the
>> synchronous flush.
>
> It seems to me that using a qemu external initiator/target pairs like
> Orit's original design in
> http://wiki.qemu.org/Features/LiveBlockMigration#ISCSI_for_non_shared_storage
> would be a simpler (in terms of qemu code..) and flexible solution.

Yes, it would be simpler for QEMU because everything is done outside.

However, iSCSI is a complex protocol and a complex suite of tools.  With 
a simpler set of tools such as qemu-nbd/nbd, you can for example tunnel 
data over the libvirt connection.  Possibly with encryption.  Also, with 
iSCSI you're tied to raw, while qemu-nbd lets you use qcow2.

iSCSI could be a better choice if everything in the QEMU block layer was 
done in terms of SCSI.  However, we're a far cry from that.

> I agree that embedding the iscsi initiation in qemu can simplify the end
> user life but since this scenario is expected to be used by higher level
> software it's not relevant here. The risk is to have to maintain more
> code that won't be as general as the tgtd/lio solutions out there.

I'm not sure I understand.  You say "embedding the iSCSI initiator in 
qemu can simplify the end user life" but "the risk is to have to 
maintain [another iSCSI target]".  I don't think anybody proposed adding 
an iSCSI target to QEMU (in fact tcm_vhost would even let you use lio 
instead of QEMU's SCSI target code!).  Only an iSCSI initiator which is 
not much code because it's just a wrapper for libiscsi.

In general, I'm not sure that libiscsi (instead of the in-kernel iSCSI 
initiator) is by definition a bad choice in high-end set-ups.  If you 
want to do SCSI passthrough, it's likely better to use libiscsi rather 
than using the in-kernel initiator together with scsi-generic 
(scsi-generic sucks; bsg is better but also a useless level of 
indirection IMO).

Perhaps Ronnie has rough performance numbers comparing in-kernel iSCSI 
with libiscsi?

Paolo
Paolo Bonzini - Sept. 15, 2011, 9:39 a.m.
On 09/15/2011 11:10 AM, Kevin Wolf wrote:
>> >  We need the same patch for NBD, so I wouldn't bother with the
>> >  synchronous flush.
> Doesn't Stefan's patch conflict with your bdrv_co_flush patch?

Yes. I'll include it myself in my NBD v2.

Paolo
Daniel P. Berrange - Sept. 15, 2011, 9:44 a.m.
On Thu, Sep 15, 2011 at 11:48:35AM +0300, Dor Laor wrote:
> On 09/15/2011 09:04 AM, Paolo Bonzini wrote:
> >On 09/15/2011 01:08 AM, ronnie sahlberg wrote:
> >>I think it is reasonable to just not support iscsi at all for
> >>blocksize that is not multiple of 512 bytes
> >>since a read-modify-write cycle across a network would probably be
> >>prohibitively expensive.
> >
> >Agreed.
> >
> >>.bdrv_flush() I can easily add a synchronous implementation of this
> >>unless your patch is expected to be merged
> >>in the near future.
> >
> >We need the same patch for NBD, so I wouldn't bother with the
> >synchronous flush.
> >
> >Paolo
> >
> >
> 
> It seems to me that using a qemu external initiator/target pairs
> like Orit's original design in http://wiki.qemu.org/Features/LiveBlockMigration#ISCSI_for_non_shared_storage
> would be a simpler (in terms of qemu code..) and flexible solution.
> 
> I agree that embedding the iscsi initiation in qemu can simplify the
> end user life but since this scenario is expected to be used by
> higher level software it's not relevant here. The risk is to have to
> maintain more code that won't be as general as the tgtd/lio
> solutions out there.

This should not be considered an either/or decision really. There are
compelling benefits for having a iSCSI initiator inside QEMU which Ronnie
outlined in the first mail in this thread. The enablement for non-root
users is, on its own, enough justification IMHO.

  * Privacy: The iSCSI LUNs are private to the guest and are not
    visible either to the host, nor to any processes running on the host.

  * Ease of managment : If you have very many guests and very many,
    thousands of, iSCSI LUNs. It is inconvenient to have to expose
    all LUNs to the underlying host.

  * No root requirement. Since the iSCSI LUNs are not mounted as
    devices on the host, ordinary users can set up and use iSCSI
    resources without the need for root privilege on the host to
    map the devices to local scsi devices.

There are several other benefits too

  * Since the network I/O for the iSCSI LUN is now done by the
    QEMU process instead of the kernel, each VM's iSCSI I/O
    traffic can be isolated & controlled via the cgroups network
    controller / tc network classifier.

  * Portable across OS. Each OS has different tools for setting
    up iSCSI usage. A native driver lets QEMU users setup iSCSI
    in the same way regardless of the level of OS support for
    iSCSI.

Finally, experience integrating with the Linux iscsiadm command line
tools in libvirt has shown that they can be quite a PITA to integrate
with if your mgmt app wants reliable/predictable results from their
usage regardless of what an admin may have previously setup on the
host.

So, IMHO, from a QEMU POV it makes perfect sense to have a builtin
iSCSI initiator, as an alternative to using the OS' iSCSI tools
externally. Vendors of applications managing KVM/QEMU are then free
to decide which of the approaches they wish to support in their own
products.

Daniel
ronniesahlberg@gmail.com - Sept. 15, 2011, 11:27 a.m.
On Thu, Sep 15, 2011 at 7:11 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
...
> Perhaps Ronnie has rough performance numbers comparing in-kernel iSCSI with
> libiscsi?

I did some tests some time ago just doing things like 'dd if=/dev/sda
of=/dev/null' on the root disk from within a RHEL guest itself
and performance was comparable.
In some tests libiscsi was a few percent faster, in other tests it was
a few percent slower.
But overall, very unscientifically performed test, performance was not
much different.

Now, I am not an expert on tuning cache or tuning the kernel scsi and
iscsi layer, so I would not really want to make too many statements in
the area other than in some tests I performed, performance was good
enough for my use.


I would be very happy if someone competent in tuning
qemu/scsi/open-iscsi (==that means, not me) would do a real
comparison.


regards
ronnie sahlberg
Dor Laor - Sept. 15, 2011, 11:42 a.m.
On 09/15/2011 12:11 PM, Paolo Bonzini wrote:
> On 09/15/2011 10:48 AM, Dor Laor wrote:
>>> We need the same patch for NBD, so I wouldn't bother with the
>>> synchronous flush.
>>
>> It seems to me that using a qemu external initiator/target pairs like
>> Orit's original design in
>> http://wiki.qemu.org/Features/LiveBlockMigration#ISCSI_for_non_shared_storage
>>
>> would be a simpler (in terms of qemu code..) and flexible solution.
>
> Yes, it would be simpler for QEMU because everything is done outside.
>
> However, iSCSI is a complex protocol and a complex suite of tools. With
> a simpler set of tools such as qemu-nbd/nbd, you can for example tunnel
> data over the libvirt connection. Possibly with encryption. Also, with
> iSCSI you're tied to raw, while qemu-nbd lets you use qcow2.

My main motivation for external iScsi is to provide qemu operations w/ 
non shared storage. Things like streaming an image w/o shared storage or 
block live migration can be done where the src host exposes iscsi target 
and the destination is the initiator. In case of qcow2, every snapshot 
in the chain should be exposed as a separate LUN. The idea is to ignore 
the data in the guest image and treat it as opaque.

>
> iSCSI could be a better choice if everything in the QEMU block layer was
> done in terms of SCSI. However, we're a far cry from that.
>
>> I agree that embedding the iscsi initiation in qemu can simplify the end
>> user life but since this scenario is expected to be used by higher level
>> software it's not relevant here. The risk is to have to maintain more
>> code that won't be as general as the tgtd/lio solutions out there.
>
> I'm not sure I understand. You say "embedding the iSCSI initiator in
> qemu can simplify the end user life" but "the risk is to have to
> maintain [another iSCSI target]". I don't think anybody proposed adding
> an iSCSI target to QEMU (in fact tcm_vhost would even let you use lio
> instead of QEMU's SCSI target code!). Only an iSCSI initiator which is
> not much code because it's just a wrapper for libiscsi.
>
> In general, I'm not sure that libiscsi (instead of the in-kernel iSCSI
> initiator) is by definition a bad choice in high-end set-ups. If you
> want to do SCSI passthrough, it's likely better to use libiscsi rather
> than using the in-kernel initiator together with scsi-generic
> (scsi-generic sucks; bsg is better but also a useless level of
> indirection IMO).
>
> Perhaps Ronnie has rough performance numbers comparing in-kernel iSCSI
> with libiscsi?
>
> Paolo
Christoph Hellwig - Sept. 15, 2011, 11:46 a.m.
On Thu, Sep 15, 2011 at 02:42:42PM +0300, Dor Laor wrote:
> My main motivation for external iScsi is to provide qemu operations w/ non 
> shared storage.

iSCSI with multiple initiator is shared storage.
Paolo Bonzini - Sept. 15, 2011, 11:58 a.m.
On 09/15/2011 01:42 PM, Dor Laor wrote:
>>
>> However, iSCSI is a complex protocol and a complex suite of tools. With
>> a simpler set of tools such as qemu-nbd/nbd, you can for example tunnel
>> data over the libvirt connection. Possibly with encryption. Also, with
>> iSCSI you're tied to raw, while qemu-nbd lets you use qcow2.
>
> My main motivation for external iScsi is to provide qemu operations w/
> non shared storage. Things like streaming an image w/o shared storage or
> block live migration can be done where the src host exposes iscsi target
> and the destination is the initiator. In case of qcow2, every snapshot
> in the chain should be exposed as a separate LUN. The idea is to ignore
> the data in the guest image and treat it as opaque.

Then you need an iSCSI *target* that understands qcow2, like qemu-nbd 
but for iSCSI... that's exactly the thing you were worried about 
implementing.

(Of course you could use qemu-nbd and expose /dev/nbd* via iSCSI, but it 
starts looking like an exercise in adding layers of indirections! :)).

Paolo
Dor Laor - Sept. 15, 2011, 12:01 p.m.
On 09/15/2011 02:46 PM, Christoph Hellwig wrote:
> On Thu, Sep 15, 2011 at 02:42:42PM +0300, Dor Laor wrote:
>> My main motivation for external iScsi is to provide qemu operations w/ non
>> shared storage.
>
> iSCSI with multiple initiator is shared storage.

Exactly :) that's the magic.
Paolo Bonzini - Sept. 15, 2011, 12:04 p.m.
On 09/15/2011 02:01 PM, Dor Laor wrote:
>>> My main motivation for external iScsi is to provide qemu operations
>>> w/ non shared storage.
>>
>> iSCSI with multiple initiator is shared storage.
>
> Exactly :) that's the magic.

I think we're on the same page, it's just a difference in terms.  iSCSI 
or NBD would provide the sharing of the storage instead of NFS, 
basically, so the underlying storage is not shared -- right?

Paolo
Orit Wasserman - Sept. 15, 2011, 12:34 p.m.
On Thu, 2011-09-15 at 13:58 +0200, Paolo Bonzini wrote:
> On 09/15/2011 01:42 PM, Dor Laor wrote:
> >>
> >> However, iSCSI is a complex protocol and a complex suite of tools. With
> >> a simpler set of tools such as qemu-nbd/nbd, you can for example tunnel
> >> data over the libvirt connection. Possibly with encryption. Also, with
> >> iSCSI you're tied to raw, while qemu-nbd lets you use qcow2.
> >
> > My main motivation for external iScsi is to provide qemu operations w/
> > non shared storage. Things like streaming an image w/o shared storage or
> > block live migration can be done where the src host exposes iscsi target
> > and the destination is the initiator. In case of qcow2, every snapshot
> > in the chain should be exposed as a separate LUN. The idea is to ignore
> > the data in the guest image and treat it as opaque.
> 
> Then you need an iSCSI *target* that understands qcow2, like qemu-nbd 
> but for iSCSI... that's exactly the thing you were worried about 
> implementing.
Not really , this is just part of the target configuration.
For each file in the chain we create a lun in the target that is backed
by the file and we number the luns in chronological order (base is lun
1) . you can look at a tgtd configuration here
http://wiki.qemu.org/Features/LiveBlockMigration#Example_single_base_master_image.
Orit

> 
> (Of course you could use qemu-nbd and expose /dev/nbd* via iSCSI, but it 
> starts looking like an exercise in adding layers of indirections! :)).
> 
> Paolo
>
Paolo Bonzini - Sept. 15, 2011, 12:58 p.m.
On 09/15/2011 02:34 PM, Orit Wasserman wrote:
>> >  Then you need an iSCSI*target*  that understands qcow2, like qemu-nbd
>> >  but for iSCSI... that's exactly the thing you were worried about
>> >  implementing.
>
> Not really , this is just part of the target configuration.
> For each file in the chain we create a lun in the target that is backed
> by the file and we number the luns in chronological order (base is lun
> 1) . you can look at a tgtd configuration here
> http://wiki.qemu.org/Features/LiveBlockMigration#Example_single_base_master_image.

That's surely a weird way to use iSCSI. :)

It's clever, but I would call that shared storage, since you need to 
share the details of the snapshot chain between the source and 
destination, down to the file names.  We need a more precise nomenclature.

Paolo
Orit Wasserman - Sept. 15, 2011, 4:59 p.m.
On Thu, 2011-09-15 at 14:58 +0200, Paolo Bonzini wrote:
> On 09/15/2011 02:34 PM, Orit Wasserman wrote:
> >> >  Then you need an iSCSI*target*  that understands qcow2, like qemu-nbd
> >> >  but for iSCSI... that's exactly the thing you were worried about
> >> >  implementing.
> >
> > Not really , this is just part of the target configuration.
> > For each file in the chain we create a lun in the target that is backed
> > by the file and we number the luns in chronological order (base is lun
> > 1) . you can look at a tgtd configuration here
> > http://wiki.qemu.org/Features/LiveBlockMigration#Example_single_base_master_image.
> 
> That's surely a weird way to use iSCSI. :)
:)
> It's clever, but I would call that shared storage, since you need to 
> share the details of the snapshot chain between the source and 
> destination, down to the file names.  We need a more precise nomenclature.
We store the parent base image inside the image (relative path). We can
reverse engineer it in the destination , start at the highest lun ,read
it's parent name , create a symbolic link to the parent (previous lun)
and so on ...

Orit
> Paolo
Christoph Hellwig - Sept. 16, 2011, 3:53 p.m.
On Wed, Sep 14, 2011 at 04:50:25PM +0100, Stefan Hajnoczi wrote:
> I think in this case it will not make the code nicer.  Since the
> external iSCSI library is based on callbacks it would be necessary to
> write the coroutines<->callbacks adapter functions.  So for example,
> the READ10 command would need a function that can be called in
> coroutine context and yields while libiscsi does the I/O.  When the
> callback is invoked it will re-enter the coroutine.
> 
> The area where coroutines are useful in the block layer is for image
> formats.  We already have common coroutines<->callback adapter
> functions in block.c so it's possible to write sequential code for
> image formats.  They only need access to block layer functions which
> have already been adapted.  But as soon as you interact with a
> callback-based API from the coroutine, then you need to write an
> adapter yourself.

So you plan on keeping the aio interface around forever?  Qemu with two
different I/O pathes was already more than painful enough, I don't
think keeping three, and two of them beeing fairly complex is a good
idea.
Stefan Hajnoczi - Sept. 17, 2011, 7:11 a.m.
On Fri, Sep 16, 2011 at 05:53:20PM +0200, Christoph Hellwig wrote:
> On Wed, Sep 14, 2011 at 04:50:25PM +0100, Stefan Hajnoczi wrote:
> > I think in this case it will not make the code nicer.  Since the
> > external iSCSI library is based on callbacks it would be necessary to
> > write the coroutines<->callbacks adapter functions.  So for example,
> > the READ10 command would need a function that can be called in
> > coroutine context and yields while libiscsi does the I/O.  When the
> > callback is invoked it will re-enter the coroutine.
> > 
> > The area where coroutines are useful in the block layer is for image
> > formats.  We already have common coroutines<->callback adapter
> > functions in block.c so it's possible to write sequential code for
> > image formats.  They only need access to block layer functions which
> > have already been adapted.  But as soon as you interact with a
> > callback-based API from the coroutine, then you need to write an
> > adapter yourself.
> 
> So you plan on keeping the aio interface around forever?  Qemu with two
> different I/O pathes was already more than painful enough, I don't
> think keeping three, and two of them beeing fairly complex is a good
> idea.

The synchronous interfaces can be converted to the coroutine
interfaces.

The block layer needs a public aio interface because device emulation is
asynchronous/callback-based.  That doesn't mean that BlockDriver needs
aio functions since block.c could transparently set up coroutines.  So
in theory BlockDriver could have only coroutine interfaces.

Doing the aio to coroutine conversion is pretty mechanical, that's why
I'm not afraid of doing it with this iSCSI code later.

Stefan
ronniesahlberg@gmail.com - Sept. 21, 2011, 9:48 a.m.
Stefan,
Thanks for your review.
I feel that iscsi would be beneficial for several usecases.

I have implemented all changes you pointed out, except two, and resent
an updated patch to the list.
Please see below.


regards
ronnie sahlberg


On Mon, Sep 12, 2011 at 7:14 PM, Stefan Hajnoczi
<stefanha@linux.vnet.ibm.com> wrote:
> On Sat, Sep 10, 2011 at 02:23:30PM +1000, Ronnie Sahlberg wrote:
>
> Looking good.  I think this is worth merging because it does offer
> benefits over host iSCSI.
>
>> +static void
>> +iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
>> +                    void *private_data)
>> +{
>> +}
>> +
>> +static void
>> +iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
>> +{
>> +    IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
>> +    IscsiLun *iscsilun = acb->iscsilun;
>> +
>> +    acb->status = -ECANCELED;
>> +    acb->common.cb(acb->common.opaque, acb->status);
>> +    acb->canceled = 1;
>> +
>> +    iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
>> +                                     iscsi_abort_task_cb, NULL);
>> +}
>
> The asynchronous abort task call looks odd.  If a caller allocates a
> buffer and issues a read request, then we need to make sure that the
> request is really aborted by the time .bdrv_aio_cancel() returns.
>
> If I understand the code correctly, iscsi_aio_cancel() returns
> immediately but the read request will still be in progress.  That means
> the caller could now free their data buffer and the read request will
> overwrite that unallocated memory.

You are right.
I have added a new function to libiscsi that will also release the
task structure from libiscsi as well.
So we should no longer have a window where we might have a cancelled
task in flight writing the data to an already released buffer once the
cancelled data buffers start arriving on the socket.

>
>> +static void
>> +iscsi_aio_write10_cb(struct iscsi_context *iscsi, int status,
>> +                     void *command_data, void *opaque)
>> +{
>> +    IscsiAIOCB *acb = opaque;
>> +
>> +    trace_iscsi_aio_write10_cb(iscsi, status, acb, acb->canceled);
>> +
>> +    if (acb->buf != NULL) {
>> +        free(acb->buf);
>> +    }
>
> Please just use g_free(acb->buf).  g_free(NULL) is defined as a nop so
> the check isn't necessary.  Also, this code uses free(3) when it should
> use g_free(3).
>

Done.

>> +
>> +    if (acb->canceled != 0) {
>> +        qemu_aio_release(acb);
>> +        scsi_free_scsi_task(acb->task);
>> +        acb->task = NULL;
>> +        return;
>> +    }
>> +
>> +    acb->status = 0;
>> +    if (status < 0) {
>> +        error_report("Failed to write10 data to iSCSI lun. %s",
>> +                     iscsi_get_error(iscsi));
>> +        acb->status = -EIO;
>> +    }
>> +
>> +    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
>> +    scsi_free_scsi_task(acb->task);
>> +    acb->task = NULL;
>> +}
>> +
>> +static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
>> +{
>> +    return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
>> +}
>> +
>> +static BlockDriverAIOCB *
>> +iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
>> +                 QEMUIOVector *qiov, int nb_sectors,
>> +                 BlockDriverCompletionFunc *cb,
>> +                 void *opaque)
>> +{
>> +    IscsiLun *iscsilun = bs->opaque;
>> +    struct iscsi_context *iscsi = iscsilun->iscsi;
>> +    IscsiAIOCB *acb;
>> +    size_t size;
>> +    int fua = 0;
>> +
>> +    /* set FUA on writes when cache mode is write through */
>> +    if (!(bs->open_flags & (BDRV_O_CACHE_WB|BDRV_O_NOCACHE))) {
>> +        fua = 1;
>> +    }
>
> FUA needs to reflect the guest semantics - does this disk have an
> enabled write cache?  When bs->open_flags has BDRV_O_CACHE_WB, then the
> guest knows it needs to send flushes because there is a write cache:
>
> if (!(bs->open_flags & BDRV_O_CACHE_WB)) {
>    fua = 1;
> }
>
> BDRV_O_NOCACHE is just for local files and sets the O_DIRECT hint.  It
> doesn't affect the cache semantics that the guest sees.
>

Done. When I first started building the patch a while ago, both flags
were needed. I missed when the second flag became obsolete upstream.


>> +/*
>> + * We support iscsi url's on the form
>> + * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
>> + */
>> +static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
>> +{
>> +    IscsiLun *iscsilun = bs->opaque;
>> +    struct iscsi_context *iscsi = NULL;
>> +    struct iscsi_url *iscsi_url = NULL;
>> +    struct IscsiTask task;
>> +    int ret;
>> +
>> +    if ((BDRV_SECTOR_SIZE % 512) != 0) {
>> +        error_report("iSCSI: Invalid BDRV_SECTOR_SIZE. "
>> +                     "BDRV_SECTOR_SIZE(%lld) is not a multiple "
>> +                     "of 512", BDRV_SECTOR_SIZE);
>> +        return -EINVAL;
>> +    }
>
> Another way of saying this is:
> QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE % 512 != 0);
>
> The advantage is that the build fails instead of waiting until iscsi is
> used at runtime until the failure is detected.
>
> What will happen if BDRV_SECTOR_SIZE is not a multiple of 512?
>

I did not do this change.
If blocksize is not multiple of 512,  the iscsi backend will just
refuse to work since having a read-modify-write cycle across the
network would be extremely costly.
I dont think iscsi should cause the entire build to fail.

While it is difficult to imagine a different blocksize, it is not
impossible I guess.
If someone comes up with a non-512-multiple blocksize and a good
reason for one, I dont want to be the guy that broke the build.


>> +
>> +    memset(iscsilun, 0, sizeof(IscsiLun));
>> +
>> +    /* Should really append the KVM name after the ':' here */
>> +    iscsi = iscsi_create_context("iqn.2008-11.org.linux-kvm:");
>> +    if (iscsi == NULL) {
>> +        error_report("iSCSI: Failed to create iSCSI context.");
>> +        ret = -ENOMEM;
>> +        goto failed;
>> +    }
>> +
>> +    iscsi_url = iscsi_parse_full_url(iscsi, filename);
>> +    if (iscsi_url == NULL) {
>> +        error_report("Failed to parse URL : %s %s", filename,
>> +                     iscsi_get_error(iscsi));
>> +        ret = -ENOMEM;
>
> -EINVAL?

Done.

>
>> +static BlockDriver bdrv_iscsi = {
>> +    .format_name     = "iscsi",
>> +    .protocol_name   = "iscsi",
>> +
>> +    .instance_size   = sizeof(IscsiLun),
>> +    .bdrv_file_open  = iscsi_open,
>> +    .bdrv_close      = iscsi_close,
>> +
>> +    .bdrv_getlength  = iscsi_getlength,
>> +
>> +    .bdrv_aio_readv  = iscsi_aio_readv,
>> +    .bdrv_aio_writev = iscsi_aio_writev,
>> +    .bdrv_aio_flush  = iscsi_aio_flush,
>
> block.c does not emulate .bdrv_flush() using your .bdrv_aio_flush()
> implementation.  I have sent a patch to add this emulation.  This will
> turn bdrv_flush(iscsi_bs) into an actual operation instead of a nop :).

I did not implement this since I understood from the discussion that a
patch is imminent and it is required for existing backend(s?) anyway.


>
>> diff --git a/trace-events b/trace-events
>> index 3fdd60f..4e461e5 100644
>> --- a/trace-events
>> +++ b/trace-events
>> @@ -494,3 +494,10 @@ escc_sunkbd_event_in(int ch) "Untranslated keycode %2.2x"
>>  escc_sunkbd_event_out(int ch) "Translated keycode %2.2x"
>>  escc_kbd_command(int val) "Command %d"
>>  escc_sunmouse_event(int dx, int dy, int buttons_state) "dx=%d dy=%d buttons=%01x"
>> +
>> +# block/iscsi.c
>> +disable iscsi_aio_write10_cb(void *iscsi, int status, void *acb, int canceled) "iscsi %p status %d acb %p canceled %d"
>> +disable iscsi_aio_writev(void *iscsi, int64_t sector_num, int nb_sectors, void *opaque, void *acb) "iscsi %p sector_num %"PRId64" nb_sectors %d opaque %p acb %p"
>> +disable iscsi_aio_read10_cb(void *iscsi, int status, void *acb, int canceled) "iscsi %p status %d acb %p canceled %d"
>> +disable iscsi_aio_readv(void *iscsi, int64_t sector_num, int nb_sectors, void *opaque, void *acb) "iscsi %p sector_num %"PRId64" nb_sectors %d opaque %p acb %p"
>
> It is no longer necessary to prefix trace event definitions with
> "disable".

Done.


>
> The stderr and simple backends now start up with all events disabled by
> default.  The set of events can be set using the -trace
> events=<events-file> option or on the QEMU monitor using set trace-event
> <name> on.
>
> Stefan
>
Mark Wu - Sept. 23, 2011, 9:15 a.m.
I tested this patch with the following command:
x86_64-softmmu/qemu-system-x86_64 --enable-kvm rhel54_1.img -m 1024 -net 
tap,ifname=tap0,script=no -net nic,model=virtio -sdl -drive 
file=iscsi://127.0.0.1/iqn.2011-09.com.example:server.target1/
And I found that the whole qemu process would get freezed, not reachable 
via ping and no response on desktop if there's I/O targeted to the iscsi 
drive and the iscsi target was forcefully stopped. After checking the 
backtrace with gdb, I found the I/O thread got stuck on the mutex 
qemu_global_mutex , which was hold by the vcpu thread. It should be 
released before re-entering guest. But the vcpu thread was waiting for 
the completion of iscsi aio request endlessly, and therefore couldn't 
get chance to release the mutex. So the whole qemu process became 
unresponsive. But this problem doesn't exist with the combination of 
virtio and iscsi. Only the I/O process got hung on guest in this case. 
It's more acceptable.  I am not sure how to fix this problem.


gdb backtrace:

(gdb) info threads
   2 Thread 0x7fa0fdd4c700 (LWP 5086)  0x0000003a868de383 in select () 
from /lib64/libc.so.6
* 1 Thread 0x7fa0fdd4d740 (LWP 5085)  0x0000003a8700dfe4 in 
__lll_lock_wait () from /lib64/libpthread.so.0
(gdb) bt
#0  0x0000003a8700dfe4 in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x0000003a87009318 in _L_lock_854 () from /lib64/libpthread.so.0
#2  0x0000003a870091e7 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x00000000004c9819 in qemu_mutex_lock (mutex=<value optimized out>) 
at qemu-thread-posix.c:54
#4  0x00000000004a46c6 in main_loop_wait (nonblocking=<value optimized 
out>) at /home/mark/Work/source/qemu/vl.c:1545
#5  0x00000000004a60d6 in main_loop (argc=<value optimized out>, 
argv=<value optimized out>, envp=<value optimized out>) at 
/home/mark/Work/source/qemu/vl.c:1579
#6  main (argc=<value optimized out>, argv=<value optimized out>, 
envp=<value optimized out>) at /home/mark/Work/source/qemu/vl.c:3574
(gdb) t 2
[Switching to thread 2 (Thread 0x7fa0fdd4c700 (LWP 5086))]#0  
0x0000003a868de383 in select () from /lib64/libc.so.6
(gdb) bt
#0  0x0000003a868de383 in select () from /lib64/libc.so.6
#1  0x00000000004096aa in qemu_aio_wait () at aio.c:193
#2  0x0000000000409815 in qemu_aio_flush () at aio.c:113
#3  0x00000000004761ea in bmdma_cmd_writeb (bm=0x1db2230, val=8) at 
/home/mark/Work/source/qemu/hw/ide/pci.c:311
#4  0x0000000000555900 in access_with_adjusted_size (addr=0, 
value=0x7fa0fdd4bdb8, size=1, access_size_min=<value optimized out>, 
access_size_max=<value optimized out>, access=
     0x555820 <memory_region_write_accessor>, opaque=0x1db2370) at 
/home/mark/Work/source/qemu/memory.c:284
#5  0x0000000000555ae1 in memory_region_iorange_write (iorange=<value 
optimized out>, offset=<value optimized out>, width=<value optimized 
out>, data=8) at /home/mark/Work/source/qemu/memory.c:425
#6  0x000000000054eda1 in kvm_handle_io (env=0x192e080) at 
/home/mark/Work/source/qemu/kvm-all.c:834
#7  kvm_cpu_exec (env=0x192e080) at 
/home/mark/Work/source/qemu/kvm-all.c:976
#8  0x000000000052cc1a in qemu_kvm_cpu_thread_fn (arg=0x192e080) at 
/home/mark/Work/source/qemu/cpus.c:656
#9  0x0000003a870077e1 in start_thread () from /lib64/libpthread.so.0
#10 0x0000003a868e577d in clone () from /lib64/libc.so.6
Paolo Bonzini - Sept. 23, 2011, 10:16 a.m.
On 09/23/2011 11:15 AM, Mark Wu wrote:
> I tested this patch with the following command:
> x86_64-softmmu/qemu-system-x86_64 --enable-kvm rhel54_1.img -m 1024 -net
> tap,ifname=tap0,script=no -net nic,model=virtio -sdl -drive
> file=iscsi://127.0.0.1/iqn.2011-09.com.example:server.target1/
> And I found that the whole qemu process would get freezed, not reachable
> via ping and no response on desktop if there's I/O targeted to the iscsi
> drive and the iscsi target was forcefully stopped. After checking the
> backtrace with gdb, I found the I/O thread got stuck on the mutex
> qemu_global_mutex , which was hold by the vcpu thread. It should be
> released before re-entering guest. But the vcpu thread was waiting for
> the completion of iscsi aio request endlessly, and therefore couldn't
> get chance to release the mutex. So the whole qemu process became
> unresponsive. But this problem doesn't exist with the combination of
> virtio and iscsi. Only the I/O process got hung on guest in this case.
> It's more acceptable.  I am not sure how to fix this problem.

You don't. :(  It's a problem in IDE emulation and anything else that 
uses qemu_aio_flush; luckily it's only called in a few places.

It would be the same with NFS instead of iSCSI, for example.

Otherwise, you could implement some kind of timeout+reconnect logic in 
the iSCSI driver or in libiscsi.

Paolo

Patch

diff --git a/Makefile.objs b/Makefile.objs
index a529a11..8c8e420 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -36,6 +36,7 @@  block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
+block-nested-$(CONFIG_LIBISCSI) += iscsi.o
 block-nested-$(CONFIG_CURL) += curl.o
 block-nested-$(CONFIG_RBD) += rbd.o
 
diff --git a/block/iscsi.c b/block/iscsi.c
new file mode 100644
index 0000000..f1c1cc3
--- /dev/null
+++ b/block/iscsi.c
@@ -0,0 +1,594 @@ 
+/*
+ * QEMU Block driver for iSCSI images
+ *
+ * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "config-host.h"
+
+#include <poll.h>
+#include "sysemu.h"
+#include "qemu-common.h"
+#include "qemu-error.h"
+#include "block_int.h"
+#include "trace.h"
+
+#include <iscsi/iscsi.h>
+#include <iscsi/scsi-lowlevel.h>
+
+
+/* Per-BlockDriverState state for one attached iSCSI logical unit. */
+typedef struct IscsiLun {
+    struct iscsi_context *iscsi;   /* libiscsi connection handle */
+    int lun;                       /* LUN number on the target */
+    int block_size;                /* LUN block size in bytes */
+    unsigned long num_blocks;      /* total number of blocks on the LUN */
+} IscsiLun;
+
+/* One in-flight asynchronous request against an iSCSI LUN. */
+typedef struct IscsiAIOCB {
+    BlockDriverAIOCB common;   /* must be first: embedded base AIOCB */
+    QEMUIOVector *qiov;        /* guest scatter/gather list for the I/O */
+    QEMUBH *bh;                /* bottom half used to deliver completion */
+    IscsiLun *iscsilun;        /* LUN this request was issued against */
+    struct scsi_task *task;    /* libiscsi task while the request is in flight */
+    uint8_t *buf;              /* linear bounce buffer for writes (g_malloc'd) */
+    int canceled;              /* set by iscsi_aio_cancel() */
+    int status;                /* 0 or -errno, reported to common.cb */
+    size_t read_size;          /* NOTE(review): not used in the visible code — confirm against full patch */
+    size_t read_offset;        /* NOTE(review): not used in the visible code — confirm against full patch */
+} IscsiAIOCB;
+
+/*
+ * State shared with a completion callback while waiting synchronously
+ * for a task to finish (used on the stack during device open).
+ */
+struct IscsiTask {
+    IscsiLun *iscsilun;
+    int status;     /* result of the task */
+    int complete;   /* nonzero once the callback has fired */
+};
+
+/*
+ * Completion callback for the asynchronous ABORT TASK task-management
+ * request issued by iscsi_aio_cancel().  Nothing to do here.
+ */
+static void
+iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
+                    void *private_data)
+{
+}
+
+/*
+ * Cancel an in-flight request: report -ECANCELED to the caller, mark the
+ * ACB so the eventual libiscsi callback only cleans up, and ask the
+ * target to abort the task.
+ *
+ * NOTE(review): the abort is asynchronous, so the task may still be in
+ * flight when this returns; for a read, data arriving later could be
+ * written into a buffer the caller has already freed.  Needs a libiscsi
+ * call that also detaches/releases the task before returning.
+ */
+static void
+iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
+    IscsiLun *iscsilun = acb->iscsilun;
+
+    acb->status = -ECANCELED;
+    acb->common.cb(acb->common.opaque, acb->status);
+    acb->canceled = 1;
+
+    iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
+                                     iscsi_abort_task_cb, NULL);
+}
+
+/* AIO pool describing how to allocate and cancel IscsiAIOCBs. */
+static AIOPool iscsi_aio_pool = {
+    .aiocb_size         = sizeof(IscsiAIOCB),
+    .cancel             = iscsi_aio_cancel,
+};
+
+
+static void iscsi_process_read(void *arg);
+static void iscsi_process_write(void *arg);
+
+/*
+ * io_flush handler for qemu_aio_set_fd_handler(): returns nonzero while
+ * libiscsi still has requests queued on this connection, so
+ * qemu_aio_flush() keeps waiting.
+ */
+static int iscsi_process_flush(void *arg)
+{
+    IscsiLun *iscsilun = arg;
+
+    return iscsi_queue_length(iscsilun->iscsi) > 0;
+}
+
+/*
+ * (Re)register the iSCSI socket with the AIO layer.  Reads are always
+ * watched; the write handler is installed only while libiscsi reports
+ * it has data pending to send (POLLOUT).
+ */
+static void
+iscsi_set_events(IscsiLun *iscsilun)
+{
+    struct iscsi_context *iscsi = iscsilun->iscsi;
+
+    qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), iscsi_process_read,
+                           (iscsi_which_events(iscsi) & POLLOUT)
+                           ? iscsi_process_write : NULL,
+                           iscsi_process_flush, NULL, iscsilun);
+}
+
+/*
+ * Socket-readable handler: let libiscsi service incoming data, then
+ * re-evaluate which events we need to poll for.
+ */
+static void
+iscsi_process_read(void *arg)
+{
+    IscsiLun *iscsilun = arg;
+    struct iscsi_context *iscsi = iscsilun->iscsi;
+
+    iscsi_service(iscsi, POLLIN);
+    iscsi_set_events(iscsilun);
+}
+
+/*
+ * Socket-writable handler: let libiscsi flush queued outgoing data,
+ * then re-evaluate which events we need to poll for.
+ */
+static void
+iscsi_process_write(void *arg)
+{
+    IscsiLun *iscsilun = arg;
+    struct iscsi_context *iscsi = iscsilun->iscsi;
+
+    iscsi_service(iscsi, POLLOUT);
+    iscsi_set_events(iscsilun);
+}
+
+
+/*
+ * Schedule 'cb' to run as a bottom half with 'acb' as its argument.
+ * Returns 0 on success, -EIO if the bottom half could not be created.
+ * NOTE(review): callers in the visible code ignore the return value.
+ */
+static int
+iscsi_schedule_bh(QEMUBHFunc *cb, IscsiAIOCB *acb)
+{
+    acb->bh = qemu_bh_new(cb, acb);
+    if (!acb->bh) {
+        error_report("oom: could not create iscsi bh");
+        return -EIO;
+    }
+
+    qemu_bh_schedule(acb->bh);
+    return 0;
+}
+
+/*
+ * Bottom half that completes a read or write request: invoke the
+ * caller's completion callback (unless the request was cancelled, in
+ * which case -ECANCELED was already delivered by iscsi_aio_cancel),
+ * then release the ACB.
+ */
+static void
+iscsi_readv_writev_bh_cb(void *p)
+{
+    IscsiAIOCB *acb = p;
+
+    qemu_bh_delete(acb->bh);
+
+    if (acb->status != -ECANCELED) {
+        acb->common.cb(acb->common.opaque, acb->status);
+    }
+
+    qemu_aio_release(acb);
+}
+
+
+/*
+ * Completion callback for a WRITE10 issued by iscsi_aio_writev().
+ *
+ * Frees the linear bounce buffer, then either just drops the ACB (if
+ * the request was cancelled) or schedules a bottom half to deliver the
+ * completion status (0 on success, -EIO on failure) to the guest.
+ */
+static void
+iscsi_aio_write10_cb(struct iscsi_context *iscsi, int status,
+                     void *command_data, void *opaque)
+{
+    IscsiAIOCB *acb = opaque;
+
+    trace_iscsi_aio_write10_cb(iscsi, status, acb, acb->canceled);
+
+    /* The buffer was allocated with g_malloc(), so it must be released
+     * with g_free(); g_free(NULL) is a no-op, so no guard is needed. */
+    g_free(acb->buf);
+    acb->buf = NULL;
+
+    if (acb->canceled != 0) {
+        qemu_aio_release(acb);
+        scsi_free_scsi_task(acb->task);
+        acb->task = NULL;
+        return;
+    }
+
+    acb->status = 0;
+    /* Match the read path: any nonzero status (including positive SCSI
+     * error statuses) is a failure. */
+    if (status != 0) {
+        error_report("Failed to write10 data to iSCSI lun. %s",
+                     iscsi_get_error(iscsi));
+        acb->status = -EIO;
+    }
+
+    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    scsi_free_scsi_task(acb->task);
+    acb->task = NULL;
+}
+
+/*
+ * Convert a QEMU 512-byte sector number into a block number on the LUN.
+ * NOTE(review): integer division truncates if sector_num is not aligned
+ * to the LUN block size when block_size > BDRV_SECTOR_SIZE.
+ */
+static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
+{
+    return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
+}
+
+/*
+ * Start an asynchronous write of nb_sectors QEMU sectors at sector_num.
+ *
+ * The guest iovec is linearized into a bounce buffer (see XXX below)
+ * and submitted as a single WRITE10.  Completion is reported through
+ * iscsi_aio_write10_cb().  Returns NULL if the command could not be
+ * queued.
+ */
+static BlockDriverAIOCB *
+iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
+                 QEMUIOVector *qiov, int nb_sectors,
+                 BlockDriverCompletionFunc *cb,
+                 void *opaque)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    struct iscsi_context *iscsi = iscsilun->iscsi;
+    IscsiAIOCB *acb;
+    size_t size;
+    int fua = 0;
+
+    /* Set FUA on writes when the cache mode is write-through, i.e. the
+     * guest has not been told there is a write cache it must flush.
+     * BDRV_O_NOCACHE is only an O_DIRECT hint for local files and must
+     * not influence the cache semantics the guest sees. */
+    if (!(bs->open_flags & BDRV_O_CACHE_WB)) {
+        fua = 1;
+    }
+
+    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    trace_iscsi_aio_writev(iscsi, sector_num, nb_sectors, opaque, acb);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->iscsilun = iscsilun;
+    acb->qiov     = qiov;
+
+    acb->canceled   = 0;
+
+    /* XXX we should pass the iovec to write10 to avoid the extra copy */
+    /* this will allow us to get rid of 'buf' completely */
+    size = nb_sectors * BDRV_SECTOR_SIZE;
+    acb->buf = g_malloc(size);
+    qemu_iovec_to_buffer(acb->qiov, acb->buf);
+    acb->task = iscsi_write10_task(iscsi, iscsilun->lun, acb->buf, size,
+                              sector_qemu2lun(sector_num, iscsilun),
+                              fua, 0, iscsilun->block_size,
+                              iscsi_aio_write10_cb, acb);
+    if (acb->task == NULL) {
+        error_report("iSCSI: Failed to send write10 command. %s",
+                     iscsi_get_error(iscsi));
+        g_free(acb->buf);
+        qemu_aio_release(acb);
+        return NULL;
+    }
+
+    iscsi_set_events(iscsilun);
+
+    return &acb->common;
+}
+
+/*
+ * Completion callback for the READ 10 issued by iscsi_aio_readv().
+ * The data has already been scattered into the guest iovec by libiscsi
+ * (registered via scsi_task_add_data_in_buffer), so only the status needs
+ * to be propagated to the block layer via a bottom half.
+ */
+static void
+iscsi_aio_read10_cb(struct iscsi_context *iscsi, int status,
+                    void *command_data, void *opaque)
+{
+    IscsiAIOCB *acb = opaque;
+
+    trace_iscsi_aio_read10_cb(iscsi, status, acb, acb->canceled);
+
+    if (acb->canceled != 0) {
+        /* Free the task before releasing the AIOCB; the original order
+         * dereferenced acb->task after qemu_aio_release(acb). */
+        scsi_free_scsi_task(acb->task);
+        acb->task = NULL;
+        qemu_aio_release(acb);
+        return;
+    }
+
+    acb->status = 0;
+    if (status != 0) {
+        error_report("Failed to read10 data from iSCSI lun. %s",
+                     iscsi_get_error(iscsi));
+        acb->status = -EIO;
+    }
+
+    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    scsi_free_scsi_task(acb->task);
+    acb->task = NULL;
+}
+
+/*
+ * Start an asynchronous READ 10 from the iSCSI LUN.
+ * The guest iovec is registered with libiscsi as data-in buffers so the
+ * payload lands directly in guest memory without a bounce buffer.
+ * Returns the AIOCB on success, or NULL if the command could not be queued.
+ */
+static BlockDriverAIOCB *
+iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
+                QEMUIOVector *qiov, int nb_sectors,
+                BlockDriverCompletionFunc *cb,
+                void *opaque)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    struct iscsi_context *iscsi = iscsilun->iscsi;
+    IscsiAIOCB *acb;
+    size_t qemu_read_size, lun_read_size;
+    int i;
+
+    qemu_read_size = BDRV_SECTOR_SIZE * (size_t)nb_sectors;
+
+    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    trace_iscsi_aio_readv(iscsi, sector_num, nb_sectors, opaque, acb);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->iscsilun = iscsilun;
+    acb->qiov     = qiov;
+
+    acb->canceled    = 0;
+    acb->read_size   = qemu_read_size;
+    acb->buf         = NULL;
+
+    /* If LUN blocksize is bigger than BDRV_SECTOR_SIZE a read from QEMU
+     * may be misaligned to the LUN, so we may need to read some extra
+     * data.
+     */
+    acb->read_offset = 0;
+    if (iscsilun->block_size > BDRV_SECTOR_SIZE) {
+        uint64_t bdrv_offset = BDRV_SECTOR_SIZE * sector_num;
+
+        acb->read_offset  = bdrv_offset % iscsilun->block_size;
+    }
+
+    /* Round (qemu_read_size + read_offset) up to a whole number of LUN
+     * blocks.
+     * NOTE(review): lun_read_size can exceed the total length of the
+     * data-in buffers registered below — confirm libiscsi tolerates the
+     * overrun for the trailing partial block. */
+    lun_read_size  = (qemu_read_size + iscsilun->block_size
+                     + acb->read_offset - 1)
+                     / iscsilun->block_size * iscsilun->block_size;
+    acb->task = iscsi_read10_task(iscsi, iscsilun->lun,
+                             sector_qemu2lun(sector_num, iscsilun),
+                             lun_read_size, iscsilun->block_size,
+                             iscsi_aio_read10_cb, acb);
+    if (acb->task == NULL) {
+        error_report("iSCSI: Failed to send read10 command. %s",
+                     iscsi_get_error(iscsi));
+        qemu_aio_release(acb);
+        return NULL;
+    }
+
+    /* Hand every element of the guest iovec to libiscsi as a data-in
+     * buffer so the read scatters straight into guest memory. */
+    for (i = 0; i < acb->qiov->niov; i++) {
+        scsi_task_add_data_in_buffer(acb->task,
+                acb->qiov->iov[i].iov_len,
+                acb->qiov->iov[i].iov_base);
+    }
+
+    iscsi_set_events(iscsilun);
+
+    return &acb->common;
+}
+
+
+/*
+ * Completion callback for the SYNCHRONIZE CACHE 10 issued by
+ * iscsi_aio_flush().  Propagates the status to the block layer via a
+ * bottom half and releases the SCSI task.
+ */
+static void
+iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
+                     void *command_data, void *opaque)
+{
+    IscsiAIOCB *acb = opaque;
+
+    if (acb->canceled != 0) {
+        /* Free the task before releasing the AIOCB; the original order
+         * dereferenced acb->task after qemu_aio_release(acb). */
+        scsi_free_scsi_task(acb->task);
+        acb->task = NULL;
+        qemu_aio_release(acb);
+        return;
+    }
+
+    acb->status = 0;
+    if (status != 0) {
+        /* Treat any non-GOOD status as failure, not only negative
+         * transport errors, matching iscsi_aio_read10_cb(). */
+        error_report("Failed to sync10 data on iSCSI lun. %s",
+                     iscsi_get_error(iscsi));
+        acb->status = -EIO;
+    }
+
+    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    scsi_free_scsi_task(acb->task);
+    acb->task = NULL;
+}
+
+/*
+ * Start an asynchronous SYNCHRONIZE CACHE 10 (flush) on the LUN.
+ * The four zero arguments are presumably lba/num_blocks/syncnv/immed,
+ * i.e. "flush everything" — TODO confirm against the libiscsi prototype.
+ * Returns the AIOCB on success, or NULL if the command could not be queued.
+ */
+static BlockDriverAIOCB *
+iscsi_aio_flush(BlockDriverState *bs,
+                BlockDriverCompletionFunc *cb, void *opaque)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    struct iscsi_context *iscsi = iscsilun->iscsi;
+    IscsiAIOCB *acb;
+
+    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->iscsilun = iscsilun;
+    acb->canceled   = 0;
+
+    acb->task = iscsi_synchronizecache10_task(iscsi, iscsilun->lun,
+                                         0, 0, 0, 0,
+                                         iscsi_synccache10_cb,
+                                         acb);
+    if (acb->task == NULL) {
+        error_report("iSCSI: Failed to send synchronizecache10 command. %s",
+                     iscsi_get_error(iscsi));
+        qemu_aio_release(acb);
+        return NULL;
+    }
+
+    iscsi_set_events(iscsilun);
+
+    return &acb->common;
+}
+
+/*
+ * Report the size of the LUN in bytes, computed from the geometry that
+ * iscsi_readcapacity10_cb() cached in the IscsiLun at open time.
+ */
+static int64_t
+iscsi_getlength(BlockDriverState *bs)
+{
+    IscsiLun *iscsilun = bs->opaque;
+
+    return (int64_t)iscsilun->num_blocks * iscsilun->block_size;
+}
+
+/*
+ * Completion callback for the READ CAPACITY (10) sent from
+ * iscsi_connect_cb().  Caches the LUN geometry in the IscsiLun and marks
+ * the IscsiTask complete so iscsi_open() stops waiting.
+ */
+static void
+iscsi_readcapacity10_cb(struct iscsi_context *iscsi, int status,
+                        void *command_data, void *opaque)
+{
+    struct IscsiTask *itask = opaque;
+    struct scsi_readcapacity10 *rc10;
+    struct scsi_task *task = command_data;
+
+    if (status != 0) {
+        error_report("iSCSI: Failed to read capacity of iSCSI lun. %s",
+                     iscsi_get_error(iscsi));
+        goto fail;
+    }
+
+    rc10 = scsi_datain_unmarshall(task);
+    if (rc10 == NULL) {
+        error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
+        goto fail;
+    }
+
+    itask->iscsilun->block_size = rc10->block_size;
+    /* READ CAPACITY (10) returns the LBA of the *last* logical block
+     * (SBC), so the number of blocks is one more than the returned LBA. */
+    itask->iscsilun->num_blocks = rc10->lba + 1;
+
+    itask->status   = 0;
+    itask->complete = 1;
+    scsi_free_scsi_task(task);
+    return;
+
+fail:
+    itask->status   = 1;
+    itask->complete = 1;
+    scsi_free_scsi_task(task);
+}
+
+
+/*
+ * Called by libiscsi when the asynchronous login started in iscsi_open()
+ * completes.  On success it chains a READ CAPACITY (10) to probe the LUN
+ * geometry; the IscsiTask is then marked complete by
+ * iscsi_readcapacity10_cb() (or here, immediately, on failure).
+ */
+static void
+iscsi_connect_cb(struct iscsi_context *iscsi, int status, void *command_data,
+                 void *opaque)
+{
+    struct IscsiTask *itask = opaque;
+    struct scsi_task *task;
+
+    if (status != 0) {
+        itask->status   = 1;
+        itask->complete = 1;
+        return;
+    }
+
+    task = iscsi_readcapacity10_task(iscsi, itask->iscsilun->lun, 0, 0,
+                                   iscsi_readcapacity10_cb, opaque);
+    if (task == NULL) {
+        error_report("iSCSI: failed to send readcapacity command.");
+        itask->status   = 1;
+        itask->complete = 1;
+        return;
+    }
+}
+
+/*
+ * We support iscsi urls of the form
+ * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun>
+ *
+ * Opens a connection to the LUN: parses the URL, logs in asynchronously,
+ * then spins on qemu_aio_wait() until login and READ CAPACITY complete.
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    struct iscsi_context *iscsi = NULL;
+    struct iscsi_url *iscsi_url = NULL;
+    struct IscsiTask task;
+    int ret;
+
+    if ((BDRV_SECTOR_SIZE % 512) != 0) {
+        /* BDRV_SECTOR_SIZE is an unsigned long long constant, so the
+         * conversion must be %llu; %lld was a format/argument mismatch. */
+        error_report("iSCSI: Invalid BDRV_SECTOR_SIZE. "
+                     "BDRV_SECTOR_SIZE(%llu) is not a multiple "
+                     "of 512", BDRV_SECTOR_SIZE);
+        return -EINVAL;
+    }
+
+    memset(iscsilun, 0, sizeof(IscsiLun));
+
+    /* Should really append the KVM name after the ':' here */
+    iscsi = iscsi_create_context("iqn.2008-11.org.linux-kvm:");
+    if (iscsi == NULL) {
+        error_report("iSCSI: Failed to create iSCSI context.");
+        ret = -ENOMEM;
+        goto failed;
+    }
+
+    iscsi_url = iscsi_parse_full_url(iscsi, filename);
+    if (iscsi_url == NULL) {
+        error_report("Failed to parse URL : %s %s", filename,
+                     iscsi_get_error(iscsi));
+        ret = -ENOMEM;
+        goto failed;
+    }
+
+    if (iscsi_set_targetname(iscsi, iscsi_url->target)) {
+        error_report("iSCSI: Failed to set target name.");
+        ret = -ENOMEM;
+        goto failed;
+    }
+
+    if (iscsi_url->user != NULL) {
+        ret = iscsi_set_initiator_username_pwd(iscsi, iscsi_url->user,
+                                              iscsi_url->passwd);
+        if (ret != 0) {
+            error_report("Failed to set initiator username and password");
+            ret = -ENOMEM;
+            goto failed;
+        }
+    }
+    if (iscsi_set_session_type(iscsi, ISCSI_SESSION_NORMAL) != 0) {
+        error_report("iSCSI: Failed to set session type to normal.");
+        ret = -ENOMEM;
+        goto failed;
+    }
+
+    iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C);
+
+    task.iscsilun = iscsilun;
+    task.status = 0;
+    task.complete = 0;
+
+    iscsilun->iscsi = iscsi;
+    iscsilun->lun   = iscsi_url->lun;
+
+    if (iscsi_full_connect_async(iscsi, iscsi_url->portal, iscsi_url->lun,
+                                 iscsi_connect_cb, &task)
+        != 0) {
+        error_report("iSCSI: Failed to start async connect.");
+        ret = -ENOMEM;
+        goto failed;
+    }
+
+    /* Drive the aio loop until iscsi_connect_cb()/readcapacity10_cb()
+     * have marked the task complete. */
+    while (!task.complete) {
+        iscsi_set_events(iscsilun);
+        qemu_aio_wait();
+    }
+    if (task.status != 0) {
+        error_report("iSCSI: Failed to connect to LUN : %s",
+                     iscsi_get_error(iscsi));
+        ret = -EINVAL;
+        goto failed;
+    }
+
+    /* The parsed URL is only needed during setup; it was previously
+     * leaked on the success path. */
+    iscsi_destroy_url(iscsi_url);
+
+    return 0;
+
+failed:
+    if (iscsi_url != NULL) {
+        iscsi_destroy_url(iscsi_url);
+    }
+    if (iscsi != NULL) {
+        iscsi_destroy_context(iscsi);
+    }
+    memset(iscsilun, 0, sizeof(IscsiLun));
+    return ret;
+}
+
+/* Tear down the connection: deregister the libiscsi socket fd from the
+ * aio event loop, destroy the iSCSI context, and clear the per-LUN state
+ * stored in bs->opaque. */
+static void iscsi_close(BlockDriverState *bs)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    struct iscsi_context *iscsi = iscsilun->iscsi;
+
+    qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL, NULL, NULL);
+    iscsi_destroy_context(iscsi);
+    memset(iscsilun, 0, sizeof(IscsiLun));
+}
+
+/* Block driver for the "iscsi://" protocol.  Per-open state is an
+ * IscsiLun; all I/O is asynchronous (readv/writev/flush). */
+static BlockDriver bdrv_iscsi = {
+    .format_name     = "iscsi",
+    .protocol_name   = "iscsi",
+
+    .instance_size   = sizeof(IscsiLun),
+    .bdrv_file_open  = iscsi_open,
+    .bdrv_close      = iscsi_close,
+
+    .bdrv_getlength  = iscsi_getlength,
+
+    .bdrv_aio_readv  = iscsi_aio_readv,
+    .bdrv_aio_writev = iscsi_aio_writev,
+    .bdrv_aio_flush  = iscsi_aio_flush,
+};
+
+/* Register the iSCSI protocol driver with the block layer at startup. */
+static void iscsi_block_init(void)
+{
+    bdrv_register(&bdrv_iscsi);
+}
+
+block_init(iscsi_block_init);
+
+
diff --git a/configure b/configure
index 348f36a..a2c30e1 100755
--- a/configure
+++ b/configure
@@ -182,6 +182,7 @@  usb_redir=""
 opengl=""
 zlib="yes"
 guest_agent="yes"
+libiscsi=""
 
 # parse CC options first
 for opt do
@@ -651,6 +652,10 @@  for opt do
   ;;
   --enable-spice) spice="yes"
   ;;
+  --disable-libiscsi) libiscsi="no"
+  ;;
+  --enable-libiscsi) libiscsi="yes"
+  ;;
   --enable-profiler) profiler="yes"
   ;;
   --enable-cocoa)
@@ -1037,6 +1042,8 @@  echo "                           Default:trace-<pid>"
 echo "  --disable-spice          disable spice"
 echo "  --enable-spice           enable spice"
 echo "  --enable-rbd             enable building the rados block device (rbd)"
+echo "  --disable-libiscsi       disable iscsi support"
+echo "  --enable-libiscsi        enable iscsi support"
 echo "  --disable-smartcard      disable smartcard support"
 echo "  --enable-smartcard       enable smartcard support"
 echo "  --disable-smartcard-nss  disable smartcard nss support"
@@ -2326,6 +2333,25 @@  if compile_prog "" "" ; then
 fi
 
 ##########################################
+# Do we have libiscsi
+if test "$libiscsi" != "no" ; then
+  cat > $TMPC << EOF
+#include <iscsi/iscsi.h>
+int main(void) { iscsi_create_context(""); return 0; }
+EOF
+  if compile_prog "-Werror" "-liscsi" ; then
+    libiscsi="yes"
+    LIBS="$LIBS -liscsi"
+  else
+    if test "$libiscsi" = "yes" ; then
+      feature_not_found "libiscsi"
+    fi
+    libiscsi="no"
+  fi
+fi
+
+
+##########################################
 # Do we need librt
 cat > $TMPC <<EOF
 #include <signal.h>
@@ -2727,6 +2753,7 @@  echo "xfsctl support    $xfs"
 echo "nss used          $smartcard_nss"
 echo "usb net redir     $usb_redir"
 echo "OpenGL support    $opengl"
+echo "libiscsi support  $libiscsi"
 echo "build guest agent $guest_agent"
 
 if test "$sdl_too_old" = "yes"; then
@@ -3028,6 +3055,10 @@  if test "$opengl" = "yes" ; then
   echo "CONFIG_OPENGL=y" >> $config_host_mak
 fi
 
+if test "$libiscsi" = "yes" ; then
+  echo "CONFIG_LIBISCSI=y" >> $config_host_mak
+fi
+
 # XXX: suppress that
 if [ "$bsd" = "yes" ] ; then
   echo "CONFIG_BSD=y" >> $config_host_mak
diff --git a/trace-events b/trace-events
index 3fdd60f..4e461e5 100644
--- a/trace-events
+++ b/trace-events
@@ -494,3 +494,10 @@  escc_sunkbd_event_in(int ch) "Untranslated keycode %2.2x"
 escc_sunkbd_event_out(int ch) "Translated keycode %2.2x"
 escc_kbd_command(int val) "Command %d"
 escc_sunmouse_event(int dx, int dy, int buttons_state) "dx=%d dy=%d buttons=%01x"
+
+# block/iscsi.c
+disable iscsi_aio_write10_cb(void *iscsi, int status, void *acb, int canceled) "iscsi %p status %d acb %p canceled %d"
+disable iscsi_aio_writev(void *iscsi, int64_t sector_num, int nb_sectors, void *opaque, void *acb) "iscsi %p sector_num %"PRId64" nb_sectors %d opaque %p acb %p"
+disable iscsi_aio_read10_cb(void *iscsi, int status, void *acb, int canceled) "iscsi %p status %d acb %p canceled %d"
+disable iscsi_aio_readv(void *iscsi, int64_t sector_num, int nb_sectors, void *opaque, void *acb) "iscsi %p sector_num %"PRId64" nb_sectors %d opaque %p acb %p"
+