[PATCHv2,RESEND] block/iscsi: speed up read for unallocated sectors

Message ID 535E3764.303@kamp.de
State New

Commit Message

Peter Lieven April 28, 2014, 11:11 a.m. UTC
This patch implements a cache that tracks whether a page on the
iscsi target is allocated or not. The cache is implemented in
a way that allows for false positives
(i.e. a page may be reported as allocated although it is not), but
no false negatives.
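
For illustration, a minimal worked example of the rounding that preserves
this invariant, assuming a hypothetical cluster_sectors of 128 and a request
covering sectors 0..199 (all of cluster 0 plus part of cluster 1):

  set   (after a write):   start = 0 / 128 = 0
                           count = DIV_ROUND_UP(200, 128) = 2
                           -> clusters 0 and 1 are marked allocated, although
                              cluster 1 was only partially written
                              (a false positive, which is allowed)

  clear (after a discard): start = DIV_ROUND_UP(0, 128) = 0
                           count = 200 / 128 - 0 = 1
                           -> only cluster 0 is cleared; the partially
                              discarded cluster 1 stays marked allocated
                              (never a false negative)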

The cached allocation info is then used to speed up the
read process for unallocated sectors by issuing a GET_LBA_STATUS
request for all sectors that are not yet known to be allocated.
If the read request is confirmed to fall into an unallocated
range we directly return zeroes and do not transfer the
data over the wire.
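
Condensed, the read fast path added by this patch looks roughly as follows
(this is only a sketch of the iscsi_co_readv hunk below, not extra code):

    if (iscsilun->lbprz && nb_sectors > ISCSI_CHECKALLOC_THRES &&
        !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
        int pnum;
        int64_t ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) {
            /* the whole request is unallocated and reads as zero:
             * fill the iovec locally instead of transferring zeroes */
            qemu_iovec_memset(iov, 0, 0x00, iov->size);
            return 0;
        }
    }
    /* otherwise fall through to the normal READ16 path */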

Tests have shown that a relatively small number of GET_LBA_STATUS
requests happens at vServer boot time to fill the allocation cache
(those blocks are not queried again).

Not transferring the data of unallocated sectors saves a lot
of time, bandwidth and storage I/O load during block jobs or storage
migration, and it also saves a lot of bandwidth for any big sequential
read of the whole disk (e.g. block copy or speed tests) if a significant
number of blocks is unallocated.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
v1 -> v2: - fix rounding in iscsi_allocationmap_clear().
          - check for iscsilun->allocationmap == NULL instead
            of iscsilun->cluster_size == 0 in various places,
            because we can have a cluster_size but no allocationmap,
            while we can't have an allocationmap without a cluster_size.
RFC -> v1: - check that BDRV_O_NOCACHE is clear
           - move set/clear of allocation map to iscsi_co_get_block_status
           - clear allocation map in iscsi_co_discard also

 block/iscsi.c |  300 +++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 198 insertions(+), 102 deletions(-)

Comments

Paolo Bonzini April 28, 2014, 1:22 p.m. UTC | #1
On 28/04/2014 13:11, Peter Lieven wrote:
> diff --git a/block/iscsi.c b/block/iscsi.c
> index b490e98..9f5b4a0 100644
> --- a/block/iscsi.c
> +++ b/block/iscsi.c
> @@ -30,6 +30,8 @@
>  #include "qemu-common.h"
>  #include "qemu/config-file.h"
>  #include "qemu/error-report.h"
> +#include "qemu/bitops.h"
> +#include "qemu/bitmap.h"
>  #include "block/block_int.h"
>  #include "trace.h"
>  #include "block/scsi.h"
> @@ -59,6 +61,8 @@ typedef struct IscsiLun {
>      struct scsi_inquiry_logical_block_provisioning lbp;
>      struct scsi_inquiry_block_limits bl;
>      unsigned char *zeroblock;
> +    unsigned long *allocationmap;
> +    int cluster_sectors;
>  } IscsiLun;
>
>  typedef struct IscsiTask {
> @@ -91,6 +95,7 @@ typedef struct IscsiAIOCB {
>  #define NOP_INTERVAL 5000
>  #define MAX_NOP_FAILURES 3
>  #define ISCSI_CMD_RETRIES 5
> +#define ISCSI_CHECKALLOC_THRES 63

One thing that crossed my mind.  You have a threshold of 64 sectors, 
which is a nice power of two and usually smaller than the minimum 
"interesting" opt_unmap_gran (64K).  So perhaps one of the following is 
true:

a) the threshold should be 128

b) the allocationmap should be allocated even for out-of-range 
opt_unmap_gran, using a granularity of 64 sectors in that case.

c) an opt_unmap_gran of 32K should be used by QEMU as a cluster size

d) the threshold should be dynamic and equal to iscsilun->cluster_sectors

e) c+d are both true
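
For reference, the sector arithmetic behind these options, assuming the usual
512-byte QEMU sector size:

  64 sectors * 512 bytes =  32 KiB     (the current threshold, ISCSI_CHECKALLOC_THRES + 1)
  64 KiB / 512 bytes     = 128 sectors (the smallest "interesting" opt_unmap_gran)
  32 KiB / 512 bytes     =  64 sectors (the granularity suggested in (b) and (c))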

If you respin for any of the above changes, please change uses of 
ISCSI_CHECKALLOC_THRES to compare with >=.  It would simplify the logic 
a bit, and the non-power-of-two makes people scratch their heads. 
Otherwise please post a patch to be applied on top.  For now, I'm 
holding this patch.

Paolo

>  static void
>  iscsi_bh_cb(void *p)
> @@ -273,6 +278,32 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
>      return 1;
>  }
>
> +static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num,
> +                                    int nb_sectors)
> +{
> +    if (iscsilun->allocationmap == NULL) {
> +        return;
> +    }
> +    bitmap_set(iscsilun->allocationmap,
> +               sector_num / iscsilun->cluster_sectors,
> +               DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
> +}
> +
> +static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num,
> +                                      int nb_sectors)
> +{
> +    int64_t cluster_num, nb_clusters;
> +    if (iscsilun->allocationmap == NULL) {
> +        return;
> +    }
> +    cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
> +    nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors
> +                  - cluster_num;
> +    if (nb_clusters > 0) {
> +        bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters);
> +    }
> +}
> +
>  static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
>                                          int64_t sector_num, int nb_sectors,
>                                          QEMUIOVector *iov)
> @@ -336,9 +367,127 @@ retry:
>          return -EIO;
>      }
>
> +    iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
> +
>      return 0;
>  }
>
> +
> +static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun,
> +                                             int64_t sector_num, int nb_sectors)
> +{
> +    unsigned long size;
> +    if (iscsilun->allocationmap == NULL) {
> +        return true;
> +    }
> +    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
> +    return !(find_next_bit(iscsilun->allocationmap, size,
> +                           sector_num / iscsilun->cluster_sectors) == size);
> +}
> +
> +
> +#if defined(LIBISCSI_FEATURE_IOVECTOR)
> +
> +static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
> +                                                  int64_t sector_num,
> +                                                  int nb_sectors, int *pnum)
> +{
> +    IscsiLun *iscsilun = bs->opaque;
> +    struct scsi_get_lba_status *lbas = NULL;
> +    struct scsi_lba_status_descriptor *lbasd = NULL;
> +    struct IscsiTask iTask;
> +    int64_t ret;
> +
> +    iscsi_co_init_iscsitask(iscsilun, &iTask);
> +
> +    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
> +    /* default to all sectors allocated */
> +    ret = BDRV_BLOCK_DATA;
> +    ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID;
> +    *pnum = nb_sectors;
> +
> +    /* LUN does not support logical block provisioning */
> +    if (iscsilun->lbpme == 0) {
> +        goto out;
> +    }
> +
> +retry:
> +    if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun,
> +                                  sector_qemu2lun(sector_num, iscsilun),
> +                                  8 + 16, iscsi_co_generic_cb,
> +                                  &iTask) == NULL) {
> +        ret = -ENOMEM;
> +        goto out;
> +    }
> +
> +    while (!iTask.complete) {
> +        iscsi_set_events(iscsilun);
> +        qemu_coroutine_yield();
> +    }
> +
> +    if (iTask.do_retry) {
> +        if (iTask.task != NULL) {
> +            scsi_free_scsi_task(iTask.task);
> +            iTask.task = NULL;
> +        }
> +        iTask.complete = 0;
> +        goto retry;
> +    }
> +
> +    if (iTask.status != SCSI_STATUS_GOOD) {
> +        /* in case the get_lba_status_callout fails (i.e.
> +         * because the device is busy or the cmd is not
> +         * supported) we pretend all blocks are allocated
> +         * for backwards compatibility */
> +        goto out;
> +    }
> +
> +    lbas = scsi_datain_unmarshall(iTask.task);
> +    if (lbas == NULL) {
> +        ret = -EIO;
> +        goto out;
> +    }
> +
> +    lbasd = &lbas->descriptors[0];
> +
> +    if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) {
> +        ret = -EIO;
> +        goto out;
> +    }
> +
> +    *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun);
> +
> +    if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED ||
> +        lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) {
> +        ret &= ~BDRV_BLOCK_DATA;
> +        if (iscsilun->lbprz) {
> +            ret |= BDRV_BLOCK_ZERO;
> +        }
> +    }
> +
> +    if (ret & BDRV_BLOCK_ZERO) {
> +        iscsi_allocationmap_clear(iscsilun, sector_num, *pnum);
> +    } else {
> +        iscsi_allocationmap_set(iscsilun, sector_num, *pnum);
> +    }
> +
> +    if (*pnum > nb_sectors) {
> +        *pnum = nb_sectors;
> +    }
> +out:
> +    if (iTask.task != NULL) {
> +        scsi_free_scsi_task(iTask.task);
> +    }
> +    return ret;
> +}
> +
> +#endif /* LIBISCSI_FEATURE_IOVECTOR */
> +
> +
>  static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
>                                         int64_t sector_num, int nb_sectors,
>                                         QEMUIOVector *iov)
> @@ -355,6 +504,22 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
>          return -EINVAL;
>      }
>
> +#if defined(LIBISCSI_FEATURE_IOVECTOR)
> +    if (iscsilun->lbprz && nb_sectors > ISCSI_CHECKALLOC_THRES &&
> +        !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
> +        int64_t ret;
> +        int pnum;
> +        ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) {
> +            qemu_iovec_memset(iov, 0, 0x00, iov->size);
> +            return 0;
> +        }
> +    }
> +#endif
> +
>      lba = sector_qemu2lun(sector_num, iscsilun);
>      num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
>
> @@ -639,101 +804,6 @@ iscsi_getlength(BlockDriverState *bs)
>      return len;
>  }
>
> -#if defined(LIBISCSI_FEATURE_IOVECTOR)
> -
> -static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
> -                                                  int64_t sector_num,
> -                                                  int nb_sectors, int *pnum)
> -{
> -    IscsiLun *iscsilun = bs->opaque;
> -    struct scsi_get_lba_status *lbas = NULL;
> -    struct scsi_lba_status_descriptor *lbasd = NULL;
> -    struct IscsiTask iTask;
> -    int64_t ret;
> -
> -    iscsi_co_init_iscsitask(iscsilun, &iTask);
> -
> -    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
> -        ret = -EINVAL;
> -        goto out;
> -    }
> -
> -    /* default to all sectors allocated */
> -    ret = BDRV_BLOCK_DATA;
> -    ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID;
> -    *pnum = nb_sectors;
> -
> -    /* LUN does not support logical block provisioning */
> -    if (iscsilun->lbpme == 0) {
> -        goto out;
> -    }
> -
> -retry:
> -    if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun,
> -                                  sector_qemu2lun(sector_num, iscsilun),
> -                                  8 + 16, iscsi_co_generic_cb,
> -                                  &iTask) == NULL) {
> -        ret = -ENOMEM;
> -        goto out;
> -    }
> -
> -    while (!iTask.complete) {
> -        iscsi_set_events(iscsilun);
> -        qemu_coroutine_yield();
> -    }
> -
> -    if (iTask.do_retry) {
> -        if (iTask.task != NULL) {
> -            scsi_free_scsi_task(iTask.task);
> -            iTask.task = NULL;
> -        }
> -        iTask.complete = 0;
> -        goto retry;
> -    }
> -
> -    if (iTask.status != SCSI_STATUS_GOOD) {
> -        /* in case the get_lba_status_callout fails (i.e.
> -         * because the device is busy or the cmd is not
> -         * supported) we pretend all blocks are allocated
> -         * for backwards compatibility */
> -        goto out;
> -    }
> -
> -    lbas = scsi_datain_unmarshall(iTask.task);
> -    if (lbas == NULL) {
> -        ret = -EIO;
> -        goto out;
> -    }
> -
> -    lbasd = &lbas->descriptors[0];
> -
> -    if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) {
> -        ret = -EIO;
> -        goto out;
> -    }
> -
> -    *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun);
> -    if (*pnum > nb_sectors) {
> -        *pnum = nb_sectors;
> -    }
> -
> -    if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED ||
> -        lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) {
> -        ret &= ~BDRV_BLOCK_DATA;
> -        if (iscsilun->lbprz) {
> -            ret |= BDRV_BLOCK_ZERO;
> -        }
> -    }
> -
> -out:
> -    if (iTask.task != NULL) {
> -        scsi_free_scsi_task(iTask.task);
> -    }
> -    return ret;
> -}
> -
> -#endif /* LIBISCSI_FEATURE_IOVECTOR */
> -
>  static int
>  coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
>                                     int nb_sectors)
> @@ -787,6 +857,8 @@ retry:
>          return -EIO;
>      }
>
> +    iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
> +
>      return 0;
>  }
>
> @@ -859,6 +931,12 @@ retry:
>          return -EIO;
>      }
>
> +    if (flags & BDRV_REQ_MAY_UNMAP) {
> +        iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
> +    } else {
> +        iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
> +    }
> +
>      return 0;
>  }
>
> @@ -1288,6 +1366,22 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
>      timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
>  #endif
>
> +    /* Guess the internal cluster (page) size of the iscsi target by the means
> +     * of opt_unmap_gran. Transfer the unmap granularity only if it has a
> +     * reasonable size */
> +    if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 &&
> +        iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) {
> +        iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
> +                                     iscsilun->block_size) >> BDRV_SECTOR_BITS;
> +#if defined(LIBISCSI_FEATURE_IOVECTOR)
> +        if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) {
> +            iscsilun->allocationmap =
> +                bitmap_new(DIV_ROUND_UP(bs->total_sectors,
> +                                        iscsilun->cluster_sectors));
> +        }
> +#endif
> +    }
> +
>  out:
>      qemu_opts_del(opts);
>      if (initiator_name != NULL) {
> @@ -1321,6 +1415,7 @@ static void iscsi_close(BlockDriverState *bs)
>      qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL);
>      iscsi_destroy_context(iscsi);
>      g_free(iscsilun->zeroblock);
> +    g_free(iscsilun->allocationmap);
>      memset(iscsilun, 0, sizeof(IscsiLun));
>  }
>
> @@ -1379,6 +1474,13 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
>          return -EINVAL;
>      }
>
> +    if (iscsilun->allocationmap != NULL) {
> +        g_free(iscsilun->allocationmap);
> +        iscsilun->allocationmap =
> +            bitmap_new(DIV_ROUND_UP(bs->total_sectors,
> +                                    iscsilun->cluster_sectors));
> +    }
> +
>      return 0;
>  }
>
> @@ -1441,13 +1543,7 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
>      IscsiLun *iscsilun = bs->opaque;
>      bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz;
>      bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws;
> -    /* Guess the internal cluster (page) size of the iscsi target by the means
> -     * of opt_unmap_gran. Transfer the unmap granularity only if it has a
> -     * reasonable size for bdi->cluster_size */
> -    if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 &&
> -        iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) {
> -        bdi->cluster_size = iscsilun->bl.opt_unmap_gran * iscsilun->block_size;
> -    }
> +    bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE;
>      return 0;
>  }
>
>
Peter Lieven April 28, 2014, 1:43 p.m. UTC | #2
On 28.04.2014 15:22, Paolo Bonzini wrote:
> On 28/04/2014 13:11, Peter Lieven wrote:
>> diff --git a/block/iscsi.c b/block/iscsi.c
>> index b490e98..9f5b4a0 100644
>> --- a/block/iscsi.c
>> +++ b/block/iscsi.c
>> @@ -30,6 +30,8 @@
>>  #include "qemu-common.h"
>>  #include "qemu/config-file.h"
>>  #include "qemu/error-report.h"
>> +#include "qemu/bitops.h"
>> +#include "qemu/bitmap.h"
>>  #include "block/block_int.h"
>>  #include "trace.h"
>>  #include "block/scsi.h"
>> @@ -59,6 +61,8 @@ typedef struct IscsiLun {
>>      struct scsi_inquiry_logical_block_provisioning lbp;
>>      struct scsi_inquiry_block_limits bl;
>>      unsigned char *zeroblock;
>> +    unsigned long *allocationmap;
>> +    int cluster_sectors;
>>  } IscsiLun;
>>
>>  typedef struct IscsiTask {
>> @@ -91,6 +95,7 @@ typedef struct IscsiAIOCB {
>>  #define NOP_INTERVAL 5000
>>  #define MAX_NOP_FAILURES 3
>>  #define ISCSI_CMD_RETRIES 5
>> +#define ISCSI_CHECKALLOC_THRES 63

My intention in introducing this threshold was to have a trade-off between how much it hurts to read unallocated sectors with READ16 directly, and how much it hurts to ask for the allocation
status only to find out that the sectors are allocated and issue the READ16 anyway. I was not intending to correlate this with the cluster_size. I just wanted to avoid asking for the LBA
status for e.g. a 4K read; in that case we just read.
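
In concrete numbers (assuming 512-byte sectors) the threshold means:

  4 KiB read  =  8 sectors -> below the threshold, READ16 is issued directly
  32 KiB read = 64 sectors -> above the threshold (nb_sectors > 63), so the
                              allocation map is consulted and, if the range is
                              not known to be allocated, GET_LBA_STATUS is
                              issued first (possibly avoiding the READ16 entirely)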

>
> One thing that crossed my mind.  You have a threshold of 64 sectors, which is a nice power of two and usually smaller than the minimum "interesting" opt_unmap_gran (64K).  So perhaps one of the following is true:
>
> a) the threshold should be 128
As stated above the THRESHOLD was introduced to justify the additional GET_LBA_STATUS callout.

>
> b) the allocationmap should be allocated even for out-of-range opt_unmap_gran, using a granularity of 64 sectors in that case.

Would the increase of the resolution bring any benefit? If we increase the resolution I think all sectors falling into the same
physical cluster would end up with the same status.
>
> c) an opt_unmap_gran of 32K should be used by QEMU as a cluster size

Do we have evidence of used cluster sizes on real hardware? I only know our Equallogic storages which have 30720 sectors.

In general I don't mind using any non-zero value here. Worst case, we would end up with an allocation map
1/4096 the size of the target. Maybe the lower limit should be 8 sectors (4K cluster size), which would mean
1/32768 of the target size. For a 100GB target this would mean an allocation map of 3.2MB.
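
The arithmetic behind that worst case and the 4K example (taking 100GB as GiB):

  512-byte clusters: 1 bit per  512 bytes -> map size = target size / 4096
  4 KiB clusters:    1 bit per 4096 bytes -> map size = target size / 32768
  100 GiB / 32768    = 3,276,800 bytes ~ 3.2 MB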

>
> d) the threshold should be dynamic and equal to iscsilun->cluster_sectors

I don't think this is useful. For big cluster_sizes we would end up never using the allocation cache.

>
> e) c+d are both true
>
> If you respin for any of the above changes, please change uses of ISCSI_CHECKALLOC_THRES to compare with >=.  It would simplify the logic a bit, and the non-power-of-two makes people scratch their heads. Otherwise please post a patch to be applied on top.  For now, I'm holding this patch.

Peter
Paolo Bonzini April 28, 2014, 2:39 p.m. UTC | #3
On 28/04/2014 15:43, Peter Lieven wrote:
>> b) the allocationmap should be allocated even for out-of-range opt_unmap_gran, using a granularity of 64 sectors in that case.
>
> Would the increase of the resolution bring any benefit? If we increase the resolution I think all sectors falling into the same
> physical cluster would end up with the same status.

What if opt_unmap_gran is 32K or lower?  In this case you're not using 
an allocationmap.

>> c) an opt_unmap_gran of 32K should be used by QEMU as a cluster size
>
> Do we have evidence of used cluster sizes on real hardware? I only know our Equallogic storages which have 30720 sectors.

IDE does not specify the granularity, so an IDE disk will show a 
granularity of 1 sector (but it has LBPRZ=0).

>> d) the threshold should be dynamic and equal to iscsilun->cluster_sectors
>
> I don't think this is useful. For big cluster_sizes we would end up never using the allocation cache.

Right.

Paolo
Peter Lieven April 28, 2014, 2:41 p.m. UTC | #4
On 28.04.2014 16:39, Paolo Bonzini wrote:
> On 28/04/2014 15:43, Peter Lieven wrote:
>>> b) the allocationmap should be allocated even for out-of-range opt_unmap_gran, using a granularity of 64 sectors in that case.
>>
>> Would the increase of the resolution bring any benefit? If we increase the resolution I think all sectors falling into the same
>> physical cluster would end up with the same status.
>
> What if opt_unmap_gran is 32K or lower?  In this case you're not using an allocationmap.

As written I am fine with lowering this to 4K.

Follow-Up or v3?

Peter

>
>>> c) an opt_unmap_gran of 32K should be used by QEMU as a cluster size
>>
>> Do we have evidence of used cluster sizes on real hardware? I only know our Equallogic storages which have 30720 sectors.
>
> IDE does not specify the granularity, so an IDE disk will show a granularity of 1 sector (but it has LBPRZ=0).
>
>>> d) the threshold should be dynamic and equal to iscsilun->cluster_sectors
>>
>> I don't think this is useful. For big cluster_sizes we would end up never using the allocation cache.
>
> Right.
>
> Paolo
Paolo Bonzini April 28, 2014, 2:56 p.m. UTC | #5
On 28/04/2014 16:41, Peter Lieven wrote:
>> > What if opt_unmap_gran is 32K or lower?  In this case you're not using an allocationmap.
> As written I am fine with lowering this to 4K.
>
> Follow-Up or v3?

Follow up is okay.

Paolo
Peter Lieven April 29, 2014, 8:57 a.m. UTC | #6
On 28.04.2014 16:56, Paolo Bonzini wrote:
> On 28/04/2014 16:41, Peter Lieven wrote:
>>> > What if opt_unmap_gran is 32K or lower?  In this case you're not using an allocationmap.
>> As written I am fine with lowering this to 4K.
>>
>> Follow-Up or v3?
>
> Follow up is okay.

2 follow-ups sent. I also added some comments about the threshold to explain its meaning.

Peter
Paolo Bonzini April 29, 2014, 9:15 a.m. UTC | #7
On 29/04/2014 10:57, Peter Lieven wrote:
> On 28.04.2014 16:56, Paolo Bonzini wrote:
>> On 28/04/2014 16:41, Peter Lieven wrote:
>>>>> What if opt_unmap_gran is 32K or lower?  In this case you're not using an allocationmap.
>>> As written I am fine with lowering this to 4K.
>>>
>>> Follow-Up or v3?
>>
>> Follow up is okay.
>
> 2 follow-ups sent. I also added some comments about the threshold to explain its meaning.

Applying all three patches.

Paolo

Patch

diff --git a/block/iscsi.c b/block/iscsi.c
index b490e98..9f5b4a0 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -30,6 +30,8 @@ 
 #include "qemu-common.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
+#include "qemu/bitops.h"
+#include "qemu/bitmap.h"
 #include "block/block_int.h"
 #include "trace.h"
 #include "block/scsi.h"
@@ -59,6 +61,8 @@  typedef struct IscsiLun {
     struct scsi_inquiry_logical_block_provisioning lbp;
     struct scsi_inquiry_block_limits bl;
     unsigned char *zeroblock;
+    unsigned long *allocationmap;
+    int cluster_sectors;
 } IscsiLun;
 
 typedef struct IscsiTask {
@@ -91,6 +95,7 @@  typedef struct IscsiAIOCB {
 #define NOP_INTERVAL 5000
 #define MAX_NOP_FAILURES 3
 #define ISCSI_CMD_RETRIES 5
+#define ISCSI_CHECKALLOC_THRES 63
 
 static void
 iscsi_bh_cb(void *p)
@@ -273,6 +278,32 @@  static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
     return 1;
 }
 
+static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num,
+                                    int nb_sectors)
+{
+    if (iscsilun->allocationmap == NULL) {
+        return;
+    }
+    bitmap_set(iscsilun->allocationmap,
+               sector_num / iscsilun->cluster_sectors,
+               DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
+}
+
+static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num,
+                                      int nb_sectors)
+{
+    int64_t cluster_num, nb_clusters;
+    if (iscsilun->allocationmap == NULL) {
+        return;
+    }
+    cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
+    nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors
+                  - cluster_num;
+    if (nb_clusters > 0) {
+        bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters);
+    }
+}
+
 static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
@@ -336,9 +367,127 @@  retry:
         return -EIO;
     }
 
+    iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
+
     return 0;
 }
 
+
+static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun,
+                                             int64_t sector_num, int nb_sectors)
+{
+    unsigned long size;
+    if (iscsilun->allocationmap == NULL) {
+        return true;
+    }
+    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
+    return !(find_next_bit(iscsilun->allocationmap, size,
+                           sector_num / iscsilun->cluster_sectors) == size);
+}
+
+
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+
+static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
+                                                  int64_t sector_num,
+                                                  int nb_sectors, int *pnum)
+{
+    IscsiLun *iscsilun = bs->opaque;
+    struct scsi_get_lba_status *lbas = NULL;
+    struct scsi_lba_status_descriptor *lbasd = NULL;
+    struct IscsiTask iTask;
+    int64_t ret;
+
+    iscsi_co_init_iscsitask(iscsilun, &iTask);
+
+    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* default to all sectors allocated */
+    ret = BDRV_BLOCK_DATA;
+    ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID;
+    *pnum = nb_sectors;
+
+    /* LUN does not support logical block provisioning */
+    if (iscsilun->lbpme == 0) {
+        goto out;
+    }
+
+retry:
+    if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun,
+                                  sector_qemu2lun(sector_num, iscsilun),
+                                  8 + 16, iscsi_co_generic_cb,
+                                  &iTask) == NULL) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    while (!iTask.complete) {
+        iscsi_set_events(iscsilun);
+        qemu_coroutine_yield();
+    }
+
+    if (iTask.do_retry) {
+        if (iTask.task != NULL) {
+            scsi_free_scsi_task(iTask.task);
+            iTask.task = NULL;
+        }
+        iTask.complete = 0;
+        goto retry;
+    }
+
+    if (iTask.status != SCSI_STATUS_GOOD) {
+        /* in case the get_lba_status_callout fails (i.e.
+         * because the device is busy or the cmd is not
+         * supported) we pretend all blocks are allocated
+         * for backwards compatibility */
+        goto out;
+    }
+
+    lbas = scsi_datain_unmarshall(iTask.task);
+    if (lbas == NULL) {
+        ret = -EIO;
+        goto out;
+    }
+
+    lbasd = &lbas->descriptors[0];
+
+    if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) {
+        ret = -EIO;
+        goto out;
+    }
+
+    *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun);
+
+    if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED ||
+        lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) {
+        ret &= ~BDRV_BLOCK_DATA;
+        if (iscsilun->lbprz) {
+            ret |= BDRV_BLOCK_ZERO;
+        }
+    }
+
+    if (ret & BDRV_BLOCK_ZERO) {
+        iscsi_allocationmap_clear(iscsilun, sector_num, *pnum);
+    } else {
+        iscsi_allocationmap_set(iscsilun, sector_num, *pnum);
+    }
+
+    if (*pnum > nb_sectors) {
+        *pnum = nb_sectors;
+    }
+out:
+    if (iTask.task != NULL) {
+        scsi_free_scsi_task(iTask.task);
+    }
+    return ret;
+}
+
+#endif /* LIBISCSI_FEATURE_IOVECTOR */
+
+
 static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
                                        int64_t sector_num, int nb_sectors,
                                        QEMUIOVector *iov)
@@ -355,6 +504,22 @@  static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
         return -EINVAL;
     }
 
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+    if (iscsilun->lbprz && nb_sectors > ISCSI_CHECKALLOC_THRES &&
+        !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
+        int64_t ret;
+        int pnum;
+        ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum);
+        if (ret < 0) {
+            return ret;
+        }
+        if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) {
+            qemu_iovec_memset(iov, 0, 0x00, iov->size);
+            return 0;
+        }
+    }
+#endif
+
     lba = sector_qemu2lun(sector_num, iscsilun);
     num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
 
@@ -639,101 +804,6 @@  iscsi_getlength(BlockDriverState *bs)
     return len;
 }
 
-#if defined(LIBISCSI_FEATURE_IOVECTOR)
-
-static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
-                                                  int64_t sector_num,
-                                                  int nb_sectors, int *pnum)
-{
-    IscsiLun *iscsilun = bs->opaque;
-    struct scsi_get_lba_status *lbas = NULL;
-    struct scsi_lba_status_descriptor *lbasd = NULL;
-    struct IscsiTask iTask;
-    int64_t ret;
-
-    iscsi_co_init_iscsitask(iscsilun, &iTask);
-
-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
-        ret = -EINVAL;
-        goto out;
-    }
-
-    /* default to all sectors allocated */
-    ret = BDRV_BLOCK_DATA;
-    ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID;
-    *pnum = nb_sectors;
-
-    /* LUN does not support logical block provisioning */
-    if (iscsilun->lbpme == 0) {
-        goto out;
-    }
-
-retry:
-    if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun,
-                                  sector_qemu2lun(sector_num, iscsilun),
-                                  8 + 16, iscsi_co_generic_cb,
-                                  &iTask) == NULL) {
-        ret = -ENOMEM;
-        goto out;
-    }
-
-    while (!iTask.complete) {
-        iscsi_set_events(iscsilun);
-        qemu_coroutine_yield();
-    }
-
-    if (iTask.do_retry) {
-        if (iTask.task != NULL) {
-            scsi_free_scsi_task(iTask.task);
-            iTask.task = NULL;
-        }
-        iTask.complete = 0;
-        goto retry;
-    }
-
-    if (iTask.status != SCSI_STATUS_GOOD) {
-        /* in case the get_lba_status_callout fails (i.e.
-         * because the device is busy or the cmd is not
-         * supported) we pretend all blocks are allocated
-         * for backwards compatibility */
-        goto out;
-    }
-
-    lbas = scsi_datain_unmarshall(iTask.task);
-    if (lbas == NULL) {
-        ret = -EIO;
-        goto out;
-    }
-
-    lbasd = &lbas->descriptors[0];
-
-    if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) {
-        ret = -EIO;
-        goto out;
-    }
-
-    *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun);
-    if (*pnum > nb_sectors) {
-        *pnum = nb_sectors;
-    }
-
-    if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED ||
-        lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) {
-        ret &= ~BDRV_BLOCK_DATA;
-        if (iscsilun->lbprz) {
-            ret |= BDRV_BLOCK_ZERO;
-        }
-    }
-
-out:
-    if (iTask.task != NULL) {
-        scsi_free_scsi_task(iTask.task);
-    }
-    return ret;
-}
-
-#endif /* LIBISCSI_FEATURE_IOVECTOR */
-
 static int
 coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
                                    int nb_sectors)
@@ -787,6 +857,8 @@  retry:
         return -EIO;
     }
 
+    iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
+
     return 0;
 }
 
@@ -859,6 +931,12 @@  retry:
         return -EIO;
     }
 
+    if (flags & BDRV_REQ_MAY_UNMAP) {
+        iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
+    } else {
+        iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
+    }
+
     return 0;
 }
 
@@ -1288,6 +1366,22 @@  static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
     timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
 #endif
 
+    /* Guess the internal cluster (page) size of the iscsi target by the means
+     * of opt_unmap_gran. Transfer the unmap granularity only if it has a
+     * reasonable size */
+    if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 &&
+        iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) {
+        iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
+                                     iscsilun->block_size) >> BDRV_SECTOR_BITS;
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+        if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) {
+            iscsilun->allocationmap =
+                bitmap_new(DIV_ROUND_UP(bs->total_sectors,
+                                        iscsilun->cluster_sectors));
+        }
+#endif
+    }
+
 out:
     qemu_opts_del(opts);
     if (initiator_name != NULL) {
@@ -1321,6 +1415,7 @@  static void iscsi_close(BlockDriverState *bs)
     qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL);
     iscsi_destroy_context(iscsi);
     g_free(iscsilun->zeroblock);
+    g_free(iscsilun->allocationmap);
     memset(iscsilun, 0, sizeof(IscsiLun));
 }
 
@@ -1379,6 +1474,13 @@  static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
         return -EINVAL;
     }
 
+    if (iscsilun->allocationmap != NULL) {
+        g_free(iscsilun->allocationmap);
+        iscsilun->allocationmap =
+            bitmap_new(DIV_ROUND_UP(bs->total_sectors,
+                                    iscsilun->cluster_sectors));
+    }
+
     return 0;
 }
 
@@ -1441,13 +1543,7 @@  static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
     IscsiLun *iscsilun = bs->opaque;
     bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz;
     bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws;
-    /* Guess the internal cluster (page) size of the iscsi target by the means
-     * of opt_unmap_gran. Transfer the unmap granularity only if it has a
-     * reasonable size for bdi->cluster_size */
-    if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 &&
-        iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) {
-        bdi->cluster_size = iscsilun->bl.opt_unmap_gran * iscsilun->block_size;
-    }
+    bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE;
     return 0;
 }