diff mbox

[1/2,v1] blkdrv: Add queue limits parameters for sg block drive

Message ID 1345537427-21601-1-git-send-email-mc@linux.vnet.ibm.com
State New
Headers show

Commit Message

Cong Meng Aug. 21, 2012, 8:23 a.m. UTC
This patch adds some queue limit parameters into block drive. And inits them
for sg block drive. Some interfaces are also added for accessing them.

Signed-off-by: Cong Meng <mc@linux.vnet.ibm.com>
---
 block/raw-posix.c |   58 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 block_int.h       |    4 +++
 blockdev.c        |   15 +++++++++++++
 hw/block-common.h |    3 ++
 4 files changed, 80 insertions(+), 0 deletions(-)

Comments

Paolo Bonzini Aug. 21, 2012, 8:48 a.m. UTC | #1
Il 21/08/2012 10:23, Cong Meng ha scritto:
> +static void sg_get_queue_limits(BlockDriverState *bs, const char *filename)
> +{
> +    DIR *ffs;
> +    struct dirent *d;
> +    char path[MAXPATHLEN];
> +
> +    snprintf(path, MAXPATHLEN,
> +             "/sys/class/scsi_generic/sg%s/device/block/",
> +             filename + strlen("/dev/sg"));
> +
> +    ffs = opendir(path);
> +    if (!ffs) {
> +        return;
> +    }
> +
> +    for (;;) {
> +        d = readdir(ffs);
> +        if (!d) {
> +            return;
> +        }
> +
> +        if (strcmp(d->d_name, ".") == 0 || strcmp(d->d_name, "..") == 0) {
> +            continue;
> +        }
> +
> +        break;
> +    }
> +
> +    closedir(ffs);
> +
> +    pstrcat(path, MAXPATHLEN, d->d_name);
> +    pstrcat(path, MAXPATHLEN, "/queue/");
> +
> +    read_queue_limit(path, "max_sectors_kb", &bs->max_sectors);
> +    read_queue_limit(path, "max_segments", &bs->max_segments);
> +    read_queue_limit(path, "max_segment_size", &bs->max_segment_size);
> +}

Using /sys/dev/block or /sys/dev/char seems easier, and lets you
retrieve the parameters for block devices too.

However, I'm worried of the consequences this has for migration.  You
could have the same physical disk accessed with two different HBAs, with
different limits.  So I don't know if this can really be solved at all.

Paolo
Cong Meng Aug. 21, 2012, 9:41 a.m. UTC | #2
On Tue 21 Aug 2012 04:48:17 PM CST, Paolo Bonzini wrote:
> Il 21/08/2012 10:23, Cong Meng ha scritto:
>> +static void sg_get_queue_limits(BlockDriverState *bs, const char *filename)
>> +{
>> +    DIR *ffs;
>> +    struct dirent *d;
>> +    char path[MAXPATHLEN];
>> +
>> +    snprintf(path, MAXPATHLEN,
>> +             "/sys/class/scsi_generic/sg%s/device/block/",
>> +             filename + strlen("/dev/sg"));
>> +
>> +    ffs = opendir(path);
>> +    if (!ffs) {
>> +        return;
>> +    }
>> +
>> +    for (;;) {
>> +        d = readdir(ffs);
>> +        if (!d) {
>> +            return;
>> +        }
>> +
>> +        if (strcmp(d->d_name, ".") == 0 || strcmp(d->d_name, "..") == 0) {
>> +            continue;
>> +        }
>> +
>> +        break;
>> +    }
>> +
>> +    closedir(ffs);
>> +
>> +    pstrcat(path, MAXPATHLEN, d->d_name);
>> +    pstrcat(path, MAXPATHLEN, "/queue/");
>> +
>> +    read_queue_limit(path, "max_sectors_kb", &bs->max_sectors);
>> +    read_queue_limit(path, "max_segments", &bs->max_segments);
>> +    read_queue_limit(path, "max_segment_size", &bs->max_segment_size);
>> +}
>
> Using /sys/dev/block or /sys/dev/char seems easier, and lets you
> retrieve the parameters for block devices too.
>
what do you mean with "block devices"?   Using "/dev/sda" instead of 
"/dev/sg0"?

> However, I'm worried of the consequences this has for migration.  You
> could have the same physical disk accessed with two different HBAs, with
> different limits.  So I don't know if this can really be solved at all.
>
I know little about qemu migration now.  The pending scsi commands will 
be saved and
transfered to remote machine when starting migration?

Cong.

> Paolo
>
Stefan Hajnoczi Aug. 21, 2012, 9:49 a.m. UTC | #3
On Tue, Aug 21, 2012 at 9:23 AM, Cong Meng <mc@linux.vnet.ibm.com> wrote:
> This patch adds some queue limit parameters into block drive. And inits them
> for sg block drive. Some interfaces are also added for accessing them.
>
> Signed-off-by: Cong Meng <mc@linux.vnet.ibm.com>
> ---
>  block/raw-posix.c |   58 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h       |    4 +++
>  blockdev.c        |   15 +++++++++++++
>  hw/block-common.h |    3 ++
>  4 files changed, 80 insertions(+), 0 deletions(-)
>
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 0dce089..a086f89 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -53,6 +53,7 @@
>  #include <linux/cdrom.h>
>  #include <linux/fd.h>
>  #include <linux/fs.h>
> +#include <dirent.h>
>  #endif
>  #ifdef CONFIG_FIEMAP
>  #include <linux/fiemap.h>
> @@ -829,6 +830,62 @@ static int hdev_probe_device(const char *filename)
>      return 0;
>  }
>
> +static void read_queue_limit(char *path, const char *filename, unsigned int *val)
> +{
> +    FILE *f;
> +    char *tail = path + strlen(path);
> +
> +    pstrcat(path, MAXPATHLEN, filename);
> +    f = fopen(path, "r");
> +    if (!f) {
> +        goto out;
> +    }
> +
> +    fscanf(f, "%u", val);
> +    fclose(f);
> +
> +out:
> +    *tail = 0;
> +}
> +
> +static void sg_get_queue_limits(BlockDriverState *bs, const char *filename)
> +{
> +    DIR *ffs;
> +    struct dirent *d;
> +    char path[MAXPATHLEN];
> +
> +    snprintf(path, MAXPATHLEN,
> +             "/sys/class/scsi_generic/sg%s/device/block/",
> +             filename + strlen("/dev/sg"));
> +
> +    ffs = opendir(path);
> +    if (!ffs) {
> +        return;
> +    }
> +
> +    for (;;) {
> +        d = readdir(ffs);
> +        if (!d) {
> +            return;

Leaks ffs

> +        }
> +
> +        if (strcmp(d->d_name, ".") == 0 || strcmp(d->d_name, "..") == 0) {
> +            continue;
> +        }
> +
> +        break;
> +    }

Not sure where in the kernel the block/ sysfs directory is created for
the device.  I wasn't able to check that there is only ever 1
directory entry for a SCSI device's block/.  Any ideas whether it's
safe to grab the first directory entry?

> +
> +    closedir(ffs);
> +
> +    pstrcat(path, MAXPATHLEN, d->d_name);
> +    pstrcat(path, MAXPATHLEN, "/queue/");
> +
> +    read_queue_limit(path, "max_sectors_kb", &bs->max_sectors);
> +    read_queue_limit(path, "max_segments", &bs->max_segments);
> +    read_queue_limit(path, "max_segment_size", &bs->max_segment_size);
> +}
> +
>  static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -868,6 +925,7 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
>          temp = realpath(filename, resolved_path);
>          if (temp && strstart(temp, "/dev/sg", NULL)) {
>              bs->sg = 1;
> +            sg_get_queue_limits(bs, temp);
>          }
>      }
>  #endif
> diff --git a/block_int.h b/block_int.h
> index d72317f..a9d07a2 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -333,6 +333,10 @@ struct BlockDriverState {
>
>      /* long-running background operation */
>      BlockJob *job;
> +
> +    unsigned int max_sectors;
> +    unsigned int max_segments;
> +    unsigned int max_segment_size;
>  };
>
>  int get_tmp_filename(char *filename, int size);
> diff --git a/blockdev.c b/blockdev.c
> index 3d75015..e17edbf 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -1191,3 +1191,18 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
>      bdrv_iterate(do_qmp_query_block_jobs_one, &prev);
>      return dummy.next;
>  }
> +
> +unsigned int get_queue_max_sectors(BlockDriverState *bs)

These should be bdrv_get_queue_max_sectors(BlockDriverState *bs) and
should live in block.c.

> +{
> +    return (bs->file && bs->file->sg) ? bs->file->max_sectors : 0;

The BlockDriver should be able to provide these values, we shouldn't
reach into bs->file.
Stefan Hajnoczi Aug. 21, 2012, 9:52 a.m. UTC | #4
On Tue, Aug 21, 2012 at 10:41 AM, Cong Meng <mc@linux.vnet.ibm.com> wrote:
>
>
> On Tue 21 Aug 2012 04:48:17 PM CST, Paolo Bonzini wrote:
>>
>> Il 21/08/2012 10:23, Cong Meng ha scritto:
>>>
>>> +static void sg_get_queue_limits(BlockDriverState *bs, const char
>>> *filename)
>>> +{
>>> +    DIR *ffs;
>>> +    struct dirent *d;
>>> +    char path[MAXPATHLEN];
>>> +
>>> +    snprintf(path, MAXPATHLEN,
>>> +             "/sys/class/scsi_generic/sg%s/device/block/",
>>> +             filename + strlen("/dev/sg"));
>>> +
>>> +    ffs = opendir(path);
>>> +    if (!ffs) {
>>> +        return;
>>> +    }
>>> +
>>> +    for (;;) {
>>> +        d = readdir(ffs);
>>> +        if (!d) {
>>> +            return;
>>> +        }
>>> +
>>> +        if (strcmp(d->d_name, ".") == 0 || strcmp(d->d_name, "..") == 0)
>>> {
>>> +            continue;
>>> +        }
>>> +
>>> +        break;
>>> +    }
>>> +
>>> +    closedir(ffs);
>>> +
>>> +    pstrcat(path, MAXPATHLEN, d->d_name);
>>> +    pstrcat(path, MAXPATHLEN, "/queue/");
>>> +
>>> +    read_queue_limit(path, "max_sectors_kb", &bs->max_sectors);
>>> +    read_queue_limit(path, "max_segments", &bs->max_segments);
>>> +    read_queue_limit(path, "max_segment_size", &bs->max_segment_size);
>>> +}
>>
>>
>> Using /sys/dev/block or /sys/dev/char seems easier, and lets you
>> retrieve the parameters for block devices too.
>>
> what do you mean with "block devices"?   Using "/dev/sda" instead of
> "/dev/sg0"?
>
>
>> However, I'm worried of the consequences this has for migration.  You
>> could have the same physical disk accessed with two different HBAs, with
>> different limits.  So I don't know if this can really be solved at all.
>>
> I know little about qemu migration now.  The pending scsi commands will be
> saved and
> transfered to remote machine when starting migration?

Passthrough is already a migration blocker if both hosts do not have
access to the same LUNs.

When both hosts do have access to the same LUNs it's possible to
extract the block queue limits (using sysfs) and compare them.

Today you can start QEMU with different image files on both hosts.
Migration will appear to work but the disk image on the destination
host could be junk.  This is a similar case, I don't see a problem
except that there should be a safety check (maybe at the libvirt
level) to make this safe.

Stefan
Paolo Bonzini Aug. 21, 2012, 10:14 a.m. UTC | #5
Il 21/08/2012 11:52, Stefan Hajnoczi ha scritto:
>>> >> Using /sys/dev/block or /sys/dev/char seems easier, and lets you
>>> >> retrieve the parameters for block devices too.
>>> >>
>> > what do you mean with "block devices"?   Using "/dev/sda" instead of
>> > "/dev/sg0"?

Yes.

>>> >> However, I'm worried of the consequences this has for migration.  You
>>> >> could have the same physical disk accessed with two different HBAs, with
>>> >> different limits.  So I don't know if this can really be solved at all.
>>> >>
>> > I know little about qemu migration now.  The pending scsi commands will be
>> > saved and
>> > transfered to remote machine when starting migration?
> 
> Passthrough is already a migration blocker if both hosts do not have
> access to the same LUNs.

Yes, but requiring the exact same hardware may be too much.  I'm trying
to understand the problem better before committing to a threefold
spec/qemu/kernel change.

Cong, what is the limit that the host HBA enforces (and what is the
HBA)?  What commands see a problem?  Is it fixed by using scsi-block
instead of scsi-generic (if you can use scsi-block at all, i.e. it's not
a tape or similar device)?

With scsi-generic, QEMU uses a bounce buffer for non-I/O commands to a
SCSI passthrough device, so the only problem in that case should be the
maximum segment size.  This could change in the future, but max_segments
and max_sectors should not yet be a problem.

With scsi-block, QEMU will use read/write on the block device and the
host kernel will then obey the host HBA's block limits.  QEMU will still
use a bounce buffer for non-I/O commands to a scsi-block device, but the
payload is usually small for non-I/O commands.

Paolo

> When both hosts do have access to the same LUNs it's possible to
> extract the block queue limits (using sysfs) and compare them.
> 
> Today you can start QEMU with different image files on both hosts.
> Migration will appear to work but the disk image on the destination
> host could be junk.  This is a similar case, I don't see a problem
> except that there should be a safety check (maybe at the libvirt
> level) to make this safe.
Blue Swirl Aug. 21, 2012, 6:31 p.m. UTC | #6
On Tue, Aug 21, 2012 at 8:23 AM, Cong Meng <mc@linux.vnet.ibm.com> wrote:
> This patch adds some queue limit parameters into block drive. And inits them
> for sg block drive. Some interfaces are also added for accessing them.
>
> Signed-off-by: Cong Meng <mc@linux.vnet.ibm.com>
> ---
>  block/raw-posix.c |   58 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h       |    4 +++
>  blockdev.c        |   15 +++++++++++++
>  hw/block-common.h |    3 ++
>  4 files changed, 80 insertions(+), 0 deletions(-)
>
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 0dce089..a086f89 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -53,6 +53,7 @@
>  #include <linux/cdrom.h>
>  #include <linux/fd.h>
>  #include <linux/fs.h>
> +#include <dirent.h>
>  #endif
>  #ifdef CONFIG_FIEMAP
>  #include <linux/fiemap.h>
> @@ -829,6 +830,62 @@ static int hdev_probe_device(const char *filename)
>      return 0;
>  }
>
> +static void read_queue_limit(char *path, const char *filename, unsigned int *val)
> +{
> +    FILE *f;
> +    char *tail = path + strlen(path);
> +
> +    pstrcat(path, MAXPATHLEN, filename);
> +    f = fopen(path, "r");
> +    if (!f) {
> +        goto out;
> +    }
> +
> +    fscanf(f, "%u", val);

Please handle errors, otherwise *val may be left to uninitialized state.

> +    fclose(f);
> +
> +out:
> +    *tail = 0;
> +}
> +
> +static void sg_get_queue_limits(BlockDriverState *bs, const char *filename)
> +{
> +    DIR *ffs;
> +    struct dirent *d;
> +    char path[MAXPATHLEN];
> +
> +    snprintf(path, MAXPATHLEN,
> +             "/sys/class/scsi_generic/sg%s/device/block/",
> +             filename + strlen("/dev/sg"));
> +

I'd init bs->max_sectors etc. here so they are not left uninitialized.

> +    ffs = opendir(path);
> +    if (!ffs) {
> +        return;
> +    }
> +
> +    for (;;) {
> +        d = readdir(ffs);
> +        if (!d) {
> +            return;
> +        }
> +
> +        if (strcmp(d->d_name, ".") == 0 || strcmp(d->d_name, "..") == 0) {
> +            continue;
> +        }
> +
> +        break;
> +    }
> +
> +    closedir(ffs);
> +
> +    pstrcat(path, MAXPATHLEN, d->d_name);
> +    pstrcat(path, MAXPATHLEN, "/queue/");
> +
> +    read_queue_limit(path, "max_sectors_kb", &bs->max_sectors);
> +    read_queue_limit(path, "max_segments", &bs->max_segments);
> +    read_queue_limit(path, "max_segment_size", &bs->max_segment_size);
> +}
> +
>  static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -868,6 +925,7 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
>          temp = realpath(filename, resolved_path);
>          if (temp && strstart(temp, "/dev/sg", NULL)) {
>              bs->sg = 1;
> +            sg_get_queue_limits(bs, temp);
>          }
>      }
>  #endif
> diff --git a/block_int.h b/block_int.h
> index d72317f..a9d07a2 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -333,6 +333,10 @@ struct BlockDriverState {
>
>      /* long-running background operation */
>      BlockJob *job;
> +
> +    unsigned int max_sectors;

With 32 bit ints and even with 4k sector size, the maximum disk size
would be 16TB which may be soon exceeded by disks on the market.
Please use 64 bit values, probably also for segment values below.

> +    unsigned int max_segments;
> +    unsigned int max_segment_size;
>  };
>
>  int get_tmp_filename(char *filename, int size);
> diff --git a/blockdev.c b/blockdev.c
> index 3d75015..e17edbf 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -1191,3 +1191,18 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
>      bdrv_iterate(do_qmp_query_block_jobs_one, &prev);
>      return dummy.next;
>  }
> +
> +unsigned int get_queue_max_sectors(BlockDriverState *bs)
> +{
> +    return (bs->file && bs->file->sg) ? bs->file->max_sectors : 0;
> +}
> +
> +unsigned int get_queue_max_segments(BlockDriverState *bs)
> +{
> +    return (bs->file && bs->file->sg) ? bs->file->max_segments : 0;
> +}
> +
> +unsigned int get_queue_max_segment_size(BlockDriverState *bs)
> +{
> +    return (bs->file && bs->file->sg) ? bs->file->max_segment_size : 0;
> +}
> diff --git a/hw/block-common.h b/hw/block-common.h
> index bb808f7..d47fcd7 100644
> --- a/hw/block-common.h
> +++ b/hw/block-common.h
> @@ -76,4 +76,7 @@ void hd_geometry_guess(BlockDriverState *bs,
>                         int *ptrans);
>  int hd_bios_chs_auto_trans(uint32_t cyls, uint32_t heads, uint32_t secs);
>
> +unsigned int get_queue_max_sectors(BlockDriverState *bs);
> +unsigned int get_queue_max_segments(BlockDriverState *bs);
> +unsigned int get_queue_max_segment_size(BlockDriverState *bs);
>  #endif
> --
> 1.7.7.6
>
>
Stefan Hajnoczi Aug. 22, 2012, 8:25 a.m. UTC | #7
On Tue, Aug 21, 2012 at 7:31 PM, Blue Swirl <blauwirbel@gmail.com> wrote:
> On Tue, Aug 21, 2012 at 8:23 AM, Cong Meng <mc@linux.vnet.ibm.com> wrote:
>> diff --git a/block_int.h b/block_int.h
>> index d72317f..a9d07a2 100644
>> --- a/block_int.h
>> +++ b/block_int.h
>> @@ -333,6 +333,10 @@ struct BlockDriverState {
>>
>>      /* long-running background operation */
>>      BlockJob *job;
>> +
>> +    unsigned int max_sectors;
>
> With 32 bit ints and even with 4k sector size, the maximum disk size
> would be 16TB which may be soon exceeded by disks on the market.
> Please use 64 bit values, probably also for segment values below.

This doesn't specify device size, it specifies max sectors per I/O
request.  When reviewing, I checked that the Linux kernel also uses
unsigned int for these fields.

Stefan
Cong Meng Aug. 22, 2012, 11:04 a.m. UTC | #8
On 08/21/2012 06:14 PM, Paolo Bonzini wrote:
> Il 21/08/2012 11:52, Stefan Hajnoczi ha scritto:
>>>>>> Using /sys/dev/block or /sys/dev/char seems easier, and lets you
>>>>>> retrieve the parameters for block devices too.
>>>>>>
>>>> what do you mean with "block devices"?   Using "/dev/sda" instead of
>>>> "/dev/sg0"?
>
> Yes.
>
>>>>>> However, I'm worried of the consequences this has for migration.  You
>>>>>> could have the same physical disk accessed with two different HBAs, with
>>>>>> different limits.  So I don't know if this can really be solved at all.
>>>>>>
>>>> I know little about qemu migration now.  The pending scsi commands will be
>>>> saved and
>>>> transfered to remote machine when starting migration?
>>
>> Passthrough is already a migration blocker if both hosts do not have
>> access to the same LUNs.
>
> Yes, but requiring the exact same hardware may be too much.  I'm trying
> to understand the problem better before committing to a threefold
> spec/qemu/kernel change.
>
> Cong, what is the limit that the host HBA enforces (and what is the
> HBA)?  What commands see a problem?  Is it fixed by using scsi-block
> instead of scsi-generic (if you can use scsi-block at all, i.e. it's not
> a tape or similar device)?
>
I don't see real problem caused by the the queue limits actually. It's a 
bug which Stefan told me.

> With scsi-generic, QEMU uses a bounce buffer for non-I/O commands to a
> SCSI passthrough device, so the only problem in that case should be the
> maximum segment size.  This could change in the future, but max_segments
> and max_sectors should not yet be a problem.

about bounce buffer, do you meat the buffer allocated in 
scsi_send_command() of hw/scsi-generic.c?

Cong.

>
> With scsi-block, QEMU will use read/write on the block device and the
> host kernel will then obey the host HBA's block limits.  QEMU will still
> use a bounce buffer for non-I/O commands to a scsi-block device, but the
> payload is usually small for non-I/O commands.
>
> Paolo
>
>> When both hosts do have access to the same LUNs it's possible to
>> extract the block queue limits (using sysfs) and compare them.
>>
>> Today you can start QEMU with different image files on both hosts.
>> Migration will appear to work but the disk image on the destination
>> host could be junk.  This is a similar case, I don't see a problem
>> except that there should be a safety check (maybe at the libvirt
>> level) to make this safe.
>
Paolo Bonzini Aug. 22, 2012, 12:09 p.m. UTC | #9
Il 22/08/2012 13:04, Cong Meng ha scritto:
>>
>> Cong, what is the limit that the host HBA enforces (and what is the
>> HBA)?  What commands see a problem?  Is it fixed by using scsi-block
>> instead of scsi-generic (if you can use scsi-block at all, i.e. it's not
>> a tape or similar device)?
>>
> I don't see real problem caused by the the queue limits actually. It's a
> bug which Stefan told me.

I'd rather avoid patching the specification if not to solve real (rather
than known-but-theoretical) problems.

>> With scsi-generic, QEMU uses a bounce buffer for non-I/O commands to a
>> SCSI passthrough device, so the only problem in that case should be the
>> maximum segment size.  This could change in the future, but max_segments
>> and max_sectors should not yet be a problem.
> 
> about bounce buffer, do you meat the buffer allocated in
> scsi_send_command() of hw/scsi-generic.c?

Yes.

Paolo
Stefan Hajnoczi Aug. 22, 2012, 1:13 p.m. UTC | #10
On Wed, Aug 22, 2012 at 02:09:28PM +0200, Paolo Bonzini wrote:
> Il 22/08/2012 13:04, Cong Meng ha scritto:
> >>
> >> Cong, what is the limit that the host HBA enforces (and what is the
> >> HBA)?  What commands see a problem?  Is it fixed by using scsi-block
> >> instead of scsi-generic (if you can use scsi-block at all, i.e. it's not
> >> a tape or similar device)?
> >>
> > I don't see real problem caused by the the queue limits actually. It's a
> > bug which Stefan told me.
> 
> I'd rather avoid patching the specification if not to solve real (rather
> than known-but-theoretical) problems.

Benjamin Herrenschmidt reported this in problem in 2010:

http://lists.gnu.org/archive/html/qemu-devel/2010-12/msg01741.html

"This is a real problem in practice. IE. the USB CD-ROM on this POWER7
blade limits transfers to 0x1e000 bytes for example and the Linux "sr"
driver on the guest is going to try to give me bigger requests than that
if I don't start limiting them, which will cause all sort of errors."

It cannot be fixed for emulated SCSI HBAs in general but it can for
virtio-scsi.

I/O requests will be failed with EIO if they exceed block queue limits.
Take a look at block/blk-core.c:generic_make_request_checks() and
blk_rq_check_limits().

Cong: Have you checked the block queue limits on your test machine and
exceeded them to see what happens?

Stefan
Paolo Bonzini Aug. 22, 2012, 2:13 p.m. UTC | #11
Il 22/08/2012 15:13, Stefan Hajnoczi ha scritto:
> http://lists.gnu.org/archive/html/qemu-devel/2010-12/msg01741.html
> 
> "This is a real problem in practice. IE. the USB CD-ROM on this POWER7
> blade limits transfers to 0x1e000 bytes for example and the Linux "sr"
> driver on the guest is going to try to give me bigger requests than that
> if I don't start limiting them, which will cause all sort of errors."
> 
> It cannot be fixed for emulated SCSI HBAs in general but it can for
> virtio-scsi.

For disks, this should be fixed simply by using scsi-block instead of
scsi-generic.

CD-ROMs are indeed more complicated because burning CDs cannot be done
with syscalls. :/

Paolo
Cong Meng Aug. 23, 2012, 9:31 a.m. UTC | #12
On Wed 22 Aug 2012 10:13:44 PM CST, Paolo Bonzini wrote:
> Il 22/08/2012 15:13, Stefan Hajnoczi ha scritto:
>> http://lists.gnu.org/archive/html/qemu-devel/2010-12/msg01741.html
>>
>> "This is a real problem in practice. IE. the USB CD-ROM on this POWER7
>> blade limits transfers to 0x1e000 bytes for example and the Linux "sr"
>> driver on the guest is going to try to give me bigger requests than that
>> if I don't start limiting them, which will cause all sort of errors."
>>
>> It cannot be fixed for emulated SCSI HBAs in general but it can for
>> virtio-scsi.
>
> For disks, this should be fixed simply by using scsi-block instead of
> scsi-generic.
>
> CD-ROMs are indeed more complicated because burning CDs cannot be done
> with syscalls. :/
>

So, as the problem exist to CD-ROM, I will continue to get these 
patches move on.

As Paolo pointed out,  the only limit is max_sectors (the total size of 
a scatter-list),
I will drop the other 2 parameters.

Paolo, what's your opinion?

Cong

> Paolo
>
Paolo Bonzini Aug. 23, 2012, 10:03 a.m. UTC | #13
Il 23/08/2012 11:31, Cong Meng ha scritto:
>> For disks, this should be fixed simply by using scsi-block instead of
>> scsi-generic.
>>
>> CD-ROMs are indeed more complicated because burning CDs cannot be done
>> with syscalls. :/
> 
> So, as the problem exist to CD-ROM, I will continue to get these patches
> move on.

I'm still trying to understand the extent of the problem.

The problem occurs for _USB_ CD-ROMs according to Ben.  Passthrough of
USB storage devices should be done via USB passthrough, not virtio-scsi.
 If we do USB passthrough via the SCSI layer we miss on all the quirks
that the OS may do based on the USB product/vendor pairs.  There's no
end to these, and some of the quirks may cause the device to lock up or
corruption.

I'd rather see a reproducer using SAS/ATA/ATAPI disks before punting.

Paolo
Stefan Hajnoczi Aug. 23, 2012, 10:08 a.m. UTC | #14
On Thu, Aug 23, 2012 at 11:03 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 23/08/2012 11:31, Cong Meng ha scritto:
>>> For disks, this should be fixed simply by using scsi-block instead of
>>> scsi-generic.
>>>
>>> CD-ROMs are indeed more complicated because burning CDs cannot be done
>>> with syscalls. :/
>>
>> So, as the problem exist to CD-ROM, I will continue to get these patches
>> move on.
>
> I'm still trying to understand the extent of the problem.
>
> The problem occurs for _USB_ CD-ROMs according to Ben.  Passthrough of
> USB storage devices should be done via USB passthrough, not virtio-scsi.
>  If we do USB passthrough via the SCSI layer we miss on all the quirks
> that the OS may do based on the USB product/vendor pairs.  There's no
> end to these, and some of the quirks may cause the device to lock up or
> corruption.
>
> I'd rather see a reproducer using SAS/ATA/ATAPI disks before punting.

This issue affects passthrough: either an entire sg device or at least
a SG_IO ioctl (e.g. a non-READ/WRITE SCSI command).

To reproduce it, check host queue limits and guest virtio-scsi queue
limits.  Then pick a command that can exceed the limits and try it
from inside the guest :).

Stefan
Paolo Bonzini Aug. 23, 2012, 10:52 a.m. UTC | #15
Il 23/08/2012 12:08, Stefan Hajnoczi ha scritto:
>> I'm still trying to understand the extent of the problem.
>>
>> The problem occurs for _USB_ CD-ROMs according to Ben.  Passthrough of
>> USB storage devices should be done via USB passthrough, not virtio-scsi.
>>  If we do USB passthrough via the SCSI layer we miss on all the quirks
>> that the OS may do based on the USB product/vendor pairs.  There's no
>> end to these, and some of the quirks may cause the device to lock up or
>> corruption.
>>
>> I'd rather see a reproducer using SAS/ATA/ATAPI disks before punting.
> 
> This issue affects passthrough: either an entire sg device or at least
> a SG_IO ioctl (e.g. a non-READ/WRITE SCSI command).
> 
> To reproduce it, check host queue limits and guest virtio-scsi queue
> limits.  Then pick a command that can exceed the limits and try it
> from inside the guest :).

Yes, so much is clear.  But does it happen _in practice_?  Do initiators
actually issue commands that are that big?

Paolo
Stefan Hajnoczi Aug. 23, 2012, 12:08 p.m. UTC | #16
On Thu, Aug 23, 2012 at 11:52 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 23/08/2012 12:08, Stefan Hajnoczi ha scritto:
>>> I'm still trying to understand the extent of the problem.
>>>
>>> The problem occurs for _USB_ CD-ROMs according to Ben.  Passthrough of
>>> USB storage devices should be done via USB passthrough, not virtio-scsi.
>>>  If we do USB passthrough via the SCSI layer we miss on all the quirks
>>> that the OS may do based on the USB product/vendor pairs.  There's no
>>> end to these, and some of the quirks may cause the device to lock up or
>>> corruption.
>>>
>>> I'd rather see a reproducer using SAS/ATA/ATAPI disks before punting.
>>
>> This issue affects passthrough: either an entire sg device or at least
>> a SG_IO ioctl (e.g. a non-READ/WRITE SCSI command).
>>
>> To reproduce it, check host queue limits and guest virtio-scsi queue
>> limits.  Then pick a command that can exceed the limits and try it
>> from inside the guest :).
>
> Yes, so much is clear.  But does it happen _in practice_?  Do initiators
> actually issue commands that are that big?

Here I think we need to err on the side of caution.  A user passes
through a tape drive or exotic SCSI device.  They run a vendor utility
inside the guest that issues a command that exceeds the host block
queue limits.

Passing through limits is intended to make SCSI device passthrough
work, in all cases.

Is the number of real cases where it happens small?  Probably.  I
still think we should make passthrough bulletproof.

Stefan
Nicholas A. Bellinger Aug. 24, 2012, 12:45 a.m. UTC | #17
On Thu, 2012-08-23 at 11:08 +0100, Stefan Hajnoczi wrote:
> On Thu, Aug 23, 2012 at 11:03 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> > Il 23/08/2012 11:31, Cong Meng ha scritto:
> >>> For disks, this should be fixed simply by using scsi-block instead of
> >>> scsi-generic.
> >>>
> >>> CD-ROMs are indeed more complicated because burning CDs cannot be done
> >>> with syscalls. :/
> >>
> >> So, as the problem exist to CD-ROM, I will continue to get these patches
> >> move on.
> >
> > I'm still trying to understand the extent of the problem.
> >
> > The problem occurs for _USB_ CD-ROMs according to Ben.  Passthrough of
> > USB storage devices should be done via USB passthrough, not virtio-scsi.
> >  If we do USB passthrough via the SCSI layer we miss on all the quirks
> > that the OS may do based on the USB product/vendor pairs.  There's no
> > end to these, and some of the quirks may cause the device to lock up or
> > corruption.
> >
> > I'd rather see a reproducer using SAS/ATA/ATAPI disks before punting.
> 
> This issue affects passthrough: either an entire sg device or at least
> a SG_IO ioctl (e.g. a non-READ/WRITE SCSI command).
> 
> To reproduce it, check host queue limits and guest virtio-scsi queue
> limits.  Then pick a command that can exceed the limits and try it
> from inside the guest :).
> 

Just following along on this thread, and wanted to add a few of my
experiences with this scenario from the kernel target perspective..

So up until very recently, TCM would accept an I/O request for an DATA
I/O type CDB with a max_sectors larger than the reported max_sectors for
it's TCM backend (regardless of backend type), and silently generate N
backend 'tasks' to complete the single initiator generated command.
Also FYI for Paolo, for control type CDBs I've never actually seen an
allocation length exceed max_sectors, so in practice AFAIK this only
happens for DATA I/O type CDBs.

This was historically required by the pSCSI backend driver (using a
number of old SCSI passthrough interfaces) in order to support this very
type of case described above, but over the years the logic ended up
creeping into various other non-passthrough backend drivers like IBLOCK
+FILEIO.  So for v3.6-rc1 code, hch ended up removing the 'task' logic
thus allowing backends (and the layers below) to the I/O sectors >
max_sectors handling work, allowing modern pSCSI using struct request to
do the same.  (hch assured me this works now for pSCSI)

Anyways, I think having the guest limit virtio-scsi DATA I/O to
max_sectors based upon the host accessible block limits is reasonable
approach to consider.  Reducing this value even further based upon the
lowest max_sectors available amongst possible migration hosts would be a
good idea here to avoid having to reject any I/O's exceeding a new
host's device block queue limits.

--nab
Paolo Bonzini Aug. 24, 2012, 7:56 a.m. UTC | #18
Il 24/08/2012 02:45, Nicholas A. Bellinger ha scritto:
> So up until very recently, TCM would accept an I/O request for an DATA
> I/O type CDB with a max_sectors larger than the reported max_sectors for
> it's TCM backend (regardless of backend type), and silently generate N
> backend 'tasks' to complete the single initiator generated command.

This is what QEMU does if you use scsi-block, except for MMC devices
(because of the insanity of the commands used for burning).

> Also FYI for Paolo, for control type CDBs I've never actually seen an
> allocation length exceed max_sectors, so in practice AFAIK this only
> happens for DATA I/O type CDBs.

Yes, that was my impression as well.

> This was historically required by the pSCSI backend driver (using a
> number of old SCSI passthrough interfaces) in order to support this very
> type of case described above, but over the years the logic ended up
> creeping into various other non-passthrough backend drivers like IBLOCK
> +FILEIO.  So for v3.6-rc1 code, hch ended up removing the 'task' logic
> thus allowing backends (and the layers below) to the I/O sectors >
> max_sectors handling work, allowing modern pSCSI using struct request to
> do the same.  (hch assured me this works now for pSCSI)

So now LIO and QEMU work the same.  (Did he test tapes too?)

> Anyways, I think having the guest limit virtio-scsi DATA I/O to
> max_sectors based upon the host accessible block limits is reasonable
> approach to consider.  Reducing this value even further based upon the
> lowest max_sectors available amongst possible migration hosts would be a
> good idea here to avoid having to reject any I/O's exceeding a new
> host's device block queue limits.

Yeah, it's reasonable _assuming it is needed at all_.  For disks, it is
not needed.  For CD-ROMs it is, but right now we have only one report
and it is using USB so we don't know if the problem is in the drive or
rather in the USB bridge (whose quality usually leaves much to be desired).

So in the only observed case, the fix would really be a workaround; the
right thing to do with USB devices is to use USB passthrough.

Paolo
Stefan Hajnoczi Aug. 24, 2012, 9:05 a.m. UTC | #19
On Fri, Aug 24, 2012 at 12:43:34PM +0200, Hannes Reinecke wrote:
> On 08/24/2012 09:56 AM, Paolo Bonzini wrote:
> >Il 24/08/2012 02:45, Nicholas A. Bellinger ha scritto:
> >>So up until very recently, TCM would accept an I/O request for an DATA
> >>I/O type CDB with a max_sectors larger than the reported max_sectors for
> >>it's TCM backend (regardless of backend type), and silently generate N
> >>backend 'tasks' to complete the single initiator generated command.
> >
> >This is what QEMU does if you use scsi-block, except for MMC devices
> >(because of the insanity of the commands used for burning).
> >
> >>Also FYI for Paolo, for control type CDBs I've never actually seen an
> >>allocation length exceed max_sectors, so in practice AFAIK this only
> >>happens for DATA I/O type CDBs.
> >
> >Yes, that was my impression as well.
> >
> >>This was historically required by the pSCSI backend driver (using a
> >>number of old SCSI passthrough interfaces) in order to support this very
> >>type of case described above, but over the years the logic ended up
> >>creeping into various other non-passthrough backend drivers like IBLOCK
> >>+FILEIO.  So for v3.6-rc1 code, hch ended up removing the 'task' logic
> >>thus allowing backends (and the layers below) to the I/O sectors >
> >>max_sectors handling work, allowing modern pSCSI using struct request to
> >>do the same.  (hch assured me this works now for pSCSI)
> >
> >So now LIO and QEMU work the same.  (Did he test tapes too?)
> >
> >>Anyways, I think having the guest limit virtio-scsi DATA I/O to
> >>max_sectors based upon the host accessible block limits is reasonable
> >>approach to consider.  Reducing this value even further based upon the
> >>lowest max_sectors available amongst possible migration hosts would be a
> >>good idea here to avoid having to reject any I/O's exceeding a new
> >>host's device block queue limits.
> >
> >Yeah, it's reasonable _assuming it is needed at all_.  For disks, it is
> >not needed.  For CD-ROMs it is, but right now we have only one report
> >and it is using USB so we don't know if the problem is in the drive or
> >rather in the USB bridge (whose quality usually leaves much to be desired).
> >
> >So in the only observed case, the fix would really be a workaround; the
> >right thing to do with USB devices is to use USB passthrough.
> >
> 
> Hehe. So finally someone else stumbled across this one.
> 
> All is fine and dandy as long as you're able to use scsi-disk.
> As soon as you're forced to use scsi-generic we're in trouble.
> 
> With scsi-generic we actually have two problems:
> 1) scsi-generic just acts as a pass-through and passes the commands
>    as-is, including the scatter-gather information as formatted by
>    the guest. So the guest could easily format an SG_IO comand
>    which will not be compatible with the host.
> 2) The host is not able to differentiate between a malformed
>    SG_IO command and a real I/O error; in both cases it'll return
>    -EIO.
> 
> So we can fix this by either
> a) ignore (as we do nowadays :-)
> b) Fixup scsi-generic to inspect and modify SG_IO information
>    to ensure the host-limits are respected
> c) Fixup the host to differentiate between a malformed SG_IO
>    and a real I/O error.
> 
> c) would only be feasible for Linux et al. _personally_ I would
> prefer that approach, as I fail to see why we cannot return a proper
> error code here.
> But I already can hear the outraged cry 'POSIX! POSIX!', so I guess
> it's not going to happen anytime soon.
> So I would vote for b).
> Yes, it's painful. But in the long run we'll have to do an SG_IO
> inspection anyway, otherwise we'll always be susceptible to
> malicious SG_IO attacks.

Are you suggesting we do not expose host block queue limits to the
guest.  Instead we should inspect and reformat SG_IO requests in QEMU?
Reformatting seems hard because there are many possible SCSI
commands/sub-commands and we would have to whitelist them on a
case-by-case basis.

That sounds like more work than exposing the block queue limits using
Cong Meng's patches.

On a side-note, are you thinking of blacklisting/whitelisting certain
commands that don't make sense or would have an unintended effect if
sent from a guest (e.g. reservations)?  That would be an interesting
topic for another email thread, I'd love to learn what weird things we
need to protect against.

Stefan
Paolo Bonzini Aug. 24, 2012, 9:14 a.m. UTC | #20
Il 24/08/2012 12:43, Hannes Reinecke ha scritto:
> Hehe. So finally someone else stumbled across this one.
> 
> All is fine and dandy as long as you're able to use scsi-disk.
> As soon as you're forced to use scsi-generic we're in trouble.
> 
> With scsi-generic we actually have two problems:
> 1) scsi-generic just acts as a pass-through and passes the commands
>    as-is, including the scatter-gather information as formatted by
>    the guest. So the guest could easily format an SG_IO comand
>    which will not be compatible with the host.
> 2) The host is not able to differentiate between a malformed
>    SG_IO command and a real I/O error; in both cases it'll return
>    -EIO.
> 
> So we can fix this by either
> a) ignore (as we do nowadays :-)
> b) Fixup scsi-generic to inspect and modify SG_IO information
>    to ensure the host-limits are respected

That's what scsi-block already does.

Perhaps sooner or later we will need a scsi-tape?  That would be fine.

> Yes, it's painful. But in the long run we'll have to do an SG_IO
> inspection anyway, otherwise we'll always be susceptible to malicious
> SG_IO attacks.

I would like to do this in the kernel using BPF.  I posted a possible
spec at
http://www.redhat.com/archives/libvir-list/2012-June/msg00505.html but
the response was, ehm, underwhelming.

Paolo
Hannes Reinecke Aug. 24, 2012, 10:43 a.m. UTC | #21
On 08/24/2012 09:56 AM, Paolo Bonzini wrote:
> Il 24/08/2012 02:45, Nicholas A. Bellinger ha scritto:
>> So up until very recently, TCM would accept an I/O request for an DATA
>> I/O type CDB with a max_sectors larger than the reported max_sectors for
>> it's TCM backend (regardless of backend type), and silently generate N
>> backend 'tasks' to complete the single initiator generated command.
>
> This is what QEMU does if you use scsi-block, except for MMC devices
> (because of the insanity of the commands used for burning).
>
>> Also FYI for Paolo, for control type CDBs I've never actually seen an
>> allocation length exceed max_sectors, so in practice AFAIK this only
>> happens for DATA I/O type CDBs.
>
> Yes, that was my impression as well.
>
>> This was historically required by the pSCSI backend driver (using a
>> number of old SCSI passthrough interfaces) in order to support this very
>> type of case described above, but over the years the logic ended up
>> creeping into various other non-passthrough backend drivers like IBLOCK
>> +FILEIO.  So for v3.6-rc1 code, hch ended up removing the 'task' logic
>> thus allowing backends (and the layers below) to the I/O sectors >
>> max_sectors handling work, allowing modern pSCSI using struct request to
>> do the same.  (hch assured me this works now for pSCSI)
>
> So now LIO and QEMU work the same.  (Did he test tapes too?)
>
>> Anyways, I think having the guest limit virtio-scsi DATA I/O to
>> max_sectors based upon the host accessible block limits is reasonable
>> approach to consider.  Reducing this value even further based upon the
>> lowest max_sectors available amongst possible migration hosts would be a
>> good idea here to avoid having to reject any I/O's exceeding a new
>> host's device block queue limits.
>
> Yeah, it's reasonable _assuming it is needed at all_.  For disks, it is
> not needed.  For CD-ROMs it is, but right now we have only one report
> and it is using USB so we don't know if the problem is in the drive or
> rather in the USB bridge (whose quality usually leaves much to be desired).
>
> So in the only observed case, the fix would really be a workaround; the
> right thing to do with USB devices is to use USB passthrough.
>

Hehe. So finally someone else stumbled across this one.

All is fine and dandy as long as you're able to use scsi-disk.
As soon as you're forced to use scsi-generic we're in trouble.

With scsi-generic we actually have two problems:
1) scsi-generic just acts as a pass-through and passes the commands
    as-is, including the scatter-gather information as formatted by
    the guest. So the guest could easily format an SG_IO comand
    which will not be compatible with the host.
2) The host is not able to differentiate between a malformed
    SG_IO command and a real I/O error; in both cases it'll return
    -EIO.

So we can fix this by either
a) ignore (as we do nowadays :-)
b) Fixup scsi-generic to inspect and modify SG_IO information
    to ensure the host-limits are respected
c) Fixup the host to differentiate between a malformed SG_IO
    and a real I/O error.

c) would only be feasible for Linux et al. _personally_ I would prefer 
that approach, as I fail to see why we cannot return a proper error code 
here.
But I already can hear the outraged cry 'POSIX! POSIX!', so I guess it's 
not going to happen anytime soon.
So I would vote for b).
Yes, it's painful. But in the long run we'll have to do an SG_IO 
inspection anyway, otherwise we'll always be susceptible to malicious 
SG_IO attacks.

Cheers,

Hannes
diff mbox

Patch

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 0dce089..a086f89 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -53,6 +53,7 @@ 
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
+#include <dirent.h>
 #endif
 #ifdef CONFIG_FIEMAP
 #include <linux/fiemap.h>
@@ -829,6 +830,62 @@  static int hdev_probe_device(const char *filename)
     return 0;
 }
 
+static void read_queue_limit(char *path, const char *filename, unsigned int *val)
+{
+    FILE *f;
+    char *tail = path + strlen(path);
+
+    pstrcat(path, MAXPATHLEN, filename);
+    f = fopen(path, "r");
+    if (!f) {
+        goto out;
+    }
+
+    fscanf(f, "%u", val);
+    fclose(f);
+
+out:
+    *tail = 0;
+}
+
+static void sg_get_queue_limits(BlockDriverState *bs, const char *filename)
+{
+    DIR *ffs;
+    struct dirent *d;
+    char path[MAXPATHLEN];
+
+    snprintf(path, MAXPATHLEN,
+             "/sys/class/scsi_generic/sg%s/device/block/",
+             filename + strlen("/dev/sg"));
+
+    ffs = opendir(path);
+    if (!ffs) {
+        return;
+    }
+
+    for (;;) {
+        d = readdir(ffs);
+        if (!d) {
+            return;
+        }
+
+        if (strcmp(d->d_name, ".") == 0 || strcmp(d->d_name, "..") == 0) {
+            continue;
+        }
+
+        break;
+    }
+
+    closedir(ffs);
+
+    pstrcat(path, MAXPATHLEN, d->d_name);
+    pstrcat(path, MAXPATHLEN, "/queue/");
+
+    read_queue_limit(path, "max_sectors_kb", &bs->max_sectors);
+    read_queue_limit(path, "max_segments", &bs->max_segments);
+    read_queue_limit(path, "max_segment_size", &bs->max_segment_size);
+}
+
 static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
 {
     BDRVRawState *s = bs->opaque;
@@ -868,6 +925,7 @@  static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
         temp = realpath(filename, resolved_path);
         if (temp && strstart(temp, "/dev/sg", NULL)) {
             bs->sg = 1;
+            sg_get_queue_limits(bs, temp);
         }
     }
 #endif
diff --git a/block_int.h b/block_int.h
index d72317f..a9d07a2 100644
--- a/block_int.h
+++ b/block_int.h
@@ -333,6 +333,10 @@  struct BlockDriverState {
 
     /* long-running background operation */
     BlockJob *job;
+
+    unsigned int max_sectors;
+    unsigned int max_segments;
+    unsigned int max_segment_size;
 };
 
 int get_tmp_filename(char *filename, int size);
diff --git a/blockdev.c b/blockdev.c
index 3d75015..e17edbf 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1191,3 +1191,18 @@  BlockJobInfoList *qmp_query_block_jobs(Error **errp)
     bdrv_iterate(do_qmp_query_block_jobs_one, &prev);
     return dummy.next;
 }
+
+unsigned int get_queue_max_sectors(BlockDriverState *bs)
+{
+    return (bs->file && bs->file->sg) ? bs->file->max_sectors : 0;
+}
+
+unsigned int get_queue_max_segments(BlockDriverState *bs)
+{
+    return (bs->file && bs->file->sg) ? bs->file->max_segments : 0;
+}
+
+unsigned int get_queue_max_segment_size(BlockDriverState *bs)
+{
+    return (bs->file && bs->file->sg) ? bs->file->max_segment_size : 0;
+}
diff --git a/hw/block-common.h b/hw/block-common.h
index bb808f7..d47fcd7 100644
--- a/hw/block-common.h
+++ b/hw/block-common.h
@@ -76,4 +76,7 @@  void hd_geometry_guess(BlockDriverState *bs,
                        int *ptrans);
 int hd_bios_chs_auto_trans(uint32_t cyls, uint32_t heads, uint32_t secs);
 
+unsigned int get_queue_max_sectors(BlockDriverState *bs);
+unsigned int get_queue_max_segments(BlockDriverState *bs);
+unsigned int get_queue_max_segment_size(BlockDriverState *bs);
 #endif