diff mbox

raw-posix: add 'offset' and 'size' options

Message ID 598de7ff27e32fcb1b7f677f40fb8da4f0a1f512.1475434971.git.tgolembi@redhat.com
State New
Headers show

Commit Message

Tomáš Golembiovský Oct. 2, 2016, 7:13 p.m. UTC
Added two new options 'offset' and 'size'. This makes it possible to use
only part of the file as a device. This can be used e.g. to limit the
access only to single partition in a disk image or use a disk inside a
tar archive (like OVA).

For now this is only possible for files in read-only mode. It should be
possible to extend it later to allow read-write mode, but would probably
require that the size of the device is kept constant (i.e. no resizing).

Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
---
 block/raw-posix.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 91 insertions(+), 6 deletions(-)

Comments

Daniel P. Berrangé Oct. 3, 2016, 8:52 a.m. UTC | #1
On Sun, Oct 02, 2016 at 09:13:29PM +0200, Tomáš Golembiovský wrote:
> Added two new options 'offset' and 'size'. This makes it possible to use
> only part of the file as a device. This can be used e.g. to limit the
> access only to single partition in a disk image or use a disk inside a
> tar archive (like OVA).
> 
> For now this is only possible for files in read-only mode. It should be
> possible to extend it later to allow read-write mode, but would probably
> require that the size of the device is kept constant (i.e. no resizing).
> 
> Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
> ---
>  block/raw-posix.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 91 insertions(+), 6 deletions(-)

An equivalent change is needed to raw-win32.c

> 
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 6ed7547..36f2712 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c

> @@ -421,6 +435,17 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
>          goto fail;
>      }
>  
> +    s->offset = qemu_opt_get_size(opts, "offset", 0);
> +    s->assumed_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);

If the user didn't provide "size", then we should initialize
it to the actual size of the underlying storage, rather than
to 0.

> +
> +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&

Why the check against bdrv_file ?

> +        ((s->offset > 0) || (s->assumed_size > 0))) {
> +        error_setg(errp, "offset and size options are allowed only for "
> +                         "files in read-only mode");
> +        ret = -EINVAL;
> +        goto fail;
> +    }

Why did you restrict it to read-only instead of adding the offset logic
to the write & truncate methods. IMHO if we're going to add this feature
we should make it work in all scenarios, not just do 1/2 the job.

> @@ -443,6 +468,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
>      }
>      s->fd = fd;
>  
> +    /* Check size and offset */
> +    real_size = raw_getlength_real(bs);
> +    if (real_size < (s->offset + s->assumed_size)) {

There's risk of integer overflow here.

> +        if (s->assumed_size == 0) {
> +            error_setg(errp, "The offset has to be smaller than actual size "
> +                             "of the containing file (%ld) ",
> +                             real_size);
> +        } else {
> +            error_setg(errp, "The sum of offset (%lu) and size (%lu) has to "
> +                             "be smaller than actual size of the containing "
> +                             "file (%ld) ",
> +                             s->offset, s->assumed_size, real_size);
> +        }

Not sure I see the point in special-casing assumed_size == 0 here, as
the second error message is clearer IMHO.

> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
>  #ifdef CONFIG_LINUX_AIO
>      if (!raw_use_aio(bdrv_flags) && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
>          error_setg(errp, "aio=native was specified, but it requires "
> @@ -1271,6 +1313,19 @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
>                                        uint64_t bytes, QEMUIOVector *qiov,
>                                        int flags)
>  {
> +    BDRVRawState *s = bs->opaque;
> +    if (s->assumed_size > 0) {
> +        if (offset > s->assumed_size) {
> +            /* Attempt to read beyond EOF */
> +            return 0;
> +        } else if ((offset + bytes) > s->assumed_size) {

Integer overflow risk again here

> +            /* Trying to read more than is available */
> +            bytes = s->assumed_size - offset;
> +        }
> +    }
> +
> +    offset += s->offset;
> +
>      return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
>  }
>  

> @@ -1516,6 +1571,28 @@ static int64_t raw_getlength(BlockDriverState *bs)
>  }
>  #endif
>  
> +static int64_t raw_getlength(BlockDriverState *bs)
> +{
> +    BDRVRawState *s = bs->opaque;
> +
> +    if (s->assumed_size > 0) {
> +        return (int64_t)s->assumed_size;
> +    }
> +
> +    int64_t size = raw_getlength_real(bs);
> +    if (s->offset > 0) {
> +        if (s->offset > size) {
> +            /* The size has changed! We didn't expect that. */
> +            return -EIO;
> +        }
> +        size -= s->offset;
> +    }

The 'if (s->offset > 0)' check doesn't seem to do anything
useful here - if offset is zero, then the if body is a no-op
already.

> @@ -1524,7 +1601,15 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
>      if (fstat(s->fd, &st) < 0) {
>          return -errno;
>      }
> -    return (int64_t)st.st_blocks * 512;
> +    uint64_t size = st.st_blocks * 512;
> +    /* If the file is sparse we have no idea which part of the file is
> +     * allocated and which is not. So we just make sure the returned value is
> +     * not greater than what we're working with.
> +     */
> +    if (s->assumed_size > 0 && s->assumed_size < size) {
> +        size = s->assumed_size;
> +    }
> +    return (int64_t)size;
>  }
>  
>  static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
> -- 
> 2.10.0
> 
> 

Regards,
Daniel
Paolo Bonzini Oct. 3, 2016, 9:20 a.m. UTC | #2
On 03/10/2016 10:52, Daniel P. Berrange wrote:
> On Sun, Oct 02, 2016 at 09:13:29PM +0200, Tomáš Golembiovský wrote:
>> Added two new options 'offset' and 'size'. This makes it possible to use
>> only part of the file as a device. This can be used e.g. to limit the
>> access only to single partition in a disk image or use a disk inside a
>> tar archive (like OVA).
>>
>> For now this is only possible for files in read-only mode. It should be
>> possible to extend it later to allow read-write mode, but would probably
>> require that the size of the device is kept constant (i.e. no resizing).
>>
>> Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
>> ---
>>  block/raw-posix.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 91 insertions(+), 6 deletions(-)
> 
> An equivalent change is needed to raw-win32.c

Actually, it should be done _only_ in block/raw_bsd.c.

Paolo
Tomáš Golembiovský Oct. 3, 2016, 10:45 a.m. UTC | #3
On Mon, 3 Oct 2016 09:52:13 +0100
"Daniel P. Berrange" <berrange@redhat.com> wrote:

> On Sun, Oct 02, 2016 at 09:13:29PM +0200, Tomáš Golembiovský wrote:
> > Added two new options 'offset' and 'size'. This makes it possible to use
> > only part of the file as a device. This can be used e.g. to limit the
> > access only to single partition in a disk image or use a disk inside a
> > tar archive (like OVA).
> > 
> > For now this is only possible for files in read-only mode. It should be
> > possible to extend it later to allow read-write mode, but would probably
> > require that the size of the device is kept constant (i.e. no resizing).
> > 
> > Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
> > ---
> >  block/raw-posix.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 91 insertions(+), 6 deletions(-)  
> 
> An equivalent change is needed to raw-win32.c

That's what I feared.


> > 
> > diff --git a/block/raw-posix.c b/block/raw-posix.c
> > index 6ed7547..36f2712 100644
> > --- a/block/raw-posix.c
> > +++ b/block/raw-posix.c  
> 
> > @@ -421,6 +435,17 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
> >          goto fail;
> >      }
> >  
> > +    s->offset = qemu_opt_get_size(opts, "offset", 0);
> > +    s->assumed_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);  
> 
> If the user didn't provide "size", then we should initialize
> it to the actual size of the underlying storage, rather than
> to 0.

I rather wanted to distinguish  the case when no offset or size was
specified to be able to limit the scope of introduced changes.


> > +
> > +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&  
> 
> Why the check against bdrv_file ?

To limit it only to files. Maybe there is better way to do that? The
devices have a nasty habit to change the size. Sure, this can happen to
file too, e.g. if somebody truncates the file outside QEMU. But that's
rather a bad behaviour. For devices changing the size may be perfectly
valid operation, e.g. replacing CD in drive or card in a card reader.


> 
> > +        ((s->offset > 0) || (s->assumed_size > 0))) {
> > +        error_setg(errp, "offset and size options are allowed only for "
> > +                         "files in read-only mode");
> > +        ret = -EINVAL;
> > +        goto fail;
> > +    }  
> 
> Why did you restrict it to read-only instead of adding the offset logic
> to the write & truncate methods. IMHO if we're going to add this feature
> we should make it work in all scenarios, not just do 1/2 the job.

I still plan to do that, but I didn't feel confident enough to do it in
the first run.

We need to decide what is the correct behaviour here first. Since the
image we're working with is contained in some larger unit it cannot be
resized freely. There are two option that come to my mind:

1) block truncate/grow completely,
2) allow image to be truncated and grown only if the new size does not
   go over the original size.

If we go with the second option then I have a another question. How
strict is (or should be) qemu about the sizes being block aligned? I'm
still little bit fuzzy on that. Somewhere it is assumed that the size is
multiple of 512, sometimes qemu automatically rounds up if it isn't and
sometimes it seems to me that the size can be arbitrary.


> 
> > @@ -443,6 +468,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
> >      }
> >      s->fd = fd;
> >  
> > +    /* Check size and offset */
> > +    real_size = raw_getlength_real(bs);
> > +    if (real_size < (s->offset + s->assumed_size)) {  
> 
> There's risk of integer overflow here.
> 
> > +        if (s->assumed_size == 0) {
> > +            error_setg(errp, "The offset has to be smaller than actual size "
> > +                             "of the containing file (%ld) ",
> > +                             real_size);
> > +        } else {
> > +            error_setg(errp, "The sum of offset (%lu) and size (%lu) has to "
> > +                             "be smaller than actual size of the containing "
> > +                             "file (%ld) ",
> > +                             s->offset, s->assumed_size, real_size);
> > +        }  
> 
> Not sure I see the point in special-casing assumed_size == 0 here, as
> the second error message is clearer IMHO.

Ok, I'll keep just the second one then.

> 
> > +        ret = -EINVAL;
> > +        goto fail;
> > +    }
> > +
> >  #ifdef CONFIG_LINUX_AIO
> >      if (!raw_use_aio(bdrv_flags) && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
> >          error_setg(errp, "aio=native was specified, but it requires "
> > @@ -1271,6 +1313,19 @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
> >                                        uint64_t bytes, QEMUIOVector *qiov,
> >                                        int flags)
> >  {
> > +    BDRVRawState *s = bs->opaque;
> > +    if (s->assumed_size > 0) {
> > +        if (offset > s->assumed_size) {
> > +            /* Attempt to read beyond EOF */
> > +            return 0;
> > +        } else if ((offset + bytes) > s->assumed_size) {  
> 
> Integer overflow risk again here
> 
> > +            /* Trying to read more than is available */
> > +            bytes = s->assumed_size - offset;
> > +        }
> > +    }
> > +
> > +    offset += s->offset;
> > +
> >      return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
> >  }
> >    
> 
> > @@ -1516,6 +1571,28 @@ static int64_t raw_getlength(BlockDriverState *bs)
> >  }
> >  #endif
> >  
> > +static int64_t raw_getlength(BlockDriverState *bs)
> > +{
> > +    BDRVRawState *s = bs->opaque;
> > +
> > +    if (s->assumed_size > 0) {
> > +        return (int64_t)s->assumed_size;
> > +    }
> > +
> > +    int64_t size = raw_getlength_real(bs);
> > +    if (s->offset > 0) {
> > +        if (s->offset > size) {
> > +            /* The size has changed! We didn't expect that. */
> > +            return -EIO;
> > +        }
> > +        size -= s->offset;
> > +    }  
> 
> The 'if (s->offset > 0)' check doesn't seem to do anything
> useful here - if offset is zero, then the if body is a no-op
> already.

You're right. I'll fix that.

> 
> > @@ -1524,7 +1601,15 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
> >      if (fstat(s->fd, &st) < 0) {
> >          return -errno;
> >      }
> > -    return (int64_t)st.st_blocks * 512;
> > +    uint64_t size = st.st_blocks * 512;
> > +    /* If the file is sparse we have no idea which part of the file is
> > +     * allocated and which is not. So we just make sure the returned value is
> > +     * not greater than what we're working with.
> > +     */
> > +    if (s->assumed_size > 0 && s->assumed_size < size) {
> > +        size = s->assumed_size;
> > +    }
> > +    return (int64_t)size;
> >  }
> >  
> >  static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
> > -- 
> > 2.10.0
> > 
> >   
> 
> Regards,
> Daniel
> -- 
> |: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
> |: http://libvirt.org              -o-             http://virt-manager.org :|
> |: http://entangle-photo.org       -o-    http://search.cpan.org/~danberr/ :|
Tomáš Golembiovský Oct. 3, 2016, 10:47 a.m. UTC | #4
On Mon, 3 Oct 2016 11:20:44 +0200
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 03/10/2016 10:52, Daniel P. Berrange wrote:
> > On Sun, Oct 02, 2016 at 09:13:29PM +0200, Tomáš Golembiovský wrote:  
> >> Added two new options 'offset' and 'size'. This makes it possible to use
> >> only part of the file as a device. This can be used e.g. to limit the
> >> access only to single partition in a disk image or use a disk inside a
> >> tar archive (like OVA).
> >>
> >> For now this is only possible for files in read-only mode. It should be
> >> possible to extend it later to allow read-write mode, but would probably
> >> require that the size of the device is kept constant (i.e. no resizing).
> >>
> >> Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
> >> ---
> >>  block/raw-posix.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++----
> >>  1 file changed, 91 insertions(+), 6 deletions(-)  
> > 
> > An equivalent change is needed to raw-win32.c  
> 
> Actually, it should be done _only_ in block/raw_bsd.c.

You mean in raw-posix.c and in raw_bsd.c, or just raw_bsd.c?

    Tomas
Daniel P. Berrangé Oct. 3, 2016, 10:52 a.m. UTC | #5
On Mon, Oct 03, 2016 at 12:45:57PM +0200, Tomáš Golembiovský wrote:
> On Mon, 3 Oct 2016 09:52:13 +0100
> "Daniel P. Berrange" <berrange@redhat.com> wrote:
> 
> > On Sun, Oct 02, 2016 at 09:13:29PM +0200, Tomáš Golembiovský wrote:
> > > Added two new options 'offset' and 'size'. This makes it possible to use
> > > only part of the file as a device. This can be used e.g. to limit the
> > > access only to single partition in a disk image or use a disk inside a
> > > tar archive (like OVA).
> > > 
> > > For now this is only possible for files in read-only mode. It should be
> > > possible to extend it later to allow read-write mode, but would probably
> > > require that the size of the device is kept constant (i.e. no resizing).
> > > 
> > > Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
> > > ---
> > >  block/raw-posix.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++----
> > >  1 file changed, 91 insertions(+), 6 deletions(-)  
> > 
> > An equivalent change is needed to raw-win32.c
> 
> That's what I feared.
> 
> 
> > > 
> > > diff --git a/block/raw-posix.c b/block/raw-posix.c
> > > index 6ed7547..36f2712 100644
> > > --- a/block/raw-posix.c
> > > +++ b/block/raw-posix.c  
> > 
> > > @@ -421,6 +435,17 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
> > >          goto fail;
> > >      }
> > >  
> > > +    s->offset = qemu_opt_get_size(opts, "offset", 0);
> > > +    s->assumed_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);  
> > 
> > If the user didn't provide "size", then we should initialize
> > it to the actual size of the underlying storage, rather than
> > to 0.
> 
> I rather wanted to distinguish  the case when no offset or size was
> specified to be able to limit the scope of introduced changes.

IMHO that's a mistake - it leads to 2 different codepaths one of
which will be mostly unused & untested, so liable to bitrot.

> > > +
> > > +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&  
> > 
> > Why the check against bdrv_file ?
> 
> To limit it only to files. Maybe there is better way to do that? The
> devices have a nasty habit to change the size. Sure, this can happen to
> file too, e.g. if somebody truncates the file outside QEMU. But that's
> rather a bad behaviour. For devices changing the size may be perfectly
> valid operation, e.g. replacing CD in drive or card in a card reader.

The raw driver is usable over any storage backend (file, rbd, iscsi,
etc, etc) and it is valid to want to use a offset/size parameter in
combination with any of them. So we should not restrict it to just
files.

> > > +        ((s->offset > 0) || (s->assumed_size > 0))) {
> > > +        error_setg(errp, "offset and size options are allowed only for "
> > > +                         "files in read-only mode");
> > > +        ret = -EINVAL;
> > > +        goto fail;
> > > +    }  
> > 
> > Why did you restrict it to read-only instead of adding the offset logic
> > to the write & truncate methods. IMHO if we're going to add this feature
> > we should make it work in all scenarios, not just do 1/2 the job.
> 
> I still plan to do that, but I didn't feel confident enough to do it in
> the first run.
> 
> We need to decide what is the correct behaviour here first. Since the
> image we're working with is contained in some larger unit it cannot be
> resized freely. There are two option that come to my mind:
> 
> 1) block truncate/grow completely,
> 2) allow image to be truncated and grown only if the new size does not
>    go over the original size.
> 
> If we go with the second option then I have a another question. How
> strict is (or should be) qemu about the sizes being block aligned? I'm
> still little bit fuzzy on that. Somewhere it is assumed that the size is
> multiple of 512, sometimes qemu automatically rounds up if it isn't and
> sometimes it seems to me that the size can be arbitrary.

We should not make assumptions about what is a valid or invalid usage,
as QEMU doesn't have enough knowledge to do that correctly.

IOW, we should just allow truncate with no restrictions. It is up to the
calling app to figure out whether truncate makes sense or not.


Regards,
Daniel
Tomáš Golembiovský Oct. 3, 2016, 11:07 a.m. UTC | #6
On Mon, 3 Oct 2016 11:52:59 +0100
"Daniel P. Berrange" <berrange@redhat.com> wrote:

> On Mon, Oct 03, 2016 at 12:45:57PM +0200, Tomáš Golembiovský wrote:
> > On Mon, 3 Oct 2016 09:52:13 +0100
> > "Daniel P. Berrange" <berrange@redhat.com> wrote:
> >   
> > > On Sun, Oct 02, 2016 at 09:13:29PM +0200, Tomáš Golembiovský wrote:  
> > > > Added two new options 'offset' and 'size'. This makes it possible to use
> > > > only part of the file as a device. This can be used e.g. to limit the
> > > > access only to single partition in a disk image or use a disk inside a
> > > > tar archive (like OVA).
> > > > 
> > > > For now this is only possible for files in read-only mode. It should be
> > > > possible to extend it later to allow read-write mode, but would probably
> > > > require that the size of the device is kept constant (i.e. no resizing).
> > > > 
> > > > Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
> > > > ---
> > > >  block/raw-posix.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++----
> > > >  1 file changed, 91 insertions(+), 6 deletions(-)    
> > > 
> > > An equivalent change is needed to raw-win32.c  
> > 
> > That's what I feared.
> > 
> >   
> > > > 
> > > > diff --git a/block/raw-posix.c b/block/raw-posix.c
> > > > index 6ed7547..36f2712 100644
> > > > --- a/block/raw-posix.c
> > > > +++ b/block/raw-posix.c    
> > >   
> > > > @@ -421,6 +435,17 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
> > > >          goto fail;
> > > >      }
> > > >  
> > > > +    s->offset = qemu_opt_get_size(opts, "offset", 0);
> > > > +    s->assumed_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);    
> > > 
> > > If the user didn't provide "size", then we should initialize
> > > it to the actual size of the underlying storage, rather than
> > > to 0.  
> > 
> > I rather wanted to distinguish  the case when no offset or size was
> > specified to be able to limit the scope of introduced changes.  
> 
> IMHO that's a mistake - it leads to 2 different codepaths one of
> which will be mostly unused & untested, so liable to bitrot.

Hmm, valid point.


> 
> > > > +
> > > > +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&    
> > > 
> > > Why the check against bdrv_file ?  
> > 
> > To limit it only to files. Maybe there is better way to do that? The
> > devices have a nasty habit to change the size. Sure, this can happen to
> > file too, e.g. if somebody truncates the file outside QEMU. But that's
> > rather a bad behaviour. For devices changing the size may be perfectly
> > valid operation, e.g. replacing CD in drive or card in a card reader.  
> 
> The raw driver is usable over any storage backend (file, rbd, iscsi,
> etc, etc) and it is valid to want to use a offset/size parameter in
> combination with any of them. So we should not restrict it to just
> files.

The question is then, what to do when the underlying device changes? The
size/offset may not be valid at that point anymore.


> > > > +        ((s->offset > 0) || (s->assumed_size > 0))) {
> > > > +        error_setg(errp, "offset and size options are allowed only for "
> > > > +                         "files in read-only mode");
> > > > +        ret = -EINVAL;
> > > > +        goto fail;
> > > > +    }    
> > > 
> > > Why did you restrict it to read-only instead of adding the offset logic
> > > to the write & truncate methods. IMHO if we're going to add this feature
> > > we should make it work in all scenarios, not just do 1/2 the job.  
> > 
> > I still plan to do that, but I didn't feel confident enough to do it in
> > the first run.
> > 
> > We need to decide what is the correct behaviour here first. Since the
> > image we're working with is contained in some larger unit it cannot be
> > resized freely. There are two option that come to my mind:
> > 
> > 1) block truncate/grow completely,
> > 2) allow image to be truncated and grown only if the new size does not
> >    go over the original size.
> > 
> > If we go with the second option then I have a another question. How
> > strict is (or should be) qemu about the sizes being block aligned? I'm
> > still little bit fuzzy on that. Somewhere it is assumed that the size is
> > multiple of 512, sometimes qemu automatically rounds up if it isn't and
> > sometimes it seems to me that the size can be arbitrary.  
> 
> We should not make assumptions about what is a valid or invalid usage,
> as QEMU doesn't have enough knowledge to do that correctly.
> 
> IOW, we should just allow truncate with no restrictions. It is up to the
> calling app to figure out whether truncate makes sense or not.

Maybe I'm missing something here. Can the truncate operation be somehow
initiated from inside the VM, or is that only something that can be done
from outside with 'qemu-img resize'?


    Tomas
Daniel P. Berrangé Oct. 3, 2016, 11:11 a.m. UTC | #7
On Mon, Oct 03, 2016 at 01:07:07PM +0200, Tomáš Golembiovský wrote:
> On Mon, 3 Oct 2016 11:52:59 +0100
> > 
> > > > > +
> > > > > +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&    
> > > > 
> > > > Why the check against bdrv_file ?  
> > > 
> > > To limit it only to files. Maybe there is better way to do that? The
> > > devices have a nasty habit to change the size. Sure, this can happen to
> > > file too, e.g. if somebody truncates the file outside QEMU. But that's
> > > rather a bad behaviour. For devices changing the size may be perfectly
> > > valid operation, e.g. replacing CD in drive or card in a card reader.  
> > 
> > The raw driver is usable over any storage backend (file, rbd, iscsi,
> > etc, etc) and it is valid to want to use a offset/size parameter in
> > combination with any of them. So we should not restrict it to just
> > files.
> 
> The question is then, what to do when the underlying device changes? The
> size/offset may not be valid at that point anymore.

The underlying device shouldn't be changing in size without that change
going through the raw format driver

> > > > Why did you restrict it to read-only instead of adding the offset logic
> > > > to the write & truncate methods. IMHO if we're going to add this feature
> > > > we should make it work in all scenarios, not just do 1/2 the job.  
> > > 
> > > I still plan to do that, but I didn't feel confident enough to do it in
> > > the first run.
> > > 
> > > We need to decide what is the correct behaviour here first. Since the
> > > image we're working with is contained in some larger unit it cannot be
> > > resized freely. There are two option that come to my mind:
> > > 
> > > 1) block truncate/grow completely,
> > > 2) allow image to be truncated and grown only if the new size does not
> > >    go over the original size.
> > > 
> > > If we go with the second option then I have a another question. How
> > > strict is (or should be) qemu about the sizes being block aligned? I'm
> > > still little bit fuzzy on that. Somewhere it is assumed that the size is
> > > multiple of 512, sometimes qemu automatically rounds up if it isn't and
> > > sometimes it seems to me that the size can be arbitrary.  
> > 
> > We should not make assumptions about what is a valid or invalid usage,
> > as QEMU doesn't have enough knowledge to do that correctly.
> > 
> > IOW, we should just allow truncate with no restrictions. It is up to the
> > calling app to figure out whether truncate makes sense or not.
> 
> Maybe I'm missing something here. Can the truncate operation be somehow
> initiated from inside the VM, or is that only something that can be done
> from outside with 'qemu-img resize'?

Truncate is something run from the host - guests are certainly not allowed
to trigger truncate, as that would let you inflict a denial of service on
the host by consuming  disk space they should not be allowed.

Regards,
Daniel
Paolo Bonzini Oct. 3, 2016, 11:20 a.m. UTC | #8
On 03/10/2016 12:47, Tomáš Golembiovský wrote:
> On Mon, 3 Oct 2016 11:20:44 +0200
> Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
>> On 03/10/2016 10:52, Daniel P. Berrange wrote:
>>> On Sun, Oct 02, 2016 at 09:13:29PM +0200, Tomáš Golembiovský wrote:  
>>>> Added two new options 'offset' and 'size'. This makes it possible to use
>>>> only part of the file as a device. This can be used e.g. to limit the
>>>> access only to single partition in a disk image or use a disk inside a
>>>> tar archive (like OVA).
>>>>
>>>> For now this is only possible for files in read-only mode. It should be
>>>> possible to extend it later to allow read-write mode, but would probably
>>>> require that the size of the device is kept constant (i.e. no resizing).
>>>>
>>>> Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
>>>> ---
>>>>  block/raw-posix.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>>>>  1 file changed, 91 insertions(+), 6 deletions(-)  
>>>
>>> An equivalent change is needed to raw-win32.c  
>>
>> Actually, it should be done _only_ in block/raw_bsd.c.
> 
> You mean in raw-posix.c and in raw_bsd.c, or just raw_bsd.c?

Just raw_bsd.c.

Paolo
Eric Blake Oct. 3, 2016, 3:28 p.m. UTC | #9
On 10/02/2016 02:13 PM, Tomáš Golembiovský wrote:
> Added two new options 'offset' and 'size'. This makes it possible to use
> only part of the file as a device. This can be used e.g. to limit the
> access only to single partition in a disk image or use a disk inside a
> tar archive (like OVA).
> 
> For now this is only possible for files in read-only mode. It should be
> possible to extend it later to allow read-write mode, but would probably
> require that the size of the device is kept constant (i.e. no resizing).
> 
> Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
> ---
>  block/raw-posix.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 91 insertions(+), 6 deletions(-)
> 

> +    s->offset = qemu_opt_get_size(opts, "offset", 0);
> +    s->assumed_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
> +
> +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&
> +        ((s->offset > 0) || (s->assumed_size > 0))) {

Over-parenthesized.

> @@ -443,6 +468,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
>      }
>      s->fd = fd;
>  
> +    /* Check size and offset */
> +    real_size = raw_getlength_real(bs);
> +    if (real_size < (s->offset + s->assumed_size)) {
> +        if (s->assumed_size == 0) {
> +            error_setg(errp, "The offset has to be smaller than actual size "
> +                             "of the containing file (%ld) ",
> +                             real_size);

Won't compile on 32-bit platforms.  %ld is not necessarily synonymous
with int64_t; you need to use PRId64 instead.  Multiple times through
your patch.
Kevin Wolf Oct. 4, 2016, 8:57 a.m. UTC | #10
Am 03.10.2016 um 13:07 hat Tomáš Golembiovský geschrieben:
> > > > > +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&    
> > > > 
> > > > Why the check against bdrv_file ?  
> > > 
> > > To limit it only to files. Maybe there is better way to do that? The
> > > devices have a nasty habit to change the size. Sure, this can happen to
> > > file too, e.g. if somebody truncates the file outside QEMU. But that's
> > > rather a bad behaviour. For devices changing the size may be perfectly
> > > valid operation, e.g. replacing CD in drive or card in a card reader.  
> > 
> > The raw driver is usable over any storage backend (file, rbd, iscsi,
> > etc, etc) and it is valid to want to use a offset/size parameter in
> > combination with any of them. So we should not restrict it to just
> > files.

Just to clear up some confusion here: There are the file/host_device/...
protocol drivers, which only access local files. These are implemented
in raw-posix.c, i.e. the file that this patch is touching. raw-win32.c
implements the same kind of file access for Windows.

And then there is the raw image format driver, which is raw_bsd.c. This
one is what is layered on top of any of the protocol drivers, including
raw-posix, raw-win32 and all the network protocols.

This is why implementing the feature in the raw format driver would make
it so much more useful. You could use it with any backend then.

> The question is then, what to do when the underlying device changes?
> The size/offset may not be valid at that point anymore.

A user who runs qemu on a block device must make sure that the device
makes sense while the VM is running. Second guessing is not qemu's job.

> > > > Why did you restrict it to read-only instead of adding the offset logic
> > > > to the write & truncate methods. IMHO if we're going to add this feature
> > > > we should make it work in all scenarios, not just do 1/2 the job.  
> > > 
> > > I still plan to do that, but I didn't feel confident enough to do it in
> > > the first run.
> > > 
> > > We need to decide what is the correct behaviour here first. Since the
> > > image we're working with is contained in some larger unit it cannot be
> > > resized freely. There are two option that come to my mind:
> > > 
> > > 1) block truncate/grow completely,
> > > 2) allow image to be truncated and grown only if the new size does not
> > >    go over the original size.

As long as there is only an offset, apply the offset and do the same
operation as usual.

If a size was specified, then maybe we should block the operation if it
requires changing the the size, i.e. it would behave the same as if we
were working on a real block device.

You could then still use bdrv_reopen() with new option values in order
to extend the visible size.

> > > If we go with the second option then I have a another question. How
> > > strict is (or should be) qemu about the sizes being block aligned? I'm
> > > still little bit fuzzy on that. Somewhere it is assumed that the size is
> > > multiple of 512, sometimes qemu automatically rounds up if it isn't and
> > > sometimes it seems to me that the size can be arbitrary.  

Traditionally, the qemu block layer worked in 512 byte units and you
still see places where this is true. But we're in a process of
converting things to use bytes everywhere (which saves a lot of
conversion back and forth and simplifies the code), and the basic
read/write functions in the primary block drivers (raw, qcow2, file) can
all work with byte granularity today.

Kevin
Daniel P. Berrangé Oct. 4, 2016, 9:15 a.m. UTC | #11
On Tue, Oct 04, 2016 at 10:57:49AM +0200, Kevin Wolf wrote:
> Am 03.10.2016 um 13:07 hat Tomáš Golembiovský geschrieben:
> > > > > > +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&    
> > > > > 
> > > > > Why the check against bdrv_file ?  
> > > > 
> > > > To limit it only to files. Maybe there is better way to do that? The
> > > > devices have a nasty habit to change the size. Sure, this can happen to
> > > > file too, e.g. if somebody truncates the file outside QEMU. But that's
> > > > rather a bad behaviour. For devices changing the size may be perfectly
> > > > valid operation, e.g. replacing CD in drive or card in a card reader.  
> > > 
> > > The raw driver is usable over any storage backend (file, rbd, iscsi,
> > > etc, etc) and it is valid to want to use a offset/size parameter in
> > > combination with any of them. So we should not restrict it to just
> > > files.
> 
> Just to clear up some confusion here: There are the file/host_device/...
> protocol drivers, which only access local files. These are implemented
> in raw-posix.c, i.e. the file that this patch is touching. raw-win32.c
> implements the same kind of file access for Windows.

This naming is constantly confusing - is there any appetite for renaming
those to 'file-posix.c' and 'file-win32.c', and raw_bsd.c to raw.c ?

Regards,
Daniel
Kevin Wolf Oct. 4, 2016, 9:24 a.m. UTC | #12
Am 02.10.2016 um 21:13 hat Tomáš Golembiovský geschrieben:
> Added two new options 'offset' and 'size'. This makes it possible to use
> only part of the file as a device. This can be used e.g. to limit the
> access only to single partition in a disk image or use a disk inside a
> tar archive (like OVA).
> 
> For now this is only possible for files in read-only mode. It should be
> possible to extend it later to allow read-write mode, but would probably
> require that the size of the device is kept constant (i.e. no resizing).
> 
> Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
> ---
>  block/raw-posix.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 91 insertions(+), 6 deletions(-)

Sounds like a very useful feature. I agree with comments made by others
that we should include write support from the start, and that the code
should be moved to raw_bsd.c.

You also need to update the options for the blockdev-add QMP command in
qapi/block-core.json. The relevant type is BlockdevOptionsFile.

> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 6ed7547..36f2712 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -136,6 +136,8 @@ typedef struct BDRVRawState {
>      int type;
>      int open_flags;
>      size_t buf_align;
> +    uint64_t offset;
> +    uint64_t assumed_size;
>  
>  #ifdef CONFIG_XFS
>      bool is_xfs:1;
> @@ -154,6 +156,7 @@ typedef struct BDRVRawReopenState {
>  
>  static int fd_open(BlockDriverState *bs);
>  static int64_t raw_getlength(BlockDriverState *bs);
> +static int64_t raw_getlength_real(BlockDriverState *bs);
>  
>  typedef struct RawPosixAIOData {
>      BlockDriverState *bs;
> @@ -399,6 +402,16 @@ static QemuOptsList raw_runtime_opts = {
>              .type = QEMU_OPT_STRING,
>              .help = "File name of the image",
>          },
> +        {
> +            .name = "offset",
> +            .type = QEMU_OPT_SIZE,
> +            .help = "Offset into the file"
> +        },
> +        {
> +            .name = BLOCK_OPT_SIZE,
> +            .type = QEMU_OPT_SIZE,
> +            .help = "Virtual disk size"
> +        },

BLOCK_OPT_SIZE is an option for bdrv_create(). I'd either create a
BDRV_OPT_SIZE to be consistent with other bdrv_open() option name
macros, or just use a string literal here.

>          { /* end of list */ }
>      },
>  };
> @@ -412,6 +425,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
>      const char *filename = NULL;
>      int fd, ret;
>      struct stat st;
> +    int64_t real_size;
>  
>      opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
>      qemu_opts_absorb_qdict(opts, options, &local_err);
> @@ -421,6 +435,17 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
>          goto fail;
>      }
>  
> +    s->offset = qemu_opt_get_size(opts, "offset", 0);
> +    s->assumed_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
> +
> +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&
> +        ((s->offset > 0) || (s->assumed_size > 0))) {
> +        error_setg(errp, "offset and size options are allowed only for "
> +                         "files in read-only mode");
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
>      filename = qemu_opt_get(opts, "filename");
>  
>      ret = raw_normalize_devicepath(&filename);
> @@ -443,6 +468,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
>      }
>      s->fd = fd;
>  
> +    /* Check size and offset */
> +    real_size = raw_getlength_real(bs);

raw_getlength() can fail, so real_size < 0 should be handled separately
(otherwise we end up with an error message that doesn't make a lot of
sense).

> +    if (real_size < (s->offset + s->assumed_size)) {
> +        if (s->assumed_size == 0) {
> +            error_setg(errp, "The offset has to be smaller than actual size "
> +                             "of the containing file (%ld) ",
> +                             real_size);
> +        } else {
> +            error_setg(errp, "The sum of offset (%lu) and size (%lu) has to "
> +                             "be smaller than actual size of the containing "
> +                             "file (%ld) ",
> +                             s->offset, s->assumed_size, real_size);

I think the need for PRId64/PRIu64 has been mentioned.

Also s/has to be smaller/must not be larger/. Equality is allowed.

> +        }
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
>  #ifdef CONFIG_LINUX_AIO
>      if (!raw_use_aio(bdrv_flags) && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
>          error_setg(errp, "aio=native was specified, but it requires "
> @@ -1271,6 +1313,19 @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
>                                        uint64_t bytes, QEMUIOVector *qiov,
>                                        int flags)
>  {
> +    BDRVRawState *s = bs->opaque;
> +    if (s->assumed_size > 0) {
> +        if (offset > s->assumed_size) {
> +            /* Attempt to read beyond EOF */
> +            return 0;
> +        } else if ((offset + bytes) > s->assumed_size) {
> +            /* Trying to read more than is available */
> +            bytes = s->assumed_size - offset;
> +        }
> +    }

As long as we make sure to return the right values in .bdrv_getlength(),
we can assert(!s->assumed_size || offset + bytes <= s->assumed_size)
instead of doing all of this.

The block layer already handles reads across and beyond EOF (by
returning a zeroed area, not ignoring half of the request as this code
would do).

> +    offset += s->offset;
> +
>      return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
>  }
>  
> @@ -1348,7 +1403,7 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset)
>  }
>  
>  #ifdef __OpenBSD__
> -static int64_t raw_getlength(BlockDriverState *bs)
> +static int64_t raw_getlength_real(BlockDriverState *bs)
>  {
>      BDRVRawState *s = bs->opaque;
>      int fd = s->fd;
> @@ -1367,7 +1422,7 @@ static int64_t raw_getlength(BlockDriverState *bs)
>          return st.st_size;
>  }
>  #elif defined(__NetBSD__)
> -static int64_t raw_getlength(BlockDriverState *bs)
> +static int64_t raw_getlength_real(BlockDriverState *bs)
>  {
>      BDRVRawState *s = bs->opaque;
>      int fd = s->fd;
> @@ -1392,7 +1447,7 @@ static int64_t raw_getlength(BlockDriverState *bs)
>          return st.st_size;
>  }
>  #elif defined(__sun__)
> -static int64_t raw_getlength(BlockDriverState *bs)
> +static int64_t raw_getlength_real(BlockDriverState *bs)
>  {
>      BDRVRawState *s = bs->opaque;
>      struct dk_minfo minfo;
> @@ -1423,7 +1478,7 @@ static int64_t raw_getlength(BlockDriverState *bs)
>      return size;
>  }
>  #elif defined(CONFIG_BSD)
> -static int64_t raw_getlength(BlockDriverState *bs)
> +static int64_t raw_getlength_real(BlockDriverState *bs)
>  {
>      BDRVRawState *s = bs->opaque;
>      int fd = s->fd;
> @@ -1497,7 +1552,7 @@ again:
>      return size;
>  }
>  #else
> -static int64_t raw_getlength(BlockDriverState *bs)
> +static int64_t raw_getlength_real(BlockDriverState *bs)
>  {
>      BDRVRawState *s = bs->opaque;
>      int ret;
> @@ -1516,6 +1571,28 @@ static int64_t raw_getlength(BlockDriverState *bs)
>  }
>  #endif
>  
> +static int64_t raw_getlength(BlockDriverState *bs)
> +{
> +    BDRVRawState *s = bs->opaque;
> +
> +    if (s->assumed_size > 0) {
> +        return (int64_t)s->assumed_size;
> +    }
> +
> +    int64_t size = raw_getlength_real(bs);
> +    if (s->offset > 0) {
> +        if (s->offset > size) {
> +            /* The size has changed! We didn't expect that. */
> +            return -EIO;

You're overwriting any error return code with -EIO here. I think the
correct way to phrase it would be:

if (size >= 0 && s->offset > size) {
    return -EIO;
}
return size - s->offset;

You can subtract s->offset unconditionally as subtracting 0 doesn't
hurt.
>
> +        }
> +        size -= s->offset;
> +    }
> +
> +    return size;
> +}
> +
> +
> +

Why three newlines?

>  static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
>  {
>      struct stat st;
> @@ -1524,7 +1601,15 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
>      if (fstat(s->fd, &st) < 0) {
>          return -errno;
>      }
> -    return (int64_t)st.st_blocks * 512;
> +    uint64_t size = st.st_blocks * 512;
> +    /* If the file is sparse we have no idea which part of the file is
> +     * allocated and which is not. So we just make sure the returned value is
> +     * not greater than what we're working with.
> +     */
> +    if (s->assumed_size > 0 && s->assumed_size < size) {
> +        size = s->assumed_size;
> +    }
> +    return (int64_t)size;
>  }

This function isn't on a hot path, so maybe even using a
bdrv_get_block_status() loop to determine the actual allocation status
could be defendable here. I won't insist on it, though.

Kevin
Paolo Bonzini Oct. 4, 2016, 10:12 a.m. UTC | #13
On 04/10/2016 11:24, Kevin Wolf wrote:
> You also need to update the options for the blockdev-add QMP command in
> qapi/block-core.json. The relevant type is BlockdevOptionsFile.

Is it? Or should he define a new BlockdevOptionsRaw, and point to it in
the BlockdevOptions union instead of "'raw':
'BlockdevOptionsGenericFormat'"?

Paolo
Tomáš Golembiovský Oct. 4, 2016, 11:48 a.m. UTC | #14
On Tue, 4 Oct 2016 10:57:49 +0200
Kevin Wolf <kwolf@redhat.com> wrote:

> Am 03.10.2016 um 13:07 hat Tomáš Golembiovský geschrieben:
> > > > > > +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&      
> > > > > 
> > > > > Why the check against bdrv_file ?    
> > > > 
> > > > To limit it only to files. Maybe there is better way to do that? The
> > > > devices have a nasty habit to change the size. Sure, this can happen to
> > > > file too, e.g. if somebody truncates the file outside QEMU. But that's
> > > > rather a bad behaviour. For devices changing the size may be perfectly
> > > > valid operation, e.g. replacing CD in drive or card in a card reader.    
> > > 
> > > The raw driver is usable over any storage backend (file, rbd, iscsi,
> > > etc, etc) and it is valid to want to use a offset/size parameter in
> > > combination with any of them. So we should not restrict it to just
> > > files.  
> 
> Just to clear up some confusion here: There are the file/host_device/...
> protocol drivers, which only access local files. These are implemented
> in raw-posix.c, i.e. the file that this patch is touching. raw-win32.c
> implements the same kind of file access for Windows.
> 
> And then there is the raw image format driver, which is raw_bsd.c. This
> one is what is layered on top of any of the protocol drivers, including
> raw-posix, raw-win32 and all the network protocols.
> 
> This is why implementing the feature in the raw format driver would make
> it so much more useful. You could use it with any backend then.
> 

Thanks Kevin, this really helped. I have to admit the naming is quite
confusing.


> > > > If we go with the second option then I have a another question. How
> > > > strict is (or should be) qemu about the sizes being block aligned? I'm
> > > > still little bit fuzzy on that. Somewhere it is assumed that the size is
> > > > multiple of 512, sometimes qemu automatically rounds up if it isn't and
> > > > sometimes it seems to me that the size can be arbitrary.    
> 
> Traditionally, the qemu block layer worked in 512 byte units and you
> still see places where this is true. But we're in a process of
> converting things to use bytes everywhere (which saves a lot of
> conversion back and forth and simplifies the code), and the basic
> read/write functions in the primary block drivers (raw, qcow2, file) can
> all work with byte granularity today.

Again thanks for clearing that up a bit.


    Tomas
Kevin Wolf Oct. 4, 2016, 11:59 a.m. UTC | #15
Am 04.10.2016 um 12:12 hat Paolo Bonzini geschrieben:
> 
> 
> On 04/10/2016 11:24, Kevin Wolf wrote:
> > You also need to update the options for the blockdev-add QMP command in
> > qapi/block-core.json. The relevant type is BlockdevOptionsFile.
> 
> Is it? Or should he define a new BlockdevOptionsRaw, and point to it in
> the BlockdevOptions union instead of "'raw':
> 'BlockdevOptionsGenericFormat'"?

It would be the right type for the raw-posix patch as it exists today.
But you're right that for adding the option to the raw format
driver, as we requested, a new BlockdevOptionsRaw (which inherits from
BlockdevOptionsGenericFormat) is needed.

Kevin
Eric Blake Oct. 4, 2016, 1:58 p.m. UTC | #16
On 10/04/2016 04:15 AM, Daniel P. Berrange wrote:
> On Tue, Oct 04, 2016 at 10:57:49AM +0200, Kevin Wolf wrote:
>> Am 03.10.2016 um 13:07 hat Tomáš Golembiovský geschrieben:
>>>>>>> +    if (((bs->drv != &bdrv_file) || !bs->read_only) &&    
>>>>>>
>>>>>> Why the check against bdrv_file ?  
>>>>>
>>>>> To limit it only to files. Maybe there is better way to do that? The
>>>>> devices have a nasty habit to change the size. Sure, this can happen to
>>>>> file too, e.g. if somebody truncates the file outside QEMU. But that's
>>>>> rather a bad behaviour. For devices changing the size may be perfectly
>>>>> valid operation, e.g. replacing CD in drive or card in a card reader.  
>>>>
>>>> The raw driver is usable over any storage backend (file, rbd, iscsi,
>>>> etc, etc) and it is valid to want to use a offset/size parameter in
>>>> combination with any of them. So we should not restrict it to just
>>>> files.
>>
>> Just to clear up some confusion here: There are the file/host_device/...
>> protocol drivers, which only access local files. These are implemented
>> in raw-posix.c, i.e. the file that this patch is touching. raw-win32.c
>> implements the same kind of file access for Windows.
> 
> This naming is constantly confusing - is there any appetite for renaming
> those to 'file-posix.c' and 'file-win32.c', and raw_bsd.c to raw.c ?

I would welcome that naming change.  The mix of - vs. _ is bad enough;
and _bsd makes me think BSD (non-Linux) rather than shared block device
code.
diff mbox

Patch

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 6ed7547..36f2712 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -136,6 +136,8 @@  typedef struct BDRVRawState {
     int type;
     int open_flags;
     size_t buf_align;
+    uint64_t offset;
+    uint64_t assumed_size;
 
 #ifdef CONFIG_XFS
     bool is_xfs:1;
@@ -154,6 +156,7 @@  typedef struct BDRVRawReopenState {
 
 static int fd_open(BlockDriverState *bs);
 static int64_t raw_getlength(BlockDriverState *bs);
+static int64_t raw_getlength_real(BlockDriverState *bs);
 
 typedef struct RawPosixAIOData {
     BlockDriverState *bs;
@@ -399,6 +402,16 @@  static QemuOptsList raw_runtime_opts = {
             .type = QEMU_OPT_STRING,
             .help = "File name of the image",
         },
+        {
+            .name = "offset",
+            .type = QEMU_OPT_SIZE,
+            .help = "Offset into the file"
+        },
+        {
+            .name = BLOCK_OPT_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Virtual disk size"
+        },
         { /* end of list */ }
     },
 };
@@ -412,6 +425,7 @@  static int raw_open_common(BlockDriverState *bs, QDict *options,
     const char *filename = NULL;
     int fd, ret;
     struct stat st;
+    int64_t real_size;
 
     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
     qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -421,6 +435,17 @@  static int raw_open_common(BlockDriverState *bs, QDict *options,
         goto fail;
     }
 
+    s->offset = qemu_opt_get_size(opts, "offset", 0);
+    s->assumed_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
+
+    if (((bs->drv != &bdrv_file) || !bs->read_only) &&
+        ((s->offset > 0) || (s->assumed_size > 0))) {
+        error_setg(errp, "offset and size options are allowed only for "
+                         "files in read-only mode");
+        ret = -EINVAL;
+        goto fail;
+    }
+
     filename = qemu_opt_get(opts, "filename");
 
     ret = raw_normalize_devicepath(&filename);
@@ -443,6 +468,23 @@  static int raw_open_common(BlockDriverState *bs, QDict *options,
     }
     s->fd = fd;
 
+    /* Check size and offset */
+    real_size = raw_getlength_real(bs);
+    if (real_size < (s->offset + s->assumed_size)) {
+        if (s->assumed_size == 0) {
+            error_setg(errp, "The offset has to be smaller than actual size "
+                             "of the containing file (%ld) ",
+                             real_size);
+        } else {
+            error_setg(errp, "The sum of offset (%lu) and size (%lu) has to "
+                             "be smaller than actual size of the containing "
+                             "file (%ld) ",
+                             s->offset, s->assumed_size, real_size);
+        }
+        ret = -EINVAL;
+        goto fail;
+    }
+
 #ifdef CONFIG_LINUX_AIO
     if (!raw_use_aio(bdrv_flags) && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
         error_setg(errp, "aio=native was specified, but it requires "
@@ -1271,6 +1313,19 @@  static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
                                       uint64_t bytes, QEMUIOVector *qiov,
                                       int flags)
 {
+    BDRVRawState *s = bs->opaque;
+    if (s->assumed_size > 0) {
+        if (offset > s->assumed_size) {
+            /* Attempt to read beyond EOF */
+            return 0;
+        } else if ((offset + bytes) > s->assumed_size) {
+            /* Trying to read more than is available */
+            bytes = s->assumed_size - offset;
+        }
+    }
+
+    offset += s->offset;
+
     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
 }
 
@@ -1348,7 +1403,7 @@  static int raw_truncate(BlockDriverState *bs, int64_t offset)
 }
 
 #ifdef __OpenBSD__
-static int64_t raw_getlength(BlockDriverState *bs)
+static int64_t raw_getlength_real(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
     int fd = s->fd;
@@ -1367,7 +1422,7 @@  static int64_t raw_getlength(BlockDriverState *bs)
         return st.st_size;
 }
 #elif defined(__NetBSD__)
-static int64_t raw_getlength(BlockDriverState *bs)
+static int64_t raw_getlength_real(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
     int fd = s->fd;
@@ -1392,7 +1447,7 @@  static int64_t raw_getlength(BlockDriverState *bs)
         return st.st_size;
 }
 #elif defined(__sun__)
-static int64_t raw_getlength(BlockDriverState *bs)
+static int64_t raw_getlength_real(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
     struct dk_minfo minfo;
@@ -1423,7 +1478,7 @@  static int64_t raw_getlength(BlockDriverState *bs)
     return size;
 }
 #elif defined(CONFIG_BSD)
-static int64_t raw_getlength(BlockDriverState *bs)
+static int64_t raw_getlength_real(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
     int fd = s->fd;
@@ -1497,7 +1552,7 @@  again:
     return size;
 }
 #else
-static int64_t raw_getlength(BlockDriverState *bs)
+static int64_t raw_getlength_real(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
     int ret;
@@ -1516,6 +1571,28 @@  static int64_t raw_getlength(BlockDriverState *bs)
 }
 #endif
 
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->assumed_size > 0) {
+        return (int64_t)s->assumed_size;
+    }
+
+    int64_t size = raw_getlength_real(bs);
+    if (s->offset > 0) {
+        if (s->offset > size) {
+            /* The size has changed! We didn't expect that. */
+            return -EIO;
+        }
+        size -= s->offset;
+    }
+
+    return size;
+}
+
+
+
 static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
 {
     struct stat st;
@@ -1524,7 +1601,15 @@  static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
     if (fstat(s->fd, &st) < 0) {
         return -errno;
     }
-    return (int64_t)st.st_blocks * 512;
+    uint64_t size = st.st_blocks * 512;
+    /* If the file is sparse we have no idea which part of the file is
+     * allocated and which is not. So we just make sure the returned value is
+     * not greater than what we're working with.
+     */
+    if (s->assumed_size > 0 && s->assumed_size < size) {
+        size = s->assumed_size;
+    }
+    return (int64_t)size;
 }
 
 static int raw_create(const char *filename, QemuOpts *opts, Error **errp)