
[RFC] glusterfs: allow partial reads

Message ID 1480589964-29411-1-git-send-email-w.bumiller@proxmox.com
State New

Commit Message

Wolfgang Bumiller Dec. 1, 2016, 10:59 a.m. UTC
Fixes #1644754.

Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
---
I'm not sure what the original rationale was to treat both partial
reads as well as writes as an I/O error. (Seems to have happened
from original glusterfs v1 to v2 series with a note but no reasoning
for the read side as far as I could see.)
The general direction lately seems to be to move away from sector
based block APIs. Also eg. the NFS code allows partial reads. (It
does, however, have an old patch (c2eb918e3) dedicated to aligning
sizes to 512 byte boundaries for file creation for compatibility with
other parts of qemu like qcow2. This already happens in glusterfs,
though, but if you move a file from a different storage over to
glusterfs you may end up with a qcow2 file with eg. the L1 table in
the last 80 bytes of the file aligned to _begin_ at a 512 boundary,
but not _end_ at one.)

 block/gluster.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

Comments

Eric Blake Dec. 2, 2016, 7:13 p.m. UTC | #1
On 12/01/2016 04:59 AM, Wolfgang Bumiller wrote:
> Fixes #1644754.
> 
> Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
> ---
> I'm not sure what the original rationale was to treat both partial
> reads as well as writes as an I/O error. (Seems to have happened
> from original glusterfs v1 to v2 series with a note but no reasoning
> for the read side as far as I could see.)
> The general direction lately seems to be to move away from sector
> based block APIs. Also eg. the NFS code allows partial reads. (It
> does, however, have an old patch (c2eb918e3) dedicated to aligning
> sizes to 512 byte boundaries for file creation for compatibility with
> other parts of qemu like qcow2. This already happens in glusterfs,
> though, but if you move a file from a different storage over to
> glusterfs you may end up with a qcow2 file with eg. the L1 table in
> the last 80 bytes of the file aligned to _begin_ at a 512 boundary,
> but not _end_ at one.)
> 
>  block/gluster.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/block/gluster.c b/block/gluster.c
> index 891c13b..3db0bf8 100644
> --- a/block/gluster.c
> +++ b/block/gluster.c
> @@ -41,6 +41,7 @@ typedef struct GlusterAIOCB {
>      int ret;
>      Coroutine *coroutine;
>      AioContext *aio_context;
> +    bool is_write;
>  } GlusterAIOCB;
>  
>  typedef struct BDRVGlusterState {
> @@ -716,8 +717,10 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
>          acb->ret = 0; /* Success */
>      } else if (ret < 0) {
>          acb->ret = -errno; /* Read/Write failed */
> +    } else if (acb->is_write) {
> +        acb->ret = -EIO; /* Partial write - fail it */
>      } else {
> -        acb->ret = -EIO; /* Partial read/write - fail it */
> +        acb->ret = 0; /* Success */

Does this properly guarantee that the portion beyond EOF reads as zero?

Would it be better to switch to byte-based interfaces rather than
continue to force gluster interaction in 512-byte sector chunks, since
gluster can obviously store files that are not 512-aligned?
Wolfgang Bumiller Dec. 5, 2016, 8:26 a.m. UTC | #2
On Fri, Dec 02, 2016 at 01:13:28PM -0600, Eric Blake wrote:
> On 12/01/2016 04:59 AM, Wolfgang Bumiller wrote:
> > Fixes #1644754.
> > 
> > Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
> > ---
> > I'm not sure what the original rationale was to treat both partial
> > reads as well as writes as an I/O error. (Seems to have happened
> > from original glusterfs v1 to v2 series with a note but no reasoning
> > for the read side as far as I could see.)
> > The general direction lately seems to be to move away from sector
> > based block APIs. Also eg. the NFS code allows partial reads. (It
> > does, however, have an old patch (c2eb918e3) dedicated to aligning
> > sizes to 512 byte boundaries for file creation for compatibility with
> > other parts of qemu like qcow2. This already happens in glusterfs,
> > though, but if you move a file from a different storage over to
> > glusterfs you may end up with a qcow2 file with eg. the L1 table in
> > the last 80 bytes of the file aligned to _begin_ at a 512 boundary,
> > but not _end_ at one.)
> > 
> >  block/gluster.c | 10 +++++++++-
> >  1 file changed, 9 insertions(+), 1 deletion(-)
> > 
> > diff --git a/block/gluster.c b/block/gluster.c
> > index 891c13b..3db0bf8 100644
> > --- a/block/gluster.c
> > +++ b/block/gluster.c
> > @@ -41,6 +41,7 @@ typedef struct GlusterAIOCB {
> >      int ret;
> >      Coroutine *coroutine;
> >      AioContext *aio_context;
> > +    bool is_write;
> >  } GlusterAIOCB;
> >  
> >  typedef struct BDRVGlusterState {
> > @@ -716,8 +717,10 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
> >          acb->ret = 0; /* Success */
> >      } else if (ret < 0) {
> >          acb->ret = -errno; /* Read/Write failed */
> > +    } else if (acb->is_write) {
> > +        acb->ret = -EIO; /* Partial write - fail it */
> >      } else {
> > -        acb->ret = -EIO; /* Partial read/write - fail it */
> > +        acb->ret = 0; /* Success */
> 
> Does this properly guarantee that the portion beyond EOF reads as zero?

I'd argue this wasn't necessarily the case before either, considering
the first check starts with `!ret`:

    if (!ret || ret == acb->size) {
        acb->ret = 0; /* Success */

A read right at EOF would return 0 and be treated as success there, no?
Iow. it wouldn't zero out the destination buffer as far as I can see.
Come to think of it, I'm not too fond of this part of the check for the
write case either.
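
Just to illustrate the point, a rough sketch of a completion callback that
would zero-fill whatever a short read did not cover, so data beyond EOF
reads back as zeroes (hypothetical, not part of this patch; it assumes
GlusterAIOCB carried a pointer to the request's QEMUIOVector, which it
currently does not):

    static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
    {
        GlusterAIOCB *acb = arg;

        if (ret < 0) {
            acb->ret = -errno;              /* request failed */
        } else if (!acb->is_write && ret < acb->size) {
            /* Short read (including 0 bytes right at EOF): zero-fill the
             * unread part of the destination buffer and report success.
             * NOTE: 'qiov' is an assumed extra GlusterAIOCB field, not
             * present in the current struct. */
            qemu_iovec_memset(acb->qiov, ret, 0, acb->size - ret);
            acb->ret = 0;
        } else if (!ret || ret == acb->size) {
            acb->ret = 0;                   /* full read/write, or flush/discard */
        } else {
            acb->ret = -EIO;                /* partial write - fail it */
        }

        aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb);
    }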

> Would it be better to switch to byte-based interfaces rather than
> continue to force gluster interaction in 512-byte sector chunks, since
> gluster can obviously store files that are not 512-aligned?
Kevin Wolf Dec. 6, 2016, 9:59 a.m. UTC | #3
Am 05.12.2016 um 09:26 hat Wolfgang Bumiller geschrieben:
> On Fri, Dec 02, 2016 at 01:13:28PM -0600, Eric Blake wrote:
> > On 12/01/2016 04:59 AM, Wolfgang Bumiller wrote:
> > > Fixes #1644754.
> > > 
> > > Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
> > > ---
> > > I'm not sure what the original rationale was to treat both partial
> > > reads as well as writes as an I/O error. (Seems to have happened
> > > from original glusterfs v1 to v2 series with a note but no reasoning
> > > for the read side as far as I could see.)
> > > The general direction lately seems to be to move away from sector
> > > based block APIs. Also eg. the NFS code allows partial reads. (It
> > > does, however, have an old patch (c2eb918e3) dedicated to aligning
> > > sizes to 512 byte boundaries for file creation for compatibility with
> > > other parts of qemu like qcow2. This already happens in glusterfs,
> > > though, but if you move a file from a different storage over to
> > > glusterfs you may end up with a qcow2 file with eg. the L1 table in
> > > the last 80 bytes of the file aligned to _begin_ at a 512 boundary,
> > > but not _end_ at one.)

Hm, does this really happen? I always thought that the file size of
qcow2 images is aligned to the cluster size. If it isn't, maybe we
should fix that.

> > >  block/gluster.c | 10 +++++++++-
> > >  1 file changed, 9 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/block/gluster.c b/block/gluster.c
> > > index 891c13b..3db0bf8 100644
> > > --- a/block/gluster.c
> > > +++ b/block/gluster.c
> > > @@ -41,6 +41,7 @@ typedef struct GlusterAIOCB {
> > >      int ret;
> > >      Coroutine *coroutine;
> > >      AioContext *aio_context;
> > > +    bool is_write;
> > >  } GlusterAIOCB;
> > >  
> > >  typedef struct BDRVGlusterState {
> > > @@ -716,8 +717,10 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
> > >          acb->ret = 0; /* Success */
> > >      } else if (ret < 0) {
> > >          acb->ret = -errno; /* Read/Write failed */
> > > +    } else if (acb->is_write) {
> > > +        acb->ret = -EIO; /* Partial write - fail it */
> > >      } else {
> > > -        acb->ret = -EIO; /* Partial read/write - fail it */
> > > +        acb->ret = 0; /* Success */
> > 
> > Does this properly guarantee that the portion beyond EOF reads as zero?
> 
> I'd argue this wasn't necessarily the case before either, considering
> the first check starts with `!ret`:
> 
>     if (!ret || ret == acb->size) {
>         acb->ret = 0; /* Success */
> 
> A read right at EOF would return 0 and be treated as success there, no?

Yes, this is a bug.

I guess this was the lazy way that "usually" works both for
reads/writes, which return a positive number of bytes, and for things
like flush which return 0 on success. But the callback really needs to
distinguish these cases and apply different checks.

> Iow. it wouldn't zero out the destination buffer as far as I can see.
> Come to think of it, I'm not too fond of this part of the check for the
> write case either.

raw-posix treats short reads as success, too, but it zeroes out the
missing part. Note that it also loops after a short read and only if it
reads 0 bytes then, it returns success. If an error is returned after
the short read, the whole function returns an error. Is this necessary
for gluster, too?
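
For reference, a simplified standalone sketch of that pattern (illustrative
only, not the actual raw-posix code):

    #include <errno.h>
    #include <string.h>
    #include <unistd.h>

    /* Retry after a short read; a read of 0 bytes means EOF, so the rest of
     * the buffer is zero-filled and the request succeeds; an error returned
     * after a short read fails the whole request. */
    static ssize_t read_full_or_zero_pad(int fd, char *buf, size_t bytes,
                                         off_t offset)
    {
        size_t done = 0;

        while (done < bytes) {
            ssize_t len = pread(fd, buf + done, bytes - done, offset + done);
            if (len < 0) {
                if (errno == EINTR) {
                    continue;          /* interrupted, just retry */
                }
                return -errno;         /* error, even after a short read */
            }
            if (len == 0) {
                /* EOF: everything beyond it must read back as zeroes */
                memset(buf + done, 0, bytes - done);
                break;
            }
            done += len;
        }
        return bytes;
    }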

> > Would it be better to switch to byte-based interfaces rather than
> > continue to force gluster interaction in 512-byte sector chunks,
> > since gluster can obviously store files that are not 512-aligned?

The gluster I/O functions are byte-based anyway, and the driver already
implements .bdrv_co_readv, so going to .bdrv_co_preadv should be
trivial. Probably the best solution here indeed.
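
A rough sketch of what such a byte-based read callback could look like,
reusing the existing GlusterAIOCB plumbing (hypothetical, not part of this
patch):

    /* .bdrv_co_preadv already receives a byte offset and length, so no
     * sector-to-byte conversion is needed before glfs_preadv_async(). */
    static coroutine_fn int qemu_gluster_co_preadv(BlockDriverState *bs,
                                                   uint64_t offset,
                                                   uint64_t bytes,
                                                   QEMUIOVector *qiov,
                                                   int flags)
    {
        BDRVGlusterState *s = bs->opaque;
        GlusterAIOCB acb;
        int ret;

        acb.size = bytes;
        acb.ret = 0;
        acb.coroutine = qemu_coroutine_self();
        acb.aio_context = bdrv_get_aio_context(bs);
        acb.is_write = false;

        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                gluster_finish_aiocb, &acb);
        if (ret < 0) {
            return -errno;
        }

        qemu_coroutine_yield();
        return acb.ret;
    }

The write side would look the same with glfs_pwritev_async() and
is_write = true.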

Kevin
Eric Blake Dec. 6, 2016, 3:02 p.m. UTC | #4
On 12/06/2016 03:59 AM, Kevin Wolf wrote:

>>>> I'm not sure what the original rationale was to treat both partial
>>>> reads as well as writes as an I/O error. (Seems to have happened
>>>> from original glusterfs v1 to v2 series with a note but no reasoning
>>>> for the read side as far as I could see.)
>>>> The general direction lately seems to be to move away from sector
>>>> based block APIs. Also eg. the NFS code allows partial reads. (It
>>>> does, however, have an old patch (c2eb918e3) dedicated to aligning
>>>> sizes to 512 byte boundaries for file creation for compatibility with
>>>> other parts of qemu like qcow2. This already happens in glusterfs,
>>>> though, but if you move a file from a different storage over to
>>>> glusterfs you may end up with a qcow2 file with eg. the L1 table in
>>>> the last 80 bytes of the file aligned to _begin_ at a 512 boundary,
>>>> but not _end_ at one.)
> 
> Hm, does this really happen? I always thought that the file size of
> qcow2 images is aligned to the cluster size. If it isn't, maybe we
> should fix that.

Yes, it absolutely happens all the time!  In fact, it is often the case
that our images are not even sector-aligned, let alone cluster-aligned:

$ qemu-img create -f qcow2 file 1M
Formatting 'file', fmt=qcow2 size=1048576 encryption=off
cluster_size=65536 lazy_refcounts=off refcount_bits=16
$ ls -l file
-rw-r--r--. 1 eblake eblake 196616 Dec  6 08:58 file
$ echo $((384*512))
196608
$ echo $((385*512))
197120

196616 is 8 bytes more than 384 sectors (196608).

This is because the qcow2 format explicitly requires that if the L1 or
L2 table is at the end of the file (which is what happens by default in
qemu-img create), any entries not physically present in the table
(because the file ends early) are read as 0.

>>> Would it be better to switch to byte-based interfaces rather than
>>> continue to force gluster interaction in 512-byte sector chunks,
>>> since gluster can obviously store files that are not 512-aligned?
> 
> The gluster I/O functions are byte-based anyway, and the driver already
> implements .bdrv_co_readv, so going to .bdrv_co_preadv should be
> trivial. Probably the best solution here indeed.
> 

Are we still hoping to fix this bug (the gluster behavior on unaligned
files, not the larger [non-?]bug of qemu-img create giving unaligned
images in the first place) for 2.8, or are we pushing our luck here,
where the correct patch should be 2.9 material and cc qemu-stable?
Kevin Wolf Dec. 6, 2016, 3:09 p.m. UTC | #5
Am 06.12.2016 um 16:02 hat Eric Blake geschrieben:
> On 12/06/2016 03:59 AM, Kevin Wolf wrote:
> 
> >>>> I'm not sure what the original rationale was to treat both partial
> >>>> reads as well as writes as an I/O error. (Seems to have happened
> >>>> from original glusterfs v1 to v2 series with a note but no reasoning
> >>>> for the read side as far as I could see.)
> >>>> The general direction lately seems to be to move away from sector
> >>>> based block APIs. Also eg. the NFS code allows partial reads. (It
> >>>> does, however, have an old patch (c2eb918e3) dedicated to aligning
> >>>> sizes to 512 byte boundaries for file creation for compatibility with
> >>>> other parts of qemu like qcow2. This already happens in glusterfs,
> >>>> though, but if you move a file from a different storage over to
> >>>> glusterfs you may end up with a qcow2 file with eg. the L1 table in
> >>>> the last 80 bytes of the file aligned to _begin_ at a 512 boundary,
> >>>> but not _end_ at one.)
> > 
> > Hm, does this really happen? I always thought that the file size of
> > qcow2 images is aligned to the cluster size. If it isn't, maybe we
> > should fix that.
> 
> Yes, it absolutely happens all the time!  In fact, it is often the case
> that our images are not even sector-aligned, let alone cluster-aligned:
> 
> $ qemu-img create -f qcow2 file 1M
> Formatting 'file', fmt=qcow2 size=1048576 encryption=off
> cluster_size=65536 lazy_refcounts=off refcount_bits=16
> $ ls -l file
> -rw-r--r--. 1 eblake eblake 196616 Dec  6 08:58 file
> $ echo $((384*512))
> 196608
> $ echo $((385*512))
> 197120
> 
> 196616 is 8 bytes more than 384 sectors (196608).
> 
> This is because the qcow2 format explicitly requires that if the L1 or
> L2 table is at the end of the file (which is what happens by default in
> qemu-img create), any entries not physically present in the table
> (because the file ends early) are read as 0.
> 
> >>> Would it be better to switch to byte-based interfaces rather than
> >>> continue to force gluster interaction in 512-byte sector chunks,
> >>> since gluster can obviously store files that are not 512-aligned?
> > 
> > The gluster I/O functions are byte-based anyway, and the driver already
> > implements .bdrv_co_readv, so going to .bdrv_co_preadv should be
> > trivial. Probably the best solution here indeed.
> > 
> 
> Are we still hoping to fix this bug (the gluster behavior on unaligned
> files, not the larger [non-?]bug of qemu-img create giving unaligned
> images in the first place) for 2.8, or are we pushing our luck here,
> where the correct patch should be 2.9 material and cc qemu-stable?

I think we're too late for 2.8.

Kevin

Patch

diff --git a/block/gluster.c b/block/gluster.c
index 891c13b..3db0bf8 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -41,6 +41,7 @@  typedef struct GlusterAIOCB {
     int ret;
     Coroutine *coroutine;
     AioContext *aio_context;
+    bool is_write;
 } GlusterAIOCB;
 
 typedef struct BDRVGlusterState {
@@ -716,8 +717,10 @@  static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
         acb->ret = 0; /* Success */
     } else if (ret < 0) {
         acb->ret = -errno; /* Read/Write failed */
+    } else if (acb->is_write) {
+        acb->ret = -EIO; /* Partial write - fail it */
     } else {
-        acb->ret = -EIO; /* Partial read/write - fail it */
+        acb->ret = 0; /* Success */
     }
 
     aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb);
@@ -965,6 +968,7 @@  static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
     acb.ret = 0;
     acb.coroutine = qemu_coroutine_self();
     acb.aio_context = bdrv_get_aio_context(bs);
+    acb.is_write = true;
 
     ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
     if (ret < 0) {
@@ -1087,9 +1091,11 @@  static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
     acb.aio_context = bdrv_get_aio_context(bs);
 
     if (write) {
+        acb.is_write = true;
         ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                  gluster_finish_aiocb, &acb);
     } else {
+        acb.is_write = false;
         ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                 gluster_finish_aiocb, &acb);
     }
@@ -1153,6 +1159,7 @@  static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
     acb.ret = 0;
     acb.coroutine = qemu_coroutine_self();
     acb.aio_context = bdrv_get_aio_context(bs);
+    acb.is_write = true;
 
     ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
     if (ret < 0) {
@@ -1199,6 +1206,7 @@  static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs,
     acb.ret = 0;
     acb.coroutine = qemu_coroutine_self();
     acb.aio_context = bdrv_get_aio_context(bs);
+    acb.is_write = true;
 
     ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
     if (ret < 0) {