[v4,1/3] CIFS: Add support for direct I/O read

Message ID 20181031221311.2596-1-longli@linuxonhyperv.com
State New
Headers show
Series
  • [v4,1/3] CIFS: Add support for direct I/O read
Related show

Commit Message

Long Li Oct. 31, 2018, 10:13 p.m.
From: Long Li <longli@microsoft.com>

With direct I/O read, we transfer the data directly from transport layer to
the user data buffer.

Change in v3: add support for kernel AIO

Change in v4:
Refactor common read code to __cifs_readv for direct and non-direct I/O.
Retry on direct I/O failure.

Signed-off-by: Long Li <longli@microsoft.com>
---
 fs/cifs/cifsfs.h   |   1 +
 fs/cifs/cifsglob.h |   5 ++
 fs/cifs/file.c     | 219 +++++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 186 insertions(+), 39 deletions(-)

Comments

Pavel Shilovsky Nov. 17, 2018, 12:16 a.m. | #1
Hi Long,

Please find my comments below.


ср, 31 окт. 2018 г. в 15:14, Long Li <longli@linuxonhyperv.com>:
>
> From: Long Li <longli@microsoft.com>
>
> With direct I/O read, we transfer the data directly from transport layer to
> the user data buffer.
>
> Change in v3: add support for kernel AIO
>
> Change in v4:
> Refactor common read code to __cifs_readv for direct and non-direct I/O.
> Retry on direct I/O failure.
>
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
>  fs/cifs/cifsfs.h   |   1 +
>  fs/cifs/cifsglob.h |   5 ++
>  fs/cifs/file.c     | 219 +++++++++++++++++++++++++++++++++++++++++++----------
>  3 files changed, 186 insertions(+), 39 deletions(-)
>
> diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
> index 5f02318..7fba9aa 100644
> --- a/fs/cifs/cifsfs.h
> +++ b/fs/cifs/cifsfs.h
> @@ -102,6 +102,7 @@ extern int cifs_open(struct inode *inode, struct file *file);
>  extern int cifs_close(struct inode *inode, struct file *file);
>  extern int cifs_closedir(struct inode *inode, struct file *file);
>  extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
> +extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
>  extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
>  extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
>  extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
> diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
> index 7f62c98..52248dd 100644
> --- a/fs/cifs/cifsglob.h
> +++ b/fs/cifs/cifsglob.h
> @@ -1146,6 +1146,11 @@ struct cifs_aio_ctx {
>         unsigned int            len;
>         unsigned int            total_len;
>         bool                    should_dirty;
> +       /*
> +        * Indicates if this aio_ctx is for direct_io,
> +        * If yes, iter is a copy of the user passed iov_iter
> +        */
> +       bool                    direct_io;
>  };
>
>  struct cifs_readdata;
> diff --git a/fs/cifs/file.c b/fs/cifs/file.c
> index 87eece6..daab878 100644
> --- a/fs/cifs/file.c
> +++ b/fs/cifs/file.c
> @@ -2965,7 +2965,6 @@ cifs_uncached_readdata_release(struct kref *refcount)
>         kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
>         for (i = 0; i < rdata->nr_pages; i++) {
>                 put_page(rdata->pages[i]);
> -               rdata->pages[i] = NULL;
>         }
>         cifs_readdata_release(refcount);
>  }
> @@ -3092,6 +3091,63 @@ cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
>         return uncached_fill_pages(server, rdata, iter, iter->count);
>  }
>
> +static int cifs_resend_rdata(struct cifs_readdata *rdata,
> +                     struct list_head *rdata_list,
> +                     struct cifs_aio_ctx *ctx)
> +{
> +       int wait_retry = 0;
> +       unsigned int rsize, credits;
> +       int rc;
> +       struct TCP_Server_Info *server = tlink_tcon(rdata->cfile->tlink)->ses->server;
> +
> +       /*
> +        * Try to resend this rdata, waiting for credits up to 3 seconds.
> +        * Note: we are attempting to resend the whole rdata not in segments
> +        */
> +       do {
> +               rc = server->ops->wait_mtu_credits(server, rdata->bytes,
> +                                               &rsize, &credits);
> +
> +               if (rc)
> +                       break;
> +
> +               if (rsize < rdata->bytes) {
> +                       add_credits_and_wake_if(server, credits, 0);
> +                       msleep(1000);
> +                       wait_retry++;
> +               }
> +       } while (rsize < rdata->bytes && wait_retry < 3);
> +
> +       /*
> +        * If we can't find enough credits to send this rdata
> +        * release the rdata and return failure, this will pass
> +        * whatever I/O amount we have finished to VFS.
> +        */
> +       if (rsize < rdata->bytes) {
> +               rc = -EBUSY;

We don't have enough credits and return EBUSY here...

> +               goto out;
> +       }
> +
> +       rc = -EAGAIN;
> +       while (rc == -EAGAIN)
> +               if (!rdata->cfile->invalidHandle ||
> +                   !(rc = cifs_reopen_file(rdata->cfile, true)))
> +                       rc = server->ops->async_readv(rdata);
> +
> +       if (!rc) {
> +               /* Add to aio pending list */
> +               list_add_tail(&rdata->list, rdata_list);
> +               return 0;
> +       }
> +
> +       add_credits_and_wake_if(server, rdata->credits, 0);
> +out:
> +       kref_put(&rdata->refcount,
> +               cifs_uncached_readdata_release);
> +
> +       return rc;
> +}
> +
>  static int
>  cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
>                      struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
> @@ -3103,6 +3159,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
>         int rc;
>         pid_t pid;
>         struct TCP_Server_Info *server;
> +       struct page **pagevec;
> +       size_t start;
> +       struct iov_iter direct_iov = ctx->iter;
>
>         server = tlink_tcon(open_file->tlink)->ses->server;
>
> @@ -3111,6 +3170,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
>         else
>                 pid = current->tgid;
>
> +       if (ctx->direct_io)
> +               iov_iter_advance(&direct_iov, offset - ctx->pos);
> +
>         do {
>                 rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
>                                                    &rsize, &credits);
> @@ -3118,20 +3180,56 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
>                         break;
>
>                 cur_len = min_t(const size_t, len, rsize);
> -               npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
>
> -               /* allocate a readdata struct */
> -               rdata = cifs_readdata_alloc(npages,
> +               if (ctx->direct_io) {
> +
> +                       cur_len = iov_iter_get_pages_alloc(
> +                                       &direct_iov, &pagevec,
> +                                       cur_len, &start);
> +                       if (cur_len < 0) {
> +                               cifs_dbg(VFS,
> +                                       "couldn't get user pages (cur_len=%zd)"
> +                                       " iter type %d"
> +                                       " iov_offset %zd count %zd\n",
> +                                       cur_len, direct_iov.type, direct_iov.iov_offset,
> +                                       direct_iov.count);
> +                               dump_stack();
> +                               break;
> +                       }
> +                       iov_iter_advance(&direct_iov, cur_len);
> +
> +                       rdata = cifs_readdata_direct_alloc(
> +                                       pagevec, cifs_uncached_readv_complete);
> +                       if (!rdata) {
> +                               add_credits_and_wake_if(server, credits, 0);
> +                               rc = -ENOMEM;
> +                               break;
> +                       }
> +
> +                       npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE;
> +                       rdata->page_offset = start;
> +                       rdata->tailsz = npages > 1 ?
> +                               cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE :
> +                               cur_len;
> +
> +               } else {
> +
> +                       npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
> +                       /* allocate a readdata struct */
> +                       rdata = cifs_readdata_alloc(npages,
>                                             cifs_uncached_readv_complete);
> -               if (!rdata) {
> -                       add_credits_and_wake_if(server, credits, 0);
> -                       rc = -ENOMEM;
> -                       break;
> -               }
> +                       if (!rdata) {
> +                               add_credits_and_wake_if(server, credits, 0);
> +                               rc = -ENOMEM;
> +                               break;
> +                       }
>
> -               rc = cifs_read_allocate_pages(rdata, npages);
> -               if (rc)
> -                       goto error;
> +                       rc = cifs_read_allocate_pages(rdata, npages);
> +                       if (rc)
> +                               goto error;
> +
> +                       rdata->tailsz = PAGE_SIZE;
> +               }
>
>                 rdata->cfile = cifsFileInfo_get(open_file);
>                 rdata->nr_pages = npages;
> @@ -3139,7 +3237,6 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
>                 rdata->bytes = cur_len;
>                 rdata->pid = pid;
>                 rdata->pagesz = PAGE_SIZE;
> -               rdata->tailsz = PAGE_SIZE;
>                 rdata->read_into_pages = cifs_uncached_read_into_pages;
>                 rdata->copy_into_pages = cifs_uncached_copy_into_pages;
>                 rdata->credits = credits;
> @@ -3153,9 +3250,11 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
>                 if (rc) {
>                         add_credits_and_wake_if(server, rdata->credits, 0);
>                         kref_put(&rdata->refcount,
> -                                cifs_uncached_readdata_release);
> -                       if (rc == -EAGAIN)
> +                               cifs_uncached_readdata_release);
> +                       if (rc == -EAGAIN) {
> +                               iov_iter_revert(&direct_iov, cur_len);
>                                 continue;
> +                       }
>                         break;
>                 }
>
> @@ -3211,45 +3310,62 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx)
>                                  * reading.
>                                  */
>                                 if (got_bytes && got_bytes < rdata->bytes) {
> -                                       rc = cifs_readdata_to_iov(rdata, to);
> +                                       rc = 0;
> +                                       if (!ctx->direct_io)
> +                                               rc = cifs_readdata_to_iov(rdata, to);
>                                         if (rc) {
>                                                 kref_put(&rdata->refcount,
> -                                               cifs_uncached_readdata_release);
> +                                                       cifs_uncached_readdata_release);
>                                                 continue;
>                                         }
>                                 }
>
> -                               rc = cifs_send_async_read(
> +                               if (ctx->direct_io) {
> +                                       /*
> +                                        * Re-use rdata as this is a
> +                                        * direct I/O
> +                                        */
> +                                       rc = cifs_resend_rdata(
> +                                               rdata,
> +                                               &tmp_list, ctx);

...so we set rc to -EBUSY...


> +                               } else {
> +                                       rc = cifs_send_async_read(
>                                                 rdata->offset + got_bytes,
>                                                 rdata->bytes - got_bytes,
>                                                 rdata->cfile, cifs_sb,
>                                                 &tmp_list, ctx);
>
> +                                       kref_put(&rdata->refcount,
> +                                               cifs_uncached_readdata_release);
> +                               }
> +
>                                 list_splice(&tmp_list, &ctx->list);
>
> -                               kref_put(&rdata->refcount,
> -                                        cifs_uncached_readdata_release);
>                                 goto again;
>                         } else if (rdata->result)
>                                 rc = rdata->result;
> -                       else
> +                       else if (!ctx->direct_io)
>                                 rc = cifs_readdata_to_iov(rdata, to);
>
>                         /* if there was a short read -- discard anything left */
>                         if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
>                                 rc = -ENODATA;
> +
> +                       ctx->total_len += rdata->got_bytes;
>                 }
>                 list_del_init(&rdata->list);
>                 kref_put(&rdata->refcount, cifs_uncached_readdata_release);
>         }
>
> -       for (i = 0; i < ctx->npages; i++) {
> -               if (ctx->should_dirty)
> -                       set_page_dirty(ctx->bv[i].bv_page);
> -               put_page(ctx->bv[i].bv_page);
> -       }
> +       if (!ctx->direct_io) {
> +               for (i = 0; i < ctx->npages; i++) {
> +                       if (ctx->should_dirty)
> +                               set_page_dirty(ctx->bv[i].bv_page);
> +                       put_page(ctx->bv[i].bv_page);
> +               }
>
> -       ctx->total_len = ctx->len - iov_iter_count(to);
> +               ctx->total_len = ctx->len - iov_iter_count(to);
> +       }
>
>         cifs_stats_bytes_read(tcon, ctx->total_len);
>
> @@ -3267,18 +3383,27 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx)
>                 complete(&ctx->done);

and then ctx->rc is set to -EBUSY too.

>  }
>
> -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
> +ssize_t __cifs_readv(struct kiocb *iocb, struct iov_iter *to, bool direct)
>  {
> -       struct file *file = iocb->ki_filp;
> -       ssize_t rc;
>         size_t len;
> -       ssize_t total_read = 0;
> -       loff_t offset = iocb->ki_pos;
> +       struct file *file = iocb->ki_filp;
>         struct cifs_sb_info *cifs_sb;
> -       struct cifs_tcon *tcon;
>         struct cifsFileInfo *cfile;
> +       struct cifs_tcon *tcon;
> +       ssize_t rc, total_read = 0;
> +       loff_t offset = iocb->ki_pos;
>         struct cifs_aio_ctx *ctx;
>
> +       /*
> +        * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC,
> +        * fall back to data copy read path
> +        * this could be improved by getting pages directly in ITER_KVEC
> +        */
> +       if (direct && to->type & ITER_KVEC) {
> +               cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n");
> +               direct = false;
> +       }
> +
>         len = iov_iter_count(to);
>         if (!len)
>                 return 0;
> @@ -3305,14 +3430,20 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
>         if (to->type == ITER_IOVEC)
>                 ctx->should_dirty = true;
>
> -       rc = setup_aio_ctx_iter(ctx, to, READ);
> -       if (rc) {
> -               kref_put(&ctx->refcount, cifs_aio_ctx_release);
> -               return rc;
> +       if (direct) {
> +               ctx->pos = offset;
> +               ctx->direct_io = true;
> +               ctx->iter = *to;
> +               ctx->len = len;
> +       } else {
> +               rc = setup_aio_ctx_iter(ctx, to, READ);
> +               if (rc) {
> +                       kref_put(&ctx->refcount, cifs_aio_ctx_release);
> +                       return rc;
> +               }
> +               len = ctx->len;
>         }
>
> -       len = ctx->len;
> -
>         /* grab a lock here due to read response handlers can access ctx */
>         mutex_lock(&ctx->aio_mutex);
>
> @@ -3354,6 +3485,16 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
>         return rc;

In case of a blocking read we will return -EBUSY to the caller, but
the read(2) man page doesn't mention the possibility of returning this
error code. Is it mapped somehow by the VFS, or is there another
consideration? The same question applies to the write patch as well.

>  }
>
> +ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to)
> +{
> +       return __cifs_readv(iocb, to, true);
> +}
> +
> +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
> +{
> +       return __cifs_readv(iocb, to, false);
> +}
> +
>  ssize_t
>  cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
>  {
> --
> 2.7.4
>

--
Best regards,
Pavel Shilovsky
Long Li Nov. 28, 2018, 11:43 p.m. | #2
> Subject: Re: [Patch v4 1/3] CIFS: Add support for direct I/O read
> 
> Hi Long,
> 
> Please find my comments below.
> 
> 
> ср, 31 окт. 2018 г. в 15:14, Long Li <longli@linuxonhyperv.com>:
> >
> > From: Long Li <longli@microsoft.com>
> >
> > With direct I/O read, we transfer the data directly from transport
> > layer to the user data buffer.
> >
> > Change in v3: add support for kernel AIO
> >
> > Change in v4:
> > Refactor common read code to __cifs_readv for direct and non-direct I/O.
> > Retry on direct I/O failure.
> >
> > Signed-off-by: Long Li <longli@microsoft.com>
> > ---
> >  fs/cifs/cifsfs.h   |   1 +
> >  fs/cifs/cifsglob.h |   5 ++
> >  fs/cifs/file.c     | 219 +++++++++++++++++++++++++++++++++++++++++++--
> --------
> >  3 files changed, 186 insertions(+), 39 deletions(-)
> >
> > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > 5f02318..7fba9aa 100644
> > --- a/fs/cifs/cifsfs.h
> > +++ b/fs/cifs/cifsfs.h
> > @@ -102,6 +102,7 @@ extern int cifs_open(struct inode *inode, struct
> > file *file);  extern int cifs_close(struct inode *inode, struct file
> > *file);  extern int cifs_closedir(struct inode *inode, struct file
> > *file);  extern ssize_t cifs_user_readv(struct kiocb *iocb, struct
> > iov_iter *to);
> > +extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter
> > +*to);
> >  extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter
> > *to);  extern ssize_t cifs_user_writev(struct kiocb *iocb, struct
> > iov_iter *from);  extern ssize_t cifs_strict_writev(struct kiocb
> > *iocb, struct iov_iter *from); diff --git a/fs/cifs/cifsglob.h
> > b/fs/cifs/cifsglob.h index 7f62c98..52248dd 100644
> > --- a/fs/cifs/cifsglob.h
> > +++ b/fs/cifs/cifsglob.h
> > @@ -1146,6 +1146,11 @@ struct cifs_aio_ctx {
> >         unsigned int            len;
> >         unsigned int            total_len;
> >         bool                    should_dirty;
> > +       /*
> > +        * Indicates if this aio_ctx is for direct_io,
> > +        * If yes, iter is a copy of the user passed iov_iter
> > +        */
> > +       bool                    direct_io;
> >  };
> >
> >  struct cifs_readdata;
> > diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 87eece6..daab878
> > 100644
> > --- a/fs/cifs/file.c
> > +++ b/fs/cifs/file.c
> > @@ -2965,7 +2965,6 @@ cifs_uncached_readdata_release(struct kref
> *refcount)
> >         kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
> >         for (i = 0; i < rdata->nr_pages; i++) {
> >                 put_page(rdata->pages[i]);
> > -               rdata->pages[i] = NULL;
> >         }
> >         cifs_readdata_release(refcount);  } @@ -3092,6 +3091,63 @@
> > cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
> >         return uncached_fill_pages(server, rdata, iter, iter->count);
> > }
> >
> > +static int cifs_resend_rdata(struct cifs_readdata *rdata,
> > +                     struct list_head *rdata_list,
> > +                     struct cifs_aio_ctx *ctx) {
> > +       int wait_retry = 0;
> > +       unsigned int rsize, credits;
> > +       int rc;
> > +       struct TCP_Server_Info *server =
> > +tlink_tcon(rdata->cfile->tlink)->ses->server;
> > +
> > +       /*
> > +        * Try to resend this rdata, waiting for credits up to 3 seconds.
> > +        * Note: we are attempting to resend the whole rdata not in segments
> > +        */
> > +       do {
> > +               rc = server->ops->wait_mtu_credits(server, rdata->bytes,
> > +                                               &rsize, &credits);
> > +
> > +               if (rc)
> > +                       break;
> > +
> > +               if (rsize < rdata->bytes) {
> > +                       add_credits_and_wake_if(server, credits, 0);
> > +                       msleep(1000);
> > +                       wait_retry++;
> > +               }
> > +       } while (rsize < rdata->bytes && wait_retry < 3);
> > +
> > +       /*
> > +        * If we can't find enough credits to send this rdata
> > +        * release the rdata and return failure, this will pass
> > +        * whatever I/O amount we have finished to VFS.
> > +        */
> > +       if (rsize < rdata->bytes) {
> > +               rc = -EBUSY;
> 
> We don't have enough credits and return EBUSY here...
> 
> > +               goto out;
> > +       }
> > +
> > +       rc = -EAGAIN;
> > +       while (rc == -EAGAIN)
> > +               if (!rdata->cfile->invalidHandle ||
> > +                   !(rc = cifs_reopen_file(rdata->cfile, true)))
> > +                       rc = server->ops->async_readv(rdata);
> > +
> > +       if (!rc) {
> > +               /* Add to aio pending list */
> > +               list_add_tail(&rdata->list, rdata_list);
> > +               return 0;
> > +       }
> > +
> > +       add_credits_and_wake_if(server, rdata->credits, 0);
> > +out:
> > +       kref_put(&rdata->refcount,
> > +               cifs_uncached_readdata_release);
> > +
> > +       return rc;
> > +}
> > +
> >  static int
> >  cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
> >                      struct cifs_sb_info *cifs_sb, struct list_head
> > *rdata_list, @@ -3103,6 +3159,9 @@ cifs_send_async_read(loff_t offset,
> size_t len, struct cifsFileInfo *open_file,
> >         int rc;
> >         pid_t pid;
> >         struct TCP_Server_Info *server;
> > +       struct page **pagevec;
> > +       size_t start;
> > +       struct iov_iter direct_iov = ctx->iter;
> >
> >         server = tlink_tcon(open_file->tlink)->ses->server;
> >
> > @@ -3111,6 +3170,9 @@ cifs_send_async_read(loff_t offset, size_t len,
> struct cifsFileInfo *open_file,
> >         else
> >                 pid = current->tgid;
> >
> > +       if (ctx->direct_io)
> > +               iov_iter_advance(&direct_iov, offset - ctx->pos);
> > +
> >         do {
> >                 rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
> >                                                    &rsize, &credits);
> > @@ -3118,20 +3180,56 @@ cifs_send_async_read(loff_t offset, size_t len,
> struct cifsFileInfo *open_file,
> >                         break;
> >
> >                 cur_len = min_t(const size_t, len, rsize);
> > -               npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
> >
> > -               /* allocate a readdata struct */
> > -               rdata = cifs_readdata_alloc(npages,
> > +               if (ctx->direct_io) {
> > +
> > +                       cur_len = iov_iter_get_pages_alloc(
> > +                                       &direct_iov, &pagevec,
> > +                                       cur_len, &start);
> > +                       if (cur_len < 0) {
> > +                               cifs_dbg(VFS,
> > +                                       "couldn't get user pages (cur_len=%zd)"
> > +                                       " iter type %d"
> > +                                       " iov_offset %zd count %zd\n",
> > +                                       cur_len, direct_iov.type, direct_iov.iov_offset,
> > +                                       direct_iov.count);
> > +                               dump_stack();
> > +                               break;
> > +                       }
> > +                       iov_iter_advance(&direct_iov, cur_len);
> > +
> > +                       rdata = cifs_readdata_direct_alloc(
> > +                                       pagevec, cifs_uncached_readv_complete);
> > +                       if (!rdata) {
> > +                               add_credits_and_wake_if(server, credits, 0);
> > +                               rc = -ENOMEM;
> > +                               break;
> > +                       }
> > +
> > +                       npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE;
> > +                       rdata->page_offset = start;
> > +                       rdata->tailsz = npages > 1 ?
> > +                               cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE :
> > +                               cur_len;
> > +
> > +               } else {
> > +
> > +                       npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
> > +                       /* allocate a readdata struct */
> > +                       rdata = cifs_readdata_alloc(npages,
> >                                             cifs_uncached_readv_complete);
> > -               if (!rdata) {
> > -                       add_credits_and_wake_if(server, credits, 0);
> > -                       rc = -ENOMEM;
> > -                       break;
> > -               }
> > +                       if (!rdata) {
> > +                               add_credits_and_wake_if(server, credits, 0);
> > +                               rc = -ENOMEM;
> > +                               break;
> > +                       }
> >
> > -               rc = cifs_read_allocate_pages(rdata, npages);
> > -               if (rc)
> > -                       goto error;
> > +                       rc = cifs_read_allocate_pages(rdata, npages);
> > +                       if (rc)
> > +                               goto error;
> > +
> > +                       rdata->tailsz = PAGE_SIZE;
> > +               }
> >
> >                 rdata->cfile = cifsFileInfo_get(open_file);
> >                 rdata->nr_pages = npages; @@ -3139,7 +3237,6 @@
> > cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
> >                 rdata->bytes = cur_len;
> >                 rdata->pid = pid;
> >                 rdata->pagesz = PAGE_SIZE;
> > -               rdata->tailsz = PAGE_SIZE;
> >                 rdata->read_into_pages = cifs_uncached_read_into_pages;
> >                 rdata->copy_into_pages = cifs_uncached_copy_into_pages;
> >                 rdata->credits = credits; @@ -3153,9 +3250,11 @@
> > cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
> >                 if (rc) {
> >                         add_credits_and_wake_if(server, rdata->credits, 0);
> >                         kref_put(&rdata->refcount,
> > -                                cifs_uncached_readdata_release);
> > -                       if (rc == -EAGAIN)
> > +                               cifs_uncached_readdata_release);
> > +                       if (rc == -EAGAIN) {
> > +                               iov_iter_revert(&direct_iov, cur_len);
> >                                 continue;
> > +                       }
> >                         break;
> >                 }
> >
> > @@ -3211,45 +3310,62 @@ collect_uncached_read_data(struct
> cifs_aio_ctx *ctx)
> >                                  * reading.
> >                                  */
> >                                 if (got_bytes && got_bytes < rdata->bytes) {
> > -                                       rc = cifs_readdata_to_iov(rdata, to);
> > +                                       rc = 0;
> > +                                       if (!ctx->direct_io)
> > +                                               rc =
> > + cifs_readdata_to_iov(rdata, to);
> >                                         if (rc) {
> >                                                 kref_put(&rdata->refcount,
> > -                                               cifs_uncached_readdata_release);
> > +
> > + cifs_uncached_readdata_release);
> >                                                 continue;
> >                                         }
> >                                 }
> >
> > -                               rc = cifs_send_async_read(
> > +                               if (ctx->direct_io) {
> > +                                       /*
> > +                                        * Re-use rdata as this is a
> > +                                        * direct I/O
> > +                                        */
> > +                                       rc = cifs_resend_rdata(
> > +                                               rdata,
> > +                                               &tmp_list, ctx);
> 
> ...so we set rc to -EBUSY...
> 
> 
> > +                               } else {
> > +                                       rc = cifs_send_async_read(
> >                                                 rdata->offset + got_bytes,
> >                                                 rdata->bytes - got_bytes,
> >                                                 rdata->cfile, cifs_sb,
> >                                                 &tmp_list, ctx);
> >
> > +                                       kref_put(&rdata->refcount,
> > +                                               cifs_uncached_readdata_release);
> > +                               }
> > +
> >                                 list_splice(&tmp_list, &ctx->list);
> >
> > -                               kref_put(&rdata->refcount,
> > -                                        cifs_uncached_readdata_release);
> >                                 goto again;
> >                         } else if (rdata->result)
> >                                 rc = rdata->result;
> > -                       else
> > +                       else if (!ctx->direct_io)
> >                                 rc = cifs_readdata_to_iov(rdata, to);
> >
> >                         /* if there was a short read -- discard anything left */
> >                         if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
> >                                 rc = -ENODATA;
> > +
> > +                       ctx->total_len += rdata->got_bytes;
> >                 }
> >                 list_del_init(&rdata->list);
> >                 kref_put(&rdata->refcount, cifs_uncached_readdata_release);
> >         }
> >
> > -       for (i = 0; i < ctx->npages; i++) {
> > -               if (ctx->should_dirty)
> > -                       set_page_dirty(ctx->bv[i].bv_page);
> > -               put_page(ctx->bv[i].bv_page);
> > -       }
> > +       if (!ctx->direct_io) {
> > +               for (i = 0; i < ctx->npages; i++) {
> > +                       if (ctx->should_dirty)
> > +                               set_page_dirty(ctx->bv[i].bv_page);
> > +                       put_page(ctx->bv[i].bv_page);
> > +               }
> >
> > -       ctx->total_len = ctx->len - iov_iter_count(to);
> > +               ctx->total_len = ctx->len - iov_iter_count(to);
> > +       }
> >
> >         cifs_stats_bytes_read(tcon, ctx->total_len);
> >
> > @@ -3267,18 +3383,27 @@ collect_uncached_read_data(struct
> cifs_aio_ctx *ctx)
> >                 complete(&ctx->done);
> 
> and then ctx->rc is set to -EBUSY too.
> 
> >  }
> >
> > -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
> > +ssize_t __cifs_readv(struct kiocb *iocb, struct iov_iter *to, bool
> > +direct)
> >  {
> > -       struct file *file = iocb->ki_filp;
> > -       ssize_t rc;
> >         size_t len;
> > -       ssize_t total_read = 0;
> > -       loff_t offset = iocb->ki_pos;
> > +       struct file *file = iocb->ki_filp;
> >         struct cifs_sb_info *cifs_sb;
> > -       struct cifs_tcon *tcon;
> >         struct cifsFileInfo *cfile;
> > +       struct cifs_tcon *tcon;
> > +       ssize_t rc, total_read = 0;
> > +       loff_t offset = iocb->ki_pos;
> >         struct cifs_aio_ctx *ctx;
> >
> > +       /*
> > +        * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC,
> > +        * fall back to data copy read path
> > +        * this could be improved by getting pages directly in ITER_KVEC
> > +        */
> > +       if (direct && to->type & ITER_KVEC) {
> > +               cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n");
> > +               direct = false;
> > +       }
> > +
> >         len = iov_iter_count(to);
> >         if (!len)
> >                 return 0;
> > @@ -3305,14 +3430,20 @@ ssize_t cifs_user_readv(struct kiocb *iocb,
> struct iov_iter *to)
> >         if (to->type == ITER_IOVEC)
> >                 ctx->should_dirty = true;
> >
> > -       rc = setup_aio_ctx_iter(ctx, to, READ);
> > -       if (rc) {
> > -               kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > -               return rc;
> > +       if (direct) {
> > +               ctx->pos = offset;
> > +               ctx->direct_io = true;
> > +               ctx->iter = *to;
> > +               ctx->len = len;
> > +       } else {
> > +               rc = setup_aio_ctx_iter(ctx, to, READ);
> > +               if (rc) {
> > +                       kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > +                       return rc;
> > +               }
> > +               len = ctx->len;
> >         }
> >
> > -       len = ctx->len;
> > -
> >         /* grab a lock here due to read response handlers can access ctx */
> >         mutex_lock(&ctx->aio_mutex);
> >
> > @@ -3354,6 +3485,16 @@ ssize_t cifs_user_readv(struct kiocb *iocb,
> struct iov_iter *to)
> >         return rc;
> 
> In case of a blocking read we will return -EBUSY to the caller but read man
> page doesn't say anything about a possibility to return this error code. Is it
> being mapped somehow by VFS or is there another consideration? The same
> question applies to the write patch as well.

Thanks Pavel. Yes, this is a bug.

In practice, the retry logic will rarely get hit. If it is hit, the server will normally give us far more credits for resending this whole packet.

How about just retrying forever (instead of trying 3 times) on this resend packet? This will change the code to the same behavior as before the direct I/O patches.

e.g.:

        /*
         * Try to resend this wdata, waiting for credits before sending
         * Note: we are attempting to resend the whole wdata not in segments
         */
        do {
                rc = server->ops->wait_mtu_credits(
                        server, wdata->bytes, &wsize, &credits);

                if (rc)
                        break;

                if (wsize < wdata->bytes) {
                        add_credits_and_wake_if(server, credits, 0);
                        msleep(1000);
                }
        } while (wsize < wdata->bytes);

> 
> >  }
> >
> > +ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to) {
> > +       return __cifs_readv(iocb, to, true); }
> > +
> > +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) {
> > +       return __cifs_readv(iocb, to, false); }
> > +
> >  ssize_t
> >  cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)  {
> > --
> > 2.7.4
> >
> 
> --
> Best regards,
> Pavel Shilovsky
Pavel Shilovsky Nov. 29, 2018, 6:54 p.m. | #3
ср, 28 нояб. 2018 г. в 15:43, Long Li <longli@microsoft.com>:
>
> > Subject: Re: [Patch v4 1/3] CIFS: Add support for direct I/O read
> >
> > Hi Long,
> >
> > Please find my comments below.
> >
> >
> > ср, 31 окт. 2018 г. в 15:14, Long Li <longli@linuxonhyperv.com>:
> > >
> > > From: Long Li <longli@microsoft.com>
> > >
> > > With direct I/O read, we transfer the data directly from transport
> > > layer to the user data buffer.
> > >
> > > Change in v3: add support for kernel AIO
> > >
> > > Change in v4:
> > > Refactor common read code to __cifs_readv for direct and non-direct I/O.
> > > Retry on direct I/O failure.
> > >
> > > Signed-off-by: Long Li <longli@microsoft.com>
> > > ---
> > >  fs/cifs/cifsfs.h   |   1 +
> > >  fs/cifs/cifsglob.h |   5 ++
> > >  fs/cifs/file.c     | 219 +++++++++++++++++++++++++++++++++++++++++++--
> > --------
> > >  3 files changed, 186 insertions(+), 39 deletions(-)
> > >
> > > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > > 5f02318..7fba9aa 100644
> > > --- a/fs/cifs/cifsfs.h
> > > +++ b/fs/cifs/cifsfs.h
> > > @@ -102,6 +102,7 @@ extern int cifs_open(struct inode *inode, struct
> > > file *file);  extern int cifs_close(struct inode *inode, struct file
> > > *file);  extern int cifs_closedir(struct inode *inode, struct file
> > > *file);  extern ssize_t cifs_user_readv(struct kiocb *iocb, struct
> > > iov_iter *to);
> > > +extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter
> > > +*to);
> > >  extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter
> > > *to);  extern ssize_t cifs_user_writev(struct kiocb *iocb, struct
> > > iov_iter *from);  extern ssize_t cifs_strict_writev(struct kiocb
> > > *iocb, struct iov_iter *from); diff --git a/fs/cifs/cifsglob.h
> > > b/fs/cifs/cifsglob.h index 7f62c98..52248dd 100644
> > > --- a/fs/cifs/cifsglob.h
> > > +++ b/fs/cifs/cifsglob.h
> > > @@ -1146,6 +1146,11 @@ struct cifs_aio_ctx {
> > >         unsigned int            len;
> > >         unsigned int            total_len;
> > >         bool                    should_dirty;
> > > +       /*
> > > +        * Indicates if this aio_ctx is for direct_io,
> > > +        * If yes, iter is a copy of the user passed iov_iter
> > > +        */
> > > +       bool                    direct_io;
> > >  };
> > >
> > >  struct cifs_readdata;
> > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 87eece6..daab878
> > > 100644
> > > --- a/fs/cifs/file.c
> > > +++ b/fs/cifs/file.c
> > > @@ -2965,7 +2965,6 @@ cifs_uncached_readdata_release(struct kref
> > *refcount)
> > >         kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
> > >         for (i = 0; i < rdata->nr_pages; i++) {
> > >                 put_page(rdata->pages[i]);
> > > -               rdata->pages[i] = NULL;
> > >         }
> > >         cifs_readdata_release(refcount);  } @@ -3092,6 +3091,63 @@
> > > cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
> > >         return uncached_fill_pages(server, rdata, iter, iter->count);
> > > }
> > >
> > > +static int cifs_resend_rdata(struct cifs_readdata *rdata,
> > > +                     struct list_head *rdata_list,
> > > +                     struct cifs_aio_ctx *ctx) {
> > > +       int wait_retry = 0;
> > > +       unsigned int rsize, credits;
> > > +       int rc;
> > > +       struct TCP_Server_Info *server =
> > > +tlink_tcon(rdata->cfile->tlink)->ses->server;
> > > +
> > > +       /*
> > > +        * Try to resend this rdata, waiting for credits up to 3 seconds.
> > > +        * Note: we are attempting to resend the whole rdata not in segments
> > > +        */
> > > +       do {
> > > +               rc = server->ops->wait_mtu_credits(server, rdata->bytes,
> > > +                                               &rsize, &credits);
> > > +
> > > +               if (rc)
> > > +                       break;
> > > +
> > > +               if (rsize < rdata->bytes) {
> > > +                       add_credits_and_wake_if(server, credits, 0);
> > > +                       msleep(1000);
> > > +                       wait_retry++;
> > > +               }
> > > +       } while (rsize < rdata->bytes && wait_retry < 3);
> > > +
> > > +       /*
> > > +        * If we can't find enough credits to send this rdata
> > > +        * release the rdata and return failure, this will pass
> > > +        * whatever I/O amount we have finished to VFS.
> > > +        */
> > > +       if (rsize < rdata->bytes) {
> > > +               rc = -EBUSY;
> >
> > We don't have enough credits and return EBUSY here...
> >
> > > +               goto out;
> > > +       }
> > > +
> > > +       rc = -EAGAIN;
> > > +       while (rc == -EAGAIN)
> > > +               if (!rdata->cfile->invalidHandle ||
> > > +                   !(rc = cifs_reopen_file(rdata->cfile, true)))
> > > +                       rc = server->ops->async_readv(rdata);
> > > +
> > > +       if (!rc) {
> > > +               /* Add to aio pending list */
> > > +               list_add_tail(&rdata->list, rdata_list);
> > > +               return 0;
> > > +       }
> > > +
> > > +       add_credits_and_wake_if(server, rdata->credits, 0);
> > > +out:
> > > +       kref_put(&rdata->refcount,
> > > +               cifs_uncached_readdata_release);
> > > +
> > > +       return rc;
> > > +}
> > > +
> > >  static int
> > >  cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
> > >                      struct cifs_sb_info *cifs_sb, struct list_head
> > > *rdata_list, @@ -3103,6 +3159,9 @@ cifs_send_async_read(loff_t offset,
> > size_t len, struct cifsFileInfo *open_file,
> > >         int rc;
> > >         pid_t pid;
> > >         struct TCP_Server_Info *server;
> > > +       struct page **pagevec;
> > > +       size_t start;
> > > +       struct iov_iter direct_iov = ctx->iter;
> > >
> > >         server = tlink_tcon(open_file->tlink)->ses->server;
> > >
> > > @@ -3111,6 +3170,9 @@ cifs_send_async_read(loff_t offset, size_t len,
> > struct cifsFileInfo *open_file,
> > >         else
> > >                 pid = current->tgid;
> > >
> > > +       if (ctx->direct_io)
> > > +               iov_iter_advance(&direct_iov, offset - ctx->pos);
> > > +
> > >         do {
> > >                 rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
> > >                                                    &rsize, &credits);
> > > @@ -3118,20 +3180,56 @@ cifs_send_async_read(loff_t offset, size_t len,
> > struct cifsFileInfo *open_file,
> > >                         break;
> > >
> > >                 cur_len = min_t(const size_t, len, rsize);
> > > -               npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
> > >
> > > -               /* allocate a readdata struct */
> > > -               rdata = cifs_readdata_alloc(npages,
> > > +               if (ctx->direct_io) {
> > > +
> > > +                       cur_len = iov_iter_get_pages_alloc(
> > > +                                       &direct_iov, &pagevec,
> > > +                                       cur_len, &start);
> > > +                       if (cur_len < 0) {
> > > +                               cifs_dbg(VFS,
> > > +                                       "couldn't get user pages (cur_len=%zd)"
> > > +                                       " iter type %d"
> > > +                                       " iov_offset %zd count %zd\n",
> > > +                                       cur_len, direct_iov.type, direct_iov.iov_offset,
> > > +                                       direct_iov.count);
> > > +                               dump_stack();
> > > +                               break;
> > > +                       }
> > > +                       iov_iter_advance(&direct_iov, cur_len);
> > > +
> > > +                       rdata = cifs_readdata_direct_alloc(
> > > +                                       pagevec, cifs_uncached_readv_complete);
> > > +                       if (!rdata) {
> > > +                               add_credits_and_wake_if(server, credits, 0);
> > > +                               rc = -ENOMEM;
> > > +                               break;
> > > +                       }
> > > +
> > > +                       npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE;
> > > +                       rdata->page_offset = start;
> > > +                       rdata->tailsz = npages > 1 ?
> > > +                               cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE :
> > > +                               cur_len;
> > > +
> > > +               } else {
> > > +
> > > +                       npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
> > > +                       /* allocate a readdata struct */
> > > +                       rdata = cifs_readdata_alloc(npages,
> > >                                             cifs_uncached_readv_complete);
> > > -               if (!rdata) {
> > > -                       add_credits_and_wake_if(server, credits, 0);
> > > -                       rc = -ENOMEM;
> > > -                       break;
> > > -               }
> > > +                       if (!rdata) {
> > > +                               add_credits_and_wake_if(server, credits, 0);
> > > +                               rc = -ENOMEM;
> > > +                               break;
> > > +                       }
> > >
> > > -               rc = cifs_read_allocate_pages(rdata, npages);
> > > -               if (rc)
> > > -                       goto error;
> > > +                       rc = cifs_read_allocate_pages(rdata, npages);
> > > +                       if (rc)
> > > +                               goto error;
> > > +
> > > +                       rdata->tailsz = PAGE_SIZE;
> > > +               }
> > >
> > >                 rdata->cfile = cifsFileInfo_get(open_file);
> > >                 rdata->nr_pages = npages; @@ -3139,7 +3237,6 @@
> > > cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
> > >                 rdata->bytes = cur_len;
> > >                 rdata->pid = pid;
> > >                 rdata->pagesz = PAGE_SIZE;
> > > -               rdata->tailsz = PAGE_SIZE;
> > >                 rdata->read_into_pages = cifs_uncached_read_into_pages;
> > >                 rdata->copy_into_pages = cifs_uncached_copy_into_pages;
> > >                 rdata->credits = credits; @@ -3153,9 +3250,11 @@
> > > cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
> > >                 if (rc) {
> > >                         add_credits_and_wake_if(server, rdata->credits, 0);
> > >                         kref_put(&rdata->refcount,
> > > -                                cifs_uncached_readdata_release);
> > > -                       if (rc == -EAGAIN)
> > > +                               cifs_uncached_readdata_release);
> > > +                       if (rc == -EAGAIN) {
> > > +                               iov_iter_revert(&direct_iov, cur_len);
> > >                                 continue;
> > > +                       }
> > >                         break;
> > >                 }
> > >
> > > @@ -3211,45 +3310,62 @@ collect_uncached_read_data(struct
> > cifs_aio_ctx *ctx)
> > >                                  * reading.
> > >                                  */
> > >                                 if (got_bytes && got_bytes < rdata->bytes) {
> > > -                                       rc = cifs_readdata_to_iov(rdata, to);
> > > +                                       rc = 0;
> > > +                                       if (!ctx->direct_io)
> > > +                                               rc =
> > > + cifs_readdata_to_iov(rdata, to);
> > >                                         if (rc) {
> > >                                                 kref_put(&rdata->refcount,
> > > -                                               cifs_uncached_readdata_release);
> > > +
> > > + cifs_uncached_readdata_release);
> > >                                                 continue;
> > >                                         }
> > >                                 }
> > >
> > > -                               rc = cifs_send_async_read(
> > > +                               if (ctx->direct_io) {
> > > +                                       /*
> > > +                                        * Re-use rdata as this is a
> > > +                                        * direct I/O
> > > +                                        */
> > > +                                       rc = cifs_resend_rdata(
> > > +                                               rdata,
> > > +                                               &tmp_list, ctx);
> >
> > ...so we set rc to -EBUSY...
> >
> >
> > > +                               } else {
> > > +                                       rc = cifs_send_async_read(
> > >                                                 rdata->offset + got_bytes,
> > >                                                 rdata->bytes - got_bytes,
> > >                                                 rdata->cfile, cifs_sb,
> > >                                                 &tmp_list, ctx);
> > >
> > > +                                       kref_put(&rdata->refcount,
> > > +                                               cifs_uncached_readdata_release);
> > > +                               }
> > > +
> > >                                 list_splice(&tmp_list, &ctx->list);
> > >
> > > -                               kref_put(&rdata->refcount,
> > > -                                        cifs_uncached_readdata_release);
> > >                                 goto again;
> > >                         } else if (rdata->result)
> > >                                 rc = rdata->result;
> > > -                       else
> > > +                       else if (!ctx->direct_io)
> > >                                 rc = cifs_readdata_to_iov(rdata, to);
> > >
> > >                         /* if there was a short read -- discard anything left */
> > >                         if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
> > >                                 rc = -ENODATA;
> > > +
> > > +                       ctx->total_len += rdata->got_bytes;
> > >                 }
> > >                 list_del_init(&rdata->list);
> > >                 kref_put(&rdata->refcount, cifs_uncached_readdata_release);
> > >         }
> > >
> > > -       for (i = 0; i < ctx->npages; i++) {
> > > -               if (ctx->should_dirty)
> > > -                       set_page_dirty(ctx->bv[i].bv_page);
> > > -               put_page(ctx->bv[i].bv_page);
> > > -       }
> > > +       if (!ctx->direct_io) {
> > > +               for (i = 0; i < ctx->npages; i++) {
> > > +                       if (ctx->should_dirty)
> > > +                               set_page_dirty(ctx->bv[i].bv_page);
> > > +                       put_page(ctx->bv[i].bv_page);
> > > +               }
> > >
> > > -       ctx->total_len = ctx->len - iov_iter_count(to);
> > > +               ctx->total_len = ctx->len - iov_iter_count(to);
> > > +       }
> > >
> > >         cifs_stats_bytes_read(tcon, ctx->total_len);
> > >
> > > @@ -3267,18 +3383,27 @@ collect_uncached_read_data(struct
> > cifs_aio_ctx *ctx)
> > >                 complete(&ctx->done);
> >
> > and then ctx->rc is set to -EBUSY too.
> >
> > >  }
> > >
> > > -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
> > > +ssize_t __cifs_readv(struct kiocb *iocb, struct iov_iter *to, bool
> > > +direct)
> > >  {
> > > -       struct file *file = iocb->ki_filp;
> > > -       ssize_t rc;
> > >         size_t len;
> > > -       ssize_t total_read = 0;
> > > -       loff_t offset = iocb->ki_pos;
> > > +       struct file *file = iocb->ki_filp;
> > >         struct cifs_sb_info *cifs_sb;
> > > -       struct cifs_tcon *tcon;
> > >         struct cifsFileInfo *cfile;
> > > +       struct cifs_tcon *tcon;
> > > +       ssize_t rc, total_read = 0;
> > > +       loff_t offset = iocb->ki_pos;
> > >         struct cifs_aio_ctx *ctx;
> > >
> > > +       /*
> > > +        * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC,
> > > +        * fall back to data copy read path
> > > +        * this could be improved by getting pages directly in ITER_KVEC
> > > +        */
> > > +       if (direct && to->type & ITER_KVEC) {
> > > +               cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n");
> > > +               direct = false;
> > > +       }
> > > +
> > >         len = iov_iter_count(to);
> > >         if (!len)
> > >                 return 0;
> > > @@ -3305,14 +3430,20 @@ ssize_t cifs_user_readv(struct kiocb *iocb,
> > struct iov_iter *to)
> > >         if (to->type == ITER_IOVEC)
> > >                 ctx->should_dirty = true;
> > >
> > > -       rc = setup_aio_ctx_iter(ctx, to, READ);
> > > -       if (rc) {
> > > -               kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > -               return rc;
> > > +       if (direct) {
> > > +               ctx->pos = offset;
> > > +               ctx->direct_io = true;
> > > +               ctx->iter = *to;
> > > +               ctx->len = len;
> > > +       } else {
> > > +               rc = setup_aio_ctx_iter(ctx, to, READ);
> > > +               if (rc) {
> > > +                       kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > +                       return rc;
> > > +               }
> > > +               len = ctx->len;
> > >         }
> > >
> > > -       len = ctx->len;
> > > -
> > >         /* grab a lock here due to read response handlers can access ctx */
> > >         mutex_lock(&ctx->aio_mutex);
> > >
> > > @@ -3354,6 +3485,16 @@ ssize_t cifs_user_readv(struct kiocb *iocb,
> > struct iov_iter *to)
> > >         return rc;
> >
> > In case of a blocking read we will return -EBUSY to the caller but read man
> > page doesn't say anything about a possibility to return this error code. Is it
> > being mapped somehow by VFS or is there another consideration? The same
> > question applies to the write patch as well.
>
> Thanks Pavel,  Yes this is a bug.
>
> In practice, the retry logic will rarely get hit. If getting hit, normally the server will give us far more credits for resending this whole packet.
>
> How about just retrying forever (instead of trying 3 times) on this resend packet? This will change the code to the same behavior before direct I/O patches.
>
> e.g.:
>
>         /*
>          * Try to resend this wdata, waiting for credits before sending
>          * Note: we are attempting to resend the whole wdata not in segments
>          */
>         do {
>                 rc = server->ops->wait_mtu_credits(
>                         server, wdata->bytes, &wsize, &credits);
>
>                 if (rc)
>                         break;
>
>                 if (wsize < wdata->bytes) {
>                         add_credits_and_wake_if(server, credits, 0);
>                         msleep(1000);
>                 }
>         } while (wsize < wdata->bytes);
>

We can do that, but it won't be the same behavior, because the existing code was doing a partial resend (only the remaining bytes):

+                               if (ctx->direct_io) {
+                                       /*
+                                        * Re-use rdata as this is a
+                                        * direct I/O
+                                        */
+                                       rc = cifs_resend_rdata(
+                                               rdata,
+                                               &tmp_list, ctx);
+                               } else {
+                                       rc = cifs_send_async_read(
                                                rdata->offset + got_bytes,
                                                rdata->bytes - got_bytes,
                                                rdata->cfile, cifs_sb,
                                                &tmp_list, ctx);

--
Best regards,
Pavel Shilovsky
Long Li Nov. 29, 2018, 10:48 p.m. | #4
> Subject: Re: [Patch v4 1/3] CIFS: Add support for direct I/O read
> 
> ср, 28 нояб. 2018 г. в 15:43, Long Li <longli@microsoft.com>:
> >
> > > Subject: Re: [Patch v4 1/3] CIFS: Add support for direct I/O read
> > >
> > > Hi Long,
> > >
> > > Please find my comments below.
> > >
> > >
> > > ср, 31 окт. 2018 г. в 15:14, Long Li <longli@linuxonhyperv.com>:
> > > >
> > > > From: Long Li <longli@microsoft.com>
> > > >
> > > > With direct I/O read, we transfer the data directly from transport
> > > > layer to the user data buffer.
> > > >
> > > > Change in v3: add support for kernel AIO
> > > >
> > > > Change in v4:
> > > > Refactor common read code to __cifs_readv for direct and non-direct
> I/O.
> > > > Retry on direct I/O failure.
> > > >
> > > > Signed-off-by: Long Li <longli@microsoft.com>
> > > > ---
> > > >  fs/cifs/cifsfs.h   |   1 +
> > > >  fs/cifs/cifsglob.h |   5 ++
> > > >  fs/cifs/file.c     | 219
> +++++++++++++++++++++++++++++++++++++++++++--
> > > --------
> > > >  3 files changed, 186 insertions(+), 39 deletions(-)
> > > >
> > > > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > > > 5f02318..7fba9aa 100644
> > > > --- a/fs/cifs/cifsfs.h
> > > > +++ b/fs/cifs/cifsfs.h
> > > > @@ -102,6 +102,7 @@ extern int cifs_open(struct inode *inode,
> > > > struct file *file);  extern int cifs_close(struct inode *inode,
> > > > struct file *file);  extern int cifs_closedir(struct inode *inode,
> > > > struct file *file);  extern ssize_t cifs_user_readv(struct kiocb
> > > > *iocb, struct iov_iter *to);
> > > > +extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct
> > > > +iov_iter *to);
> > > >  extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct
> > > > iov_iter *to);  extern ssize_t cifs_user_writev(struct kiocb
> > > > *iocb, struct iov_iter *from);  extern ssize_t
> > > > cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
> > > > diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index
> > > > 7f62c98..52248dd 100644
> > > > --- a/fs/cifs/cifsglob.h
> > > > +++ b/fs/cifs/cifsglob.h
> > > > @@ -1146,6 +1146,11 @@ struct cifs_aio_ctx {
> > > >         unsigned int            len;
> > > >         unsigned int            total_len;
> > > >         bool                    should_dirty;
> > > > +       /*
> > > > +        * Indicates if this aio_ctx is for direct_io,
> > > > +        * If yes, iter is a copy of the user passed iov_iter
> > > > +        */
> > > > +       bool                    direct_io;
> > > >  };
> > > >
> > > >  struct cifs_readdata;
> > > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c index
> > > > 87eece6..daab878
> > > > 100644
> > > > --- a/fs/cifs/file.c
> > > > +++ b/fs/cifs/file.c
> > > > @@ -2965,7 +2965,6 @@ cifs_uncached_readdata_release(struct kref
> > > *refcount)
> > > >         kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
> > > >         for (i = 0; i < rdata->nr_pages; i++) {
> > > >                 put_page(rdata->pages[i]);
> > > > -               rdata->pages[i] = NULL;
> > > >         }
> > > >         cifs_readdata_release(refcount);  } @@ -3092,6 +3091,63 @@
> > > > cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
> > > >         return uncached_fill_pages(server, rdata, iter,
> > > > iter->count); }
> > > >
> > > > +static int cifs_resend_rdata(struct cifs_readdata *rdata,
> > > > +                     struct list_head *rdata_list,
> > > > +                     struct cifs_aio_ctx *ctx) {
> > > > +       int wait_retry = 0;
> > > > +       unsigned int rsize, credits;
> > > > +       int rc;
> > > > +       struct TCP_Server_Info *server =
> > > > +tlink_tcon(rdata->cfile->tlink)->ses->server;
> > > > +
> > > > +       /*
> > > > +        * Try to resend this rdata, waiting for credits up to 3 seconds.
> > > > +        * Note: we are attempting to resend the whole rdata not in
> segments
> > > > +        */
> > > > +       do {
> > > > +               rc = server->ops->wait_mtu_credits(server, rdata->bytes,
> > > > +                                               &rsize, &credits);
> > > > +
> > > > +               if (rc)
> > > > +                       break;
> > > > +
> > > > +               if (rsize < rdata->bytes) {
> > > > +                       add_credits_and_wake_if(server, credits, 0);
> > > > +                       msleep(1000);
> > > > +                       wait_retry++;
> > > > +               }
> > > > +       } while (rsize < rdata->bytes && wait_retry < 3);
> > > > +
> > > > +       /*
> > > > +        * If we can't find enough credits to send this rdata
> > > > +        * release the rdata and return failure, this will pass
> > > > +        * whatever I/O amount we have finished to VFS.
> > > > +        */
> > > > +       if (rsize < rdata->bytes) {
> > > > +               rc = -EBUSY;
> > >
> > > We don't have enough credits and return EBUSY here...
> > >
> > > > +               goto out;
> > > > +       }
> > > > +
> > > > +       rc = -EAGAIN;
> > > > +       while (rc == -EAGAIN)
> > > > +               if (!rdata->cfile->invalidHandle ||
> > > > +                   !(rc = cifs_reopen_file(rdata->cfile, true)))
> > > > +                       rc = server->ops->async_readv(rdata);
> > > > +
> > > > +       if (!rc) {
> > > > +               /* Add to aio pending list */
> > > > +               list_add_tail(&rdata->list, rdata_list);
> > > > +               return 0;
> > > > +       }
> > > > +
> > > > +       add_credits_and_wake_if(server, rdata->credits, 0);
> > > > +out:
> > > > +       kref_put(&rdata->refcount,
> > > > +               cifs_uncached_readdata_release);
> > > > +
> > > > +       return rc;
> > > > +}
> > > > +
> > > >  static int
> > > >  cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo
> *open_file,
> > > >                      struct cifs_sb_info *cifs_sb, struct
> > > > list_head *rdata_list, @@ -3103,6 +3159,9 @@
> > > > cifs_send_async_read(loff_t offset,
> > > size_t len, struct cifsFileInfo *open_file,
> > > >         int rc;
> > > >         pid_t pid;
> > > >         struct TCP_Server_Info *server;
> > > > +       struct page **pagevec;
> > > > +       size_t start;
> > > > +       struct iov_iter direct_iov = ctx->iter;
> > > >
> > > >         server = tlink_tcon(open_file->tlink)->ses->server;
> > > >
> > > > @@ -3111,6 +3170,9 @@ cifs_send_async_read(loff_t offset, size_t
> > > > len,
> > > struct cifsFileInfo *open_file,
> > > >         else
> > > >                 pid = current->tgid;
> > > >
> > > > +       if (ctx->direct_io)
> > > > +               iov_iter_advance(&direct_iov, offset - ctx->pos);
> > > > +
> > > >         do {
> > > >                 rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
> > > >                                                    &rsize,
> > > > &credits); @@ -3118,20 +3180,56 @@ cifs_send_async_read(loff_t
> > > > offset, size_t len,
> > > struct cifsFileInfo *open_file,
> > > >                         break;
> > > >
> > > >                 cur_len = min_t(const size_t, len, rsize);
> > > > -               npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
> > > >
> > > > -               /* allocate a readdata struct */
> > > > -               rdata = cifs_readdata_alloc(npages,
> > > > +               if (ctx->direct_io) {
> > > > +
> > > > +                       cur_len = iov_iter_get_pages_alloc(
> > > > +                                       &direct_iov, &pagevec,
> > > > +                                       cur_len, &start);
> > > > +                       if (cur_len < 0) {
> > > > +                               cifs_dbg(VFS,
> > > > +                                       "couldn't get user pages (cur_len=%zd)"
> > > > +                                       " iter type %d"
> > > > +                                       " iov_offset %zd count %zd\n",
> > > > +                                       cur_len, direct_iov.type, direct_iov.iov_offset,
> > > > +                                       direct_iov.count);
> > > > +                               dump_stack();
> > > > +                               break;
> > > > +                       }
> > > > +                       iov_iter_advance(&direct_iov, cur_len);
> > > > +
> > > > +                       rdata = cifs_readdata_direct_alloc(
> > > > +                                       pagevec, cifs_uncached_readv_complete);
> > > > +                       if (!rdata) {
> > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > +                               rc = -ENOMEM;
> > > > +                               break;
> > > > +                       }
> > > > +
> > > > +                       npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE;
> > > > +                       rdata->page_offset = start;
> > > > +                       rdata->tailsz = npages > 1 ?
> > > > +                               cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE :
> > > > +                               cur_len;
> > > > +
> > > > +               } else {
> > > > +
> > > > +                       npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
> > > > +                       /* allocate a readdata struct */
> > > > +                       rdata = cifs_readdata_alloc(npages,
> > > >                                             cifs_uncached_readv_complete);
> > > > -               if (!rdata) {
> > > > -                       add_credits_and_wake_if(server, credits, 0);
> > > > -                       rc = -ENOMEM;
> > > > -                       break;
> > > > -               }
> > > > +                       if (!rdata) {
> > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > +                               rc = -ENOMEM;
> > > > +                               break;
> > > > +                       }
> > > >
> > > > -               rc = cifs_read_allocate_pages(rdata, npages);
> > > > -               if (rc)
> > > > -                       goto error;
> > > > +                       rc = cifs_read_allocate_pages(rdata, npages);
> > > > +                       if (rc)
> > > > +                               goto error;
> > > > +
> > > > +                       rdata->tailsz = PAGE_SIZE;
> > > > +               }
> > > >
> > > >                 rdata->cfile = cifsFileInfo_get(open_file);
> > > >                 rdata->nr_pages = npages; @@ -3139,7 +3237,6 @@
> > > > cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo
> *open_file,
> > > >                 rdata->bytes = cur_len;
> > > >                 rdata->pid = pid;
> > > >                 rdata->pagesz = PAGE_SIZE;
> > > > -               rdata->tailsz = PAGE_SIZE;
> > > >                 rdata->read_into_pages = cifs_uncached_read_into_pages;
> > > >                 rdata->copy_into_pages = cifs_uncached_copy_into_pages;
> > > >                 rdata->credits = credits; @@ -3153,9 +3250,11 @@
> > > > cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo
> *open_file,
> > > >                 if (rc) {
> > > >                         add_credits_and_wake_if(server, rdata->credits, 0);
> > > >                         kref_put(&rdata->refcount,
> > > > -                                cifs_uncached_readdata_release);
> > > > -                       if (rc == -EAGAIN)
> > > > +                               cifs_uncached_readdata_release);
> > > > +                       if (rc == -EAGAIN) {
> > > > +                               iov_iter_revert(&direct_iov,
> > > > + cur_len);
> > > >                                 continue;
> > > > +                       }
> > > >                         break;
> > > >                 }
> > > >
> > > > @@ -3211,45 +3310,62 @@ collect_uncached_read_data(struct
> > > cifs_aio_ctx *ctx)
> > > >                                  * reading.
> > > >                                  */
> > > >                                 if (got_bytes && got_bytes < rdata->bytes) {
> > > > -                                       rc = cifs_readdata_to_iov(rdata, to);
> > > > +                                       rc = 0;
> > > > +                                       if (!ctx->direct_io)
> > > > +                                               rc =
> > > > + cifs_readdata_to_iov(rdata, to);
> > > >                                         if (rc) {
> > > >                                                 kref_put(&rdata->refcount,
> > > > -                                               cifs_uncached_readdata_release);
> > > > +
> > > > + cifs_uncached_readdata_release);
> > > >                                                 continue;
> > > >                                         }
> > > >                                 }
> > > >
> > > > -                               rc = cifs_send_async_read(
> > > > +                               if (ctx->direct_io) {
> > > > +                                       /*
> > > > +                                        * Re-use rdata as this is a
> > > > +                                        * direct I/O
> > > > +                                        */
> > > > +                                       rc = cifs_resend_rdata(
> > > > +                                               rdata,
> > > > +                                               &tmp_list, ctx);
> > >
> > > ...so we set rc to -EBUSY...
> > >
> > >
> > > > +                               } else {
> > > > +                                       rc = cifs_send_async_read(
> > > >                                                 rdata->offset + got_bytes,
> > > >                                                 rdata->bytes - got_bytes,
> > > >                                                 rdata->cfile, cifs_sb,
> > > >                                                 &tmp_list, ctx);
> > > >
> > > > +                                       kref_put(&rdata->refcount,
> > > > +                                               cifs_uncached_readdata_release);
> > > > +                               }
> > > > +
> > > >                                 list_splice(&tmp_list,
> > > > &ctx->list);
> > > >
> > > > -                               kref_put(&rdata->refcount,
> > > > -                                        cifs_uncached_readdata_release);
> > > >                                 goto again;
> > > >                         } else if (rdata->result)
> > > >                                 rc = rdata->result;
> > > > -                       else
> > > > +                       else if (!ctx->direct_io)
> > > >                                 rc = cifs_readdata_to_iov(rdata,
> > > > to);
> > > >
> > > >                         /* if there was a short read -- discard anything left */
> > > >                         if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
> > > >                                 rc = -ENODATA;
> > > > +
> > > > +                       ctx->total_len += rdata->got_bytes;
> > > >                 }
> > > >                 list_del_init(&rdata->list);
> > > >                 kref_put(&rdata->refcount, cifs_uncached_readdata_release);
> > > >         }
> > > >
> > > > -       for (i = 0; i < ctx->npages; i++) {
> > > > -               if (ctx->should_dirty)
> > > > -                       set_page_dirty(ctx->bv[i].bv_page);
> > > > -               put_page(ctx->bv[i].bv_page);
> > > > -       }
> > > > +       if (!ctx->direct_io) {
> > > > +               for (i = 0; i < ctx->npages; i++) {
> > > > +                       if (ctx->should_dirty)
> > > > +                               set_page_dirty(ctx->bv[i].bv_page);
> > > > +                       put_page(ctx->bv[i].bv_page);
> > > > +               }
> > > >
> > > > -       ctx->total_len = ctx->len - iov_iter_count(to);
> > > > +               ctx->total_len = ctx->len - iov_iter_count(to);
> > > > +       }
> > > >
> > > >         cifs_stats_bytes_read(tcon, ctx->total_len);
> > > >
> > > > @@ -3267,18 +3383,27 @@ collect_uncached_read_data(struct
> > > cifs_aio_ctx *ctx)
> > > >                 complete(&ctx->done);
> > >
> > > and then ctx->rc is set to -EBUSY too.
> > >
> > > >  }
> > > >
> > > > -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
> > > > +ssize_t __cifs_readv(struct kiocb *iocb, struct iov_iter *to,
> > > > +bool
> > > > +direct)
> > > >  {
> > > > -       struct file *file = iocb->ki_filp;
> > > > -       ssize_t rc;
> > > >         size_t len;
> > > > -       ssize_t total_read = 0;
> > > > -       loff_t offset = iocb->ki_pos;
> > > > +       struct file *file = iocb->ki_filp;
> > > >         struct cifs_sb_info *cifs_sb;
> > > > -       struct cifs_tcon *tcon;
> > > >         struct cifsFileInfo *cfile;
> > > > +       struct cifs_tcon *tcon;
> > > > +       ssize_t rc, total_read = 0;
> > > > +       loff_t offset = iocb->ki_pos;
> > > >         struct cifs_aio_ctx *ctx;
> > > >
> > > > +       /*
> > > > +        * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC,
> > > > +        * fall back to data copy read path
> > > > +        * this could be improved by getting pages directly in ITER_KVEC
> > > > +        */
> > > > +       if (direct && to->type & ITER_KVEC) {
> > > > +               cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n");
> > > > +               direct = false;
> > > > +       }
> > > > +
> > > >         len = iov_iter_count(to);
> > > >         if (!len)
> > > >                 return 0;
> > > > @@ -3305,14 +3430,20 @@ ssize_t cifs_user_readv(struct kiocb
> > > > *iocb,
> > > struct iov_iter *to)
> > > >         if (to->type == ITER_IOVEC)
> > > >                 ctx->should_dirty = true;
> > > >
> > > > -       rc = setup_aio_ctx_iter(ctx, to, READ);
> > > > -       if (rc) {
> > > > -               kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > > -               return rc;
> > > > +       if (direct) {
> > > > +               ctx->pos = offset;
> > > > +               ctx->direct_io = true;
> > > > +               ctx->iter = *to;
> > > > +               ctx->len = len;
> > > > +       } else {
> > > > +               rc = setup_aio_ctx_iter(ctx, to, READ);
> > > > +               if (rc) {
> > > > +                       kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > > +                       return rc;
> > > > +               }
> > > > +               len = ctx->len;
> > > >         }
> > > >
> > > > -       len = ctx->len;
> > > > -
> > > >         /* grab a lock here due to read response handlers can access ctx */
> > > >         mutex_lock(&ctx->aio_mutex);
> > > >
> > > > @@ -3354,6 +3485,16 @@ ssize_t cifs_user_readv(struct kiocb *iocb,
> > > struct iov_iter *to)
> > > >         return rc;
> > >
> > > In case of a blocking read we will return -EBUSY to the caller but
> > > read man page doesn't say anything about a possibility to return
> > > this error code. Is it being mapped somehow by VFS or is there
> > > another consideration? The same question applies to the write patch as
> well.
> >
> > Thanks Pavel,  Yes this is a bug.
> >
> > In practice, the retry logic will rarely get hit. If getting hit, normally the
> server will give us far more credits for resending this whole packet.
> >
> > How about just retrying forever (instead of trying 3 times) on this resend
> packet? This will change the code to the same behavior before direct I/O
> patches.
> >
> > e.g.:
> >
> >         /*
> >          * Try to resend this wdata, waiting for credits before sending
> >          * Note: we are attempting to resend the whole wdata not in
> segments
> >          */
> >         do {
> >                 rc = server->ops->wait_mtu_credits(
> >                         server, wdata->bytes, &wsize, &credits);
> >
> >                 if (rc)
> >                         break;
> >
> >                 if (wsize < wdata->bytes) {
> >                         add_credits_and_wake_if(server, credits, 0);
> >                         msleep(1000);
> >                 }
> >         } while (wsize < wdata->bytes);
> >
> 
> We can do that but it won't be the same behavior because we were using a
> partial send:

Yes it's a little different.

I don't see any ill effects of doing this, other than that we may wait a little longer for a full credit to send the whole packet. In practice the server always offers a full credit on the first call to wait_mtu_credits(); I have yet to see the while loop actually iterate.

The resend path is rarely hit in stress tests on TCP — I estimate less than 0.001% on my test setup — and the while loop waiting for credits is even harder to hit; I have never seen it happen.

On RDMA, the resend path is never used.

> 
> +                               if (ctx->direct_io) {
> +                                       /*
> +                                        * Re-use rdata as this is a
> +                                        * direct I/O
> +                                        */
> +                                       rc = cifs_resend_rdata(
> +                                               rdata,
> +                                               &tmp_list, ctx);
> +                               } else {
> +                                       rc = cifs_send_async_read(
>                                                 rdata->offset + got_bytes,
>                                                 rdata->bytes - got_bytes,
>                                                 rdata->cfile, cifs_sb,
>                                                 &tmp_list, ctx);
> 
> --
> Best regards,
> Pavel Shilovsky
Pavel Shilovsky Nov. 29, 2018, 11:23 p.m. | #5
чт, 29 нояб. 2018 г. в 14:48, Long Li <longli@microsoft.com>:
>
> > Subject: Re: [Patch v4 1/3] CIFS: Add support for direct I/O read
> >
> > ср, 28 нояб. 2018 г. в 15:43, Long Li <longli@microsoft.com>:
> > >
> > > > Subject: Re: [Patch v4 1/3] CIFS: Add support for direct I/O read
> > > >
> > > > Hi Long,
> > > >
> > > > Please find my comments below.
> > > >
> > > >
> > > > ср, 31 окт. 2018 г. в 15:14, Long Li <longli@linuxonhyperv.com>:
> > > > >
> > > > > From: Long Li <longli@microsoft.com>
> > > > >
> > > > > With direct I/O read, we transfer the data directly from transport
> > > > > layer to the user data buffer.
> > > > >
> > > > > Change in v3: add support for kernel AIO
> > > > >
> > > > > Change in v4:
> > > > > Refactor common read code to __cifs_readv for direct and non-direct
> > I/O.
> > > > > Retry on direct I/O failure.
> > > > >
> > > > > Signed-off-by: Long Li <longli@microsoft.com>
> > > > > ---
> > > > >  fs/cifs/cifsfs.h   |   1 +
> > > > >  fs/cifs/cifsglob.h |   5 ++
> > > > >  fs/cifs/file.c     | 219
> > +++++++++++++++++++++++++++++++++++++++++++--
> > > > --------
> > > > >  3 files changed, 186 insertions(+), 39 deletions(-)
> > > > >
> > > > > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > > > > 5f02318..7fba9aa 100644
> > > > > --- a/fs/cifs/cifsfs.h
> > > > > +++ b/fs/cifs/cifsfs.h
> > > > > @@ -102,6 +102,7 @@ extern int cifs_open(struct inode *inode,
> > > > > struct file *file);  extern int cifs_close(struct inode *inode,
> > > > > struct file *file);  extern int cifs_closedir(struct inode *inode,
> > > > > struct file *file);  extern ssize_t cifs_user_readv(struct kiocb
> > > > > *iocb, struct iov_iter *to);
> > > > > +extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct
> > > > > +iov_iter *to);
> > > > >  extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct
> > > > > iov_iter *to);  extern ssize_t cifs_user_writev(struct kiocb
> > > > > *iocb, struct iov_iter *from);  extern ssize_t
> > > > > cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
> > > > > diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index
> > > > > 7f62c98..52248dd 100644
> > > > > --- a/fs/cifs/cifsglob.h
> > > > > +++ b/fs/cifs/cifsglob.h
> > > > > @@ -1146,6 +1146,11 @@ struct cifs_aio_ctx {
> > > > >         unsigned int            len;
> > > > >         unsigned int            total_len;
> > > > >         bool                    should_dirty;
> > > > > +       /*
> > > > > +        * Indicates if this aio_ctx is for direct_io,
> > > > > +        * If yes, iter is a copy of the user passed iov_iter
> > > > > +        */
> > > > > +       bool                    direct_io;
> > > > >  };
> > > > >
> > > > >  struct cifs_readdata;
> > > > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c index
> > > > > 87eece6..daab878
> > > > > 100644
> > > > > --- a/fs/cifs/file.c
> > > > > +++ b/fs/cifs/file.c
> > > > > @@ -2965,7 +2965,6 @@ cifs_uncached_readdata_release(struct kref
> > > > *refcount)
> > > > >         kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
> > > > >         for (i = 0; i < rdata->nr_pages; i++) {
> > > > >                 put_page(rdata->pages[i]);
> > > > > -               rdata->pages[i] = NULL;
> > > > >         }
> > > > >         cifs_readdata_release(refcount);  } @@ -3092,6 +3091,63 @@
> > > > > cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
> > > > >         return uncached_fill_pages(server, rdata, iter,
> > > > > iter->count); }
> > > > >
> > > > > +static int cifs_resend_rdata(struct cifs_readdata *rdata,
> > > > > +                     struct list_head *rdata_list,
> > > > > +                     struct cifs_aio_ctx *ctx) {
> > > > > +       int wait_retry = 0;
> > > > > +       unsigned int rsize, credits;
> > > > > +       int rc;
> > > > > +       struct TCP_Server_Info *server =
> > > > > +tlink_tcon(rdata->cfile->tlink)->ses->server;
> > > > > +
> > > > > +       /*
> > > > > +        * Try to resend this rdata, waiting for credits up to 3 seconds.
> > > > > +        * Note: we are attempting to resend the whole rdata not in
> > segments
> > > > > +        */
> > > > > +       do {
> > > > > +               rc = server->ops->wait_mtu_credits(server, rdata->bytes,
> > > > > +                                               &rsize, &credits);
> > > > > +
> > > > > +               if (rc)
> > > > > +                       break;
> > > > > +
> > > > > +               if (rsize < rdata->bytes) {
> > > > > +                       add_credits_and_wake_if(server, credits, 0);
> > > > > +                       msleep(1000);
> > > > > +                       wait_retry++;
> > > > > +               }
> > > > > +       } while (rsize < rdata->bytes && wait_retry < 3);
> > > > > +
> > > > > +       /*
> > > > > +        * If we can't find enough credits to send this rdata
> > > > > +        * release the rdata and return failure, this will pass
> > > > > +        * whatever I/O amount we have finished to VFS.
> > > > > +        */
> > > > > +       if (rsize < rdata->bytes) {
> > > > > +               rc = -EBUSY;
> > > >
> > > > We don't have enough credits and return EBUSY here...
> > > >
> > > > > +               goto out;
> > > > > +       }
> > > > > +
> > > > > +       rc = -EAGAIN;
> > > > > +       while (rc == -EAGAIN)
> > > > > +               if (!rdata->cfile->invalidHandle ||
> > > > > +                   !(rc = cifs_reopen_file(rdata->cfile, true)))
> > > > > +                       rc = server->ops->async_readv(rdata);
> > > > > +
> > > > > +       if (!rc) {
> > > > > +               /* Add to aio pending list */
> > > > > +               list_add_tail(&rdata->list, rdata_list);
> > > > > +               return 0;
> > > > > +       }
> > > > > +
> > > > > +       add_credits_and_wake_if(server, rdata->credits, 0);
> > > > > +out:
> > > > > +       kref_put(&rdata->refcount,
> > > > > +               cifs_uncached_readdata_release);
> > > > > +
> > > > > +       return rc;
> > > > > +}
> > > > > +
> > > > >  static int
> > > > >  cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo
> > *open_file,
> > > > >                      struct cifs_sb_info *cifs_sb, struct
> > > > > list_head *rdata_list, @@ -3103,6 +3159,9 @@
> > > > > cifs_send_async_read(loff_t offset,
> > > > size_t len, struct cifsFileInfo *open_file,
> > > > >         int rc;
> > > > >         pid_t pid;
> > > > >         struct TCP_Server_Info *server;
> > > > > +       struct page **pagevec;
> > > > > +       size_t start;
> > > > > +       struct iov_iter direct_iov = ctx->iter;
> > > > >
> > > > >         server = tlink_tcon(open_file->tlink)->ses->server;
> > > > >
> > > > > @@ -3111,6 +3170,9 @@ cifs_send_async_read(loff_t offset, size_t
> > > > > len,
> > > > struct cifsFileInfo *open_file,
> > > > >         else
> > > > >                 pid = current->tgid;
> > > > >
> > > > > +       if (ctx->direct_io)
> > > > > +               iov_iter_advance(&direct_iov, offset - ctx->pos);
> > > > > +
> > > > >         do {
> > > > >                 rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
> > > > >                                                    &rsize,
> > > > > &credits); @@ -3118,20 +3180,56 @@ cifs_send_async_read(loff_t
> > > > > offset, size_t len,
> > > > struct cifsFileInfo *open_file,
> > > > >                         break;
> > > > >
> > > > >                 cur_len = min_t(const size_t, len, rsize);
> > > > > -               npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
> > > > >
> > > > > -               /* allocate a readdata struct */
> > > > > -               rdata = cifs_readdata_alloc(npages,
> > > > > +               if (ctx->direct_io) {
> > > > > +
> > > > > +                       cur_len = iov_iter_get_pages_alloc(
> > > > > +                                       &direct_iov, &pagevec,
> > > > > +                                       cur_len, &start);
> > > > > +                       if (cur_len < 0) {
> > > > > +                               cifs_dbg(VFS,
> > > > > +                                       "couldn't get user pages (cur_len=%zd)"
> > > > > +                                       " iter type %d"
> > > > > +                                       " iov_offset %zd count %zd\n",
> > > > > +                                       cur_len, direct_iov.type, direct_iov.iov_offset,
> > > > > +                                       direct_iov.count);
> > > > > +                               dump_stack();
> > > > > +                               break;
> > > > > +                       }
> > > > > +                       iov_iter_advance(&direct_iov, cur_len);
> > > > > +
> > > > > +                       rdata = cifs_readdata_direct_alloc(
> > > > > +                                       pagevec, cifs_uncached_readv_complete);
> > > > > +                       if (!rdata) {
> > > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > > +                               rc = -ENOMEM;
> > > > > +                               break;
> > > > > +                       }
> > > > > +
> > > > > +                       npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE;
> > > > > +                       rdata->page_offset = start;
> > > > > +                       rdata->tailsz = npages > 1 ?
> > > > > +                               cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE :
> > > > > +                               cur_len;
> > > > > +
> > > > > +               } else {
> > > > > +
> > > > > +                       npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
> > > > > +                       /* allocate a readdata struct */
> > > > > +                       rdata = cifs_readdata_alloc(npages,
> > > > >                                             cifs_uncached_readv_complete);
> > > > > -               if (!rdata) {
> > > > > -                       add_credits_and_wake_if(server, credits, 0);
> > > > > -                       rc = -ENOMEM;
> > > > > -                       break;
> > > > > -               }
> > > > > +                       if (!rdata) {
> > > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > > +                               rc = -ENOMEM;
> > > > > +                               break;
> > > > > +                       }
> > > > >
> > > > > -               rc = cifs_read_allocate_pages(rdata, npages);
> > > > > -               if (rc)
> > > > > -                       goto error;
> > > > > +                       rc = cifs_read_allocate_pages(rdata, npages);
> > > > > +                       if (rc)
> > > > > +                               goto error;
> > > > > +
> > > > > +                       rdata->tailsz = PAGE_SIZE;
> > > > > +               }
> > > > >
> > > > >                 rdata->cfile = cifsFileInfo_get(open_file);
> > > > >                 rdata->nr_pages = npages; @@ -3139,7 +3237,6 @@
> > > > > cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo
> > *open_file,
> > > > >                 rdata->bytes = cur_len;
> > > > >                 rdata->pid = pid;
> > > > >                 rdata->pagesz = PAGE_SIZE;
> > > > > -               rdata->tailsz = PAGE_SIZE;
> > > > >                 rdata->read_into_pages = cifs_uncached_read_into_pages;
> > > > >                 rdata->copy_into_pages = cifs_uncached_copy_into_pages;
> > > > >                 rdata->credits = credits; @@ -3153,9 +3250,11 @@
> > > > > cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo
> > *open_file,
> > > > >                 if (rc) {
> > > > >                         add_credits_and_wake_if(server, rdata->credits, 0);
> > > > >                         kref_put(&rdata->refcount,
> > > > > -                                cifs_uncached_readdata_release);
> > > > > -                       if (rc == -EAGAIN)
> > > > > +                               cifs_uncached_readdata_release);
> > > > > +                       if (rc == -EAGAIN) {
> > > > > +                               iov_iter_revert(&direct_iov,
> > > > > + cur_len);
> > > > >                                 continue;
> > > > > +                       }
> > > > >                         break;
> > > > >                 }
> > > > >
> > > > > @@ -3211,45 +3310,62 @@ collect_uncached_read_data(struct
> > > > cifs_aio_ctx *ctx)
> > > > >                                  * reading.
> > > > >                                  */
> > > > >                                 if (got_bytes && got_bytes < rdata->bytes) {
> > > > > -                                       rc = cifs_readdata_to_iov(rdata, to);
> > > > > +                                       rc = 0;
> > > > > +                                       if (!ctx->direct_io)
> > > > > +                                               rc =
> > > > > + cifs_readdata_to_iov(rdata, to);
> > > > >                                         if (rc) {
> > > > >                                                 kref_put(&rdata->refcount,
> > > > > -                                               cifs_uncached_readdata_release);
> > > > > +
> > > > > + cifs_uncached_readdata_release);
> > > > >                                                 continue;
> > > > >                                         }
> > > > >                                 }
> > > > >
> > > > > -                               rc = cifs_send_async_read(
> > > > > +                               if (ctx->direct_io) {
> > > > > +                                       /*
> > > > > +                                        * Re-use rdata as this is a
> > > > > +                                        * direct I/O
> > > > > +                                        */
> > > > > +                                       rc = cifs_resend_rdata(
> > > > > +                                               rdata,
> > > > > +                                               &tmp_list, ctx);
> > > >
> > > > ...so we set rc to -EBUSY...
> > > >
> > > >
> > > > > +                               } else {
> > > > > +                                       rc = cifs_send_async_read(
> > > > >                                                 rdata->offset + got_bytes,
> > > > >                                                 rdata->bytes - got_bytes,
> > > > >                                                 rdata->cfile, cifs_sb,
> > > > >                                                 &tmp_list, ctx);
> > > > >
> > > > > +                                       kref_put(&rdata->refcount,
> > > > > +                                               cifs_uncached_readdata_release);
> > > > > +                               }
> > > > > +
> > > > >                                 list_splice(&tmp_list,
> > > > > &ctx->list);
> > > > >
> > > > > -                               kref_put(&rdata->refcount,
> > > > > -                                        cifs_uncached_readdata_release);
> > > > >                                 goto again;
> > > > >                         } else if (rdata->result)
> > > > >                                 rc = rdata->result;
> > > > > -                       else
> > > > > +                       else if (!ctx->direct_io)
> > > > >                                 rc = cifs_readdata_to_iov(rdata,
> > > > > to);
> > > > >
> > > > >                         /* if there was a short read -- discard anything left */
> > > > >                         if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
> > > > >                                 rc = -ENODATA;
> > > > > +
> > > > > +                       ctx->total_len += rdata->got_bytes;
> > > > >                 }
> > > > >                 list_del_init(&rdata->list);
> > > > >                 kref_put(&rdata->refcount, cifs_uncached_readdata_release);
> > > > >         }
> > > > >
> > > > > -       for (i = 0; i < ctx->npages; i++) {
> > > > > -               if (ctx->should_dirty)
> > > > > -                       set_page_dirty(ctx->bv[i].bv_page);
> > > > > -               put_page(ctx->bv[i].bv_page);
> > > > > -       }
> > > > > +       if (!ctx->direct_io) {
> > > > > +               for (i = 0; i < ctx->npages; i++) {
> > > > > +                       if (ctx->should_dirty)
> > > > > +                               set_page_dirty(ctx->bv[i].bv_page);
> > > > > +                       put_page(ctx->bv[i].bv_page);
> > > > > +               }
> > > > >
> > > > > -       ctx->total_len = ctx->len - iov_iter_count(to);
> > > > > +               ctx->total_len = ctx->len - iov_iter_count(to);
> > > > > +       }
> > > > >
> > > > >         cifs_stats_bytes_read(tcon, ctx->total_len);
> > > > >
> > > > > @@ -3267,18 +3383,27 @@ collect_uncached_read_data(struct
> > > > cifs_aio_ctx *ctx)
> > > > >                 complete(&ctx->done);
> > > >
> > > > and then ctx->rc is set to -EBUSY too.
> > > >
> > > > >  }
> > > > >
> > > > > -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
> > > > > +ssize_t __cifs_readv(struct kiocb *iocb, struct iov_iter *to,
> > > > > +bool
> > > > > +direct)
> > > > >  {
> > > > > -       struct file *file = iocb->ki_filp;
> > > > > -       ssize_t rc;
> > > > >         size_t len;
> > > > > -       ssize_t total_read = 0;
> > > > > -       loff_t offset = iocb->ki_pos;
> > > > > +       struct file *file = iocb->ki_filp;
> > > > >         struct cifs_sb_info *cifs_sb;
> > > > > -       struct cifs_tcon *tcon;
> > > > >         struct cifsFileInfo *cfile;
> > > > > +       struct cifs_tcon *tcon;
> > > > > +       ssize_t rc, total_read = 0;
> > > > > +       loff_t offset = iocb->ki_pos;
> > > > >         struct cifs_aio_ctx *ctx;
> > > > >
> > > > > +       /*
> > > > > +        * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC,
> > > > > +        * fall back to data copy read path
> > > > > +        * this could be improved by getting pages directly in ITER_KVEC
> > > > > +        */
> > > > > +       if (direct && to->type & ITER_KVEC) {
> > > > > +               cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n");
> > > > > +               direct = false;
> > > > > +       }
> > > > > +
> > > > >         len = iov_iter_count(to);
> > > > >         if (!len)
> > > > >                 return 0;
> > > > > @@ -3305,14 +3430,20 @@ ssize_t cifs_user_readv(struct kiocb
> > > > > *iocb,
> > > > struct iov_iter *to)
> > > > >         if (to->type == ITER_IOVEC)
> > > > >                 ctx->should_dirty = true;
> > > > >
> > > > > -       rc = setup_aio_ctx_iter(ctx, to, READ);
> > > > > -       if (rc) {
> > > > > -               kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > > > -               return rc;
> > > > > +       if (direct) {
> > > > > +               ctx->pos = offset;
> > > > > +               ctx->direct_io = true;
> > > > > +               ctx->iter = *to;
> > > > > +               ctx->len = len;
> > > > > +       } else {
> > > > > +               rc = setup_aio_ctx_iter(ctx, to, READ);
> > > > > +               if (rc) {
> > > > > +                       kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > > > +                       return rc;
> > > > > +               }
> > > > > +               len = ctx->len;
> > > > >         }
> > > > >
> > > > > -       len = ctx->len;
> > > > > -
> > > > >         /* grab a lock here due to read response handlers can access ctx */
> > > > >         mutex_lock(&ctx->aio_mutex);
> > > > >
> > > > > @@ -3354,6 +3485,16 @@ ssize_t cifs_user_readv(struct kiocb *iocb,
> > > > struct iov_iter *to)
> > > > >         return rc;
> > > >
> > > > In case of a blocking read we will return -EBUSY to the caller but
> > > > read man page doesn't say anything about a possibility to return
> > > > this error code. Is it being mapped somehow by VFS or is there
> > > > another consideration? The same question applies to the write patch as
> > well.
> > >
> > > Thanks Pavel,  Yes this is a bug.
> > >
> > > In practice, the retry logic will rarely get hit. If getting hit, normally the
> > server will give us far more credits for resending this whole packet.
> > >
> > > How about just retrying forever (instead of trying 3 times) on this resend
> > packet? This will change the code to the same behavior before direct I/O
> > patches.
> > >
> > > e.g.:
> > >
> > >         /*
> > >          * Try to resend this wdata, waiting for credits before sending
> > >          * Note: we are attempting to resend the whole wdata not in
> > segments
> > >          */
> > >         do {
> > >                 rc = server->ops->wait_mtu_credits(
> > >                         server, wdata->bytes, &wsize, &credits);
> > >
> > >                 if (rc)
> > >                         break;
> > >
> > >                 if (wsize < wdata->bytes) {
> > >                         add_credits_and_wake_if(server, credits, 0);
> > >                         msleep(1000);
> > >                 }
> > >         } while (wsize < wdata->bytes);
> > >
> >
> > We can do that but it won't be the same behavior because we were using a
> > partial send:
>
> Yes it's a little different.
>
> I don't see any ill effects of doing this, other than that we may wait a little longer for a full credit to send the whole packet. In practice the server always offers a full credit on the first call to wait_mtu_credits(); I have yet to see the while loop actually iterate.
>
> The resend path is rarely hit in stress tests on TCP — I estimate less than 0.001% on my test setup — and the while loop waiting for credits is even harder to hit; I have never seen it happen.
>
> On RDMA, the resend path is never used.

Ok. Another way to fix this is to break the request into several
smaller ones consuming 1 credit per request - see
cifs_writev_requeue() function that does the same thing for writes.

--
Best regards,
Pavel Shilovsky

Patch

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 5f02318..7fba9aa 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -102,6 +102,7 @@  extern int cifs_open(struct inode *inode, struct file *file);
 extern int cifs_close(struct inode *inode, struct file *file);
 extern int cifs_closedir(struct inode *inode, struct file *file);
 extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
 extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7f62c98..52248dd 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1146,6 +1146,11 @@  struct cifs_aio_ctx {
 	unsigned int		len;
 	unsigned int		total_len;
 	bool			should_dirty;
+	/*
+	 * Indicates if this aio_ctx is for direct_io,
+	 * If yes, iter is a copy of the user passed iov_iter
+	 */
+	bool			direct_io;
 };
 
 struct cifs_readdata;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 87eece6..daab878 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2965,7 +2965,6 @@  cifs_uncached_readdata_release(struct kref *refcount)
 	kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
 	for (i = 0; i < rdata->nr_pages; i++) {
 		put_page(rdata->pages[i]);
-		rdata->pages[i] = NULL;
 	}
 	cifs_readdata_release(refcount);
 }
@@ -3092,6 +3091,63 @@  cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
 	return uncached_fill_pages(server, rdata, iter, iter->count);
 }
 
+static int cifs_resend_rdata(struct cifs_readdata *rdata,
+                     struct list_head *rdata_list,
+                     struct cifs_aio_ctx *ctx)
+{
+	int wait_retry = 0;
+	unsigned int rsize, credits;
+	int rc;
+	struct TCP_Server_Info *server = tlink_tcon(rdata->cfile->tlink)->ses->server;
+
+	/*
+	 * Try to resend this rdata, waiting for credits up to 3 seconds.
+	 * Note: we are attempting to resend the whole rdata not in segments
+	 */
+	do {
+		rc = server->ops->wait_mtu_credits(server, rdata->bytes,
+						&rsize, &credits);
+
+		if (rc)
+			break;
+
+		if (rsize < rdata->bytes) {
+			add_credits_and_wake_if(server, credits, 0);
+			msleep(1000);
+			wait_retry++;
+		}
+	} while (rsize < rdata->bytes && wait_retry < 3);
+
+	/*
+	 * If we can't find enough credits to send this rdata
+	 * release the rdata and return failure, this will pass
+	 * whatever I/O amount we have finished to VFS.
+	 */
+	if (rsize < rdata->bytes) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	rc = -EAGAIN;
+	while (rc == -EAGAIN)
+		if (!rdata->cfile->invalidHandle ||
+		    !(rc = cifs_reopen_file(rdata->cfile, true)))
+			rc = server->ops->async_readv(rdata);
+
+	if (!rc) {
+		/* Add to aio pending list */
+		list_add_tail(&rdata->list, rdata_list);
+		return 0;
+	}
+
+	add_credits_and_wake_if(server, rdata->credits, 0);
+out:
+	kref_put(&rdata->refcount,
+		cifs_uncached_readdata_release);
+
+	return rc;
+}
+
 static int
 cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 		     struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
@@ -3103,6 +3159,9 @@  cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 	int rc;
 	pid_t pid;
 	struct TCP_Server_Info *server;
+	struct page **pagevec;
+	size_t start;
+	struct iov_iter direct_iov = ctx->iter;
 
 	server = tlink_tcon(open_file->tlink)->ses->server;
 
@@ -3111,6 +3170,9 @@  cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 	else
 		pid = current->tgid;
 
+	if (ctx->direct_io)
+		iov_iter_advance(&direct_iov, offset - ctx->pos);
+
 	do {
 		rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
 						   &rsize, &credits);
@@ -3118,20 +3180,56 @@  cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 			break;
 
 		cur_len = min_t(const size_t, len, rsize);
-		npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
 
-		/* allocate a readdata struct */
-		rdata = cifs_readdata_alloc(npages,
+		if (ctx->direct_io) {
+
+			cur_len = iov_iter_get_pages_alloc(
+					&direct_iov, &pagevec,
+					cur_len, &start);
+			if (cur_len < 0) {
+				cifs_dbg(VFS,
+					"couldn't get user pages (cur_len=%zd)"
+					" iter type %d"
+					" iov_offset %zd count %zd\n",
+					cur_len, direct_iov.type, direct_iov.iov_offset,
+					direct_iov.count);
+				dump_stack();
+				break;
+			}
+			iov_iter_advance(&direct_iov, cur_len);
+
+			rdata = cifs_readdata_direct_alloc(
+					pagevec, cifs_uncached_readv_complete);
+			if (!rdata) {
+				add_credits_and_wake_if(server, credits, 0);
+				rc = -ENOMEM;
+				break;
+			}
+
+			npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE;
+			rdata->page_offset = start;
+			rdata->tailsz = npages > 1 ?
+				cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE :
+				cur_len;
+
+		} else {
+
+			npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
+			/* allocate a readdata struct */
+			rdata = cifs_readdata_alloc(npages,
 					    cifs_uncached_readv_complete);
-		if (!rdata) {
-			add_credits_and_wake_if(server, credits, 0);
-			rc = -ENOMEM;
-			break;
-		}
+			if (!rdata) {
+				add_credits_and_wake_if(server, credits, 0);
+				rc = -ENOMEM;
+				break;
+			}
 
-		rc = cifs_read_allocate_pages(rdata, npages);
-		if (rc)
-			goto error;
+			rc = cifs_read_allocate_pages(rdata, npages);
+			if (rc)
+				goto error;
+
+			rdata->tailsz = PAGE_SIZE;
+		}
 
 		rdata->cfile = cifsFileInfo_get(open_file);
 		rdata->nr_pages = npages;
@@ -3139,7 +3237,6 @@  cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 		rdata->bytes = cur_len;
 		rdata->pid = pid;
 		rdata->pagesz = PAGE_SIZE;
-		rdata->tailsz = PAGE_SIZE;
 		rdata->read_into_pages = cifs_uncached_read_into_pages;
 		rdata->copy_into_pages = cifs_uncached_copy_into_pages;
 		rdata->credits = credits;
@@ -3153,9 +3250,11 @@  cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 		if (rc) {
 			add_credits_and_wake_if(server, rdata->credits, 0);
 			kref_put(&rdata->refcount,
-				 cifs_uncached_readdata_release);
-			if (rc == -EAGAIN)
+				cifs_uncached_readdata_release);
+			if (rc == -EAGAIN) {
+				iov_iter_revert(&direct_iov, cur_len);
 				continue;
+			}
 			break;
 		}
 
@@ -3211,45 +3310,62 @@  collect_uncached_read_data(struct cifs_aio_ctx *ctx)
 				 * reading.
 				 */
 				if (got_bytes && got_bytes < rdata->bytes) {
-					rc = cifs_readdata_to_iov(rdata, to);
+					rc = 0;
+					if (!ctx->direct_io)
+						rc = cifs_readdata_to_iov(rdata, to);
 					if (rc) {
 						kref_put(&rdata->refcount,
-						cifs_uncached_readdata_release);
+							cifs_uncached_readdata_release);
 						continue;
 					}
 				}
 
-				rc = cifs_send_async_read(
+				if (ctx->direct_io) {
+					/*
+					 * Re-use rdata as this is a
+					 * direct I/O
+					 */
+					rc = cifs_resend_rdata(
+						rdata,
+						&tmp_list, ctx);
+				} else {
+					rc = cifs_send_async_read(
 						rdata->offset + got_bytes,
 						rdata->bytes - got_bytes,
 						rdata->cfile, cifs_sb,
 						&tmp_list, ctx);
 
+					kref_put(&rdata->refcount,
+						cifs_uncached_readdata_release);
+				}
+
 				list_splice(&tmp_list, &ctx->list);
 
-				kref_put(&rdata->refcount,
-					 cifs_uncached_readdata_release);
 				goto again;
 			} else if (rdata->result)
 				rc = rdata->result;
-			else
+			else if (!ctx->direct_io)
 				rc = cifs_readdata_to_iov(rdata, to);
 
 			/* if there was a short read -- discard anything left */
 			if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
 				rc = -ENODATA;
+
+			ctx->total_len += rdata->got_bytes;
 		}
 		list_del_init(&rdata->list);
 		kref_put(&rdata->refcount, cifs_uncached_readdata_release);
 	}
 
-	for (i = 0; i < ctx->npages; i++) {
-		if (ctx->should_dirty)
-			set_page_dirty(ctx->bv[i].bv_page);
-		put_page(ctx->bv[i].bv_page);
-	}
+	if (!ctx->direct_io) {
+		for (i = 0; i < ctx->npages; i++) {
+			if (ctx->should_dirty)
+				set_page_dirty(ctx->bv[i].bv_page);
+			put_page(ctx->bv[i].bv_page);
+		}
 
-	ctx->total_len = ctx->len - iov_iter_count(to);
+		ctx->total_len = ctx->len - iov_iter_count(to);
+	}
 
 	cifs_stats_bytes_read(tcon, ctx->total_len);
 
@@ -3267,18 +3383,27 @@  collect_uncached_read_data(struct cifs_aio_ctx *ctx)
 		complete(&ctx->done);
 }
 
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+ssize_t __cifs_readv(struct kiocb *iocb, struct iov_iter *to, bool direct)
 {
-	struct file *file = iocb->ki_filp;
-	ssize_t rc;
 	size_t len;
-	ssize_t total_read = 0;
-	loff_t offset = iocb->ki_pos;
+	struct file *file = iocb->ki_filp;
 	struct cifs_sb_info *cifs_sb;
-	struct cifs_tcon *tcon;
 	struct cifsFileInfo *cfile;
+	struct cifs_tcon *tcon;
+	ssize_t rc, total_read = 0;
+	loff_t offset = iocb->ki_pos;
 	struct cifs_aio_ctx *ctx;
 
+	/*
+	 * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC,
+	 * fall back to data copy read path
+	 * this could be improved by getting pages directly in ITER_KVEC
+	 */
+	if (direct && to->type & ITER_KVEC) {
+		cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n");
+		direct = false;
+	}
+
 	len = iov_iter_count(to);
 	if (!len)
 		return 0;
@@ -3305,14 +3430,20 @@  ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
 	if (to->type == ITER_IOVEC)
 		ctx->should_dirty = true;
 
-	rc = setup_aio_ctx_iter(ctx, to, READ);
-	if (rc) {
-		kref_put(&ctx->refcount, cifs_aio_ctx_release);
-		return rc;
+	if (direct) {
+		ctx->pos = offset;
+		ctx->direct_io = true;
+		ctx->iter = *to;
+		ctx->len = len;
+	} else {
+		rc = setup_aio_ctx_iter(ctx, to, READ);
+		if (rc) {
+			kref_put(&ctx->refcount, cifs_aio_ctx_release);
+			return rc;
+		}
+		len = ctx->len;
 	}
 
-	len = ctx->len;
-
 	/* grab a lock here due to read response handlers can access ctx */
 	mutex_lock(&ctx->aio_mutex);
 
@@ -3354,6 +3485,16 @@  ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
 	return rc;
 }
 
+ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+	return __cifs_readv(iocb, to, true);
+}
+
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+	return __cifs_readv(iocb, to, false);
+}
+
 ssize_t
 cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
 {