[v4,2/3] CIFS: Add support for direct I/O write

Message ID 20181031221311.2596-2-longli@linuxonhyperv.com
State New
Headers show
Series
  • [v4,1/3] CIFS: Add support for direct I/O read
Related show

Commit Message

Long Li Oct. 31, 2018, 10:13 p.m.
From: Long Li <longli@microsoft.com>

With direct I/O write, user supplied buffers are pinned to the memory and data
are transferred directly from user buffers to the transport layer.

Change in v3: add support for kernel AIO

Change in v4:
Refactor common write code to __cifs_writev for direct and non-direct I/O.
Retry on direct I/O failure.

Signed-off-by: Long Li <longli@microsoft.com>
---
 fs/cifs/cifsfs.h |   1 +
 fs/cifs/file.c   | 194 +++++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 154 insertions(+), 41 deletions(-)

Comments

Pavel Shilovsky Nov. 17, 2018, 12:20 a.m. | #1
ср, 31 окт. 2018 г. в 15:26, Long Li <longli@linuxonhyperv.com>:
>
> From: Long Li <longli@microsoft.com>
>
> With direct I/O write, user supplied buffers are pinned to the memory and data
> are transferred directly from user buffers to the transport layer.
>
> Change in v3: add support for kernel AIO
>
> Change in v4:
> Refactor common write code to __cifs_writev for direct and non-direct I/O.
> Retry on direct I/O failure.
>
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
>  fs/cifs/cifsfs.h |   1 +
>  fs/cifs/file.c   | 194 +++++++++++++++++++++++++++++++++++++++++++------------
>  2 files changed, 154 insertions(+), 41 deletions(-)
>
> diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
> index 7fba9aa..e9c5103 100644
> --- a/fs/cifs/cifsfs.h
> +++ b/fs/cifs/cifsfs.h
> @@ -105,6 +105,7 @@ extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
>  extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
>  extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
>  extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
> +extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
>  extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
>  extern int cifs_lock(struct file *, int, struct file_lock *);
>  extern int cifs_fsync(struct file *, loff_t, loff_t, int);
> diff --git a/fs/cifs/file.c b/fs/cifs/file.c
> index daab878..1a41c04 100644
> --- a/fs/cifs/file.c
> +++ b/fs/cifs/file.c
> @@ -2524,6 +2524,55 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
>  }
>
>  static int
> +cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, struct cifs_aio_ctx *ctx)
> +{
> +       int wait_retry = 0;
> +       unsigned int wsize, credits;
> +       int rc;
> +       struct TCP_Server_Info *server = tlink_tcon(wdata->cfile->tlink)->ses->server;
> +
> +       /*
> +        * Try to resend this wdata, waiting for credits up to 3 seconds.
> +        * Note: we are attempting to resend the whole wdata not in segments
> +        */
> +       do {
> +               rc = server->ops->wait_mtu_credits(server, wdata->bytes, &wsize, &credits);
> +
> +               if (rc)
> +                       break;
> +
> +               if (wsize < wdata->bytes) {
> +                       add_credits_and_wake_if(server, credits, 0);
> +                       msleep(1000);
> +                       wait_retry++;
> +               }
> +       } while (wsize < wdata->bytes && wait_retry < 3);
> +
> +       if (wsize < wdata->bytes) {
> +               rc = -EBUSY;
> +               goto out;
> +       }
> +
> +       rc = -EAGAIN;
> +       while (rc == -EAGAIN)
> +               if (!wdata->cfile->invalidHandle ||
> +                   !(rc = cifs_reopen_file(wdata->cfile, false)))
> +                       rc = server->ops->async_writev(wdata,
> +                                       cifs_uncached_writedata_release);
> +
> +       if (!rc) {
> +               list_add_tail(&wdata->list, wdata_list);
> +               return 0;
> +       }
> +
> +       add_credits_and_wake_if(server, wdata->credits, 0);
> +out:
> +       kref_put(&wdata->refcount, cifs_uncached_writedata_release);
> +
> +       return rc;
> +}
> +
> +static int
>  cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
>                      struct cifsFileInfo *open_file,
>                      struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
> @@ -2537,6 +2586,8 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
>         loff_t saved_offset = offset;
>         pid_t pid;
>         struct TCP_Server_Info *server;
> +       struct page **pagevec;
> +       size_t start;
>
>         if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
>                 pid = open_file->pid;
> @@ -2553,38 +2604,74 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
>                 if (rc)
>                         break;
>
> -               nr_pages = get_numpages(wsize, len, &cur_len);
> -               wdata = cifs_writedata_alloc(nr_pages,
> +               if (ctx->direct_io) {
> +                       cur_len = iov_iter_get_pages_alloc(
> +                               from, &pagevec, wsize, &start);
> +                       if (cur_len < 0) {
> +                               cifs_dbg(VFS,
> +                                       "direct_writev couldn't get user pages "
> +                                       "(rc=%zd) iter type %d iov_offset %zd count"
> +                                       " %zd\n",
> +                                       cur_len, from->type,
> +                                       from->iov_offset, from->count);
> +                               dump_stack();
> +                               break;
> +                       }
> +                       iov_iter_advance(from, cur_len);
> +
> +                       nr_pages = (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
> +
> +                       wdata = cifs_writedata_direct_alloc(pagevec,
>                                              cifs_uncached_writev_complete);
> -               if (!wdata) {
> -                       rc = -ENOMEM;
> -                       add_credits_and_wake_if(server, credits, 0);
> -                       break;
> -               }
> +                       if (!wdata) {
> +                               rc = -ENOMEM;
> +                               add_credits_and_wake_if(server, credits, 0);
> +                               break;
> +                       }
>
> -               rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
> -               if (rc) {
> -                       kfree(wdata);
> -                       add_credits_and_wake_if(server, credits, 0);
> -                       break;
> -               }
>
> -               num_pages = nr_pages;
> -               rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
> -               if (rc) {
> -                       for (i = 0; i < nr_pages; i++)
> -                               put_page(wdata->pages[i]);
> -                       kfree(wdata);
> -                       add_credits_and_wake_if(server, credits, 0);
> -                       break;
> -               }
> +                       wdata->page_offset = start;
> +                       wdata->tailsz =
> +                               nr_pages > 1 ?
> +                                       cur_len - (PAGE_SIZE - start) -
> +                                       (nr_pages - 2) * PAGE_SIZE :
> +                                       cur_len;
> +               } else {
> +                       nr_pages = get_numpages(wsize, len, &cur_len);
> +                       wdata = cifs_writedata_alloc(nr_pages,
> +                                            cifs_uncached_writev_complete);
> +                       if (!wdata) {
> +                               rc = -ENOMEM;
> +                               add_credits_and_wake_if(server, credits, 0);
> +                               break;
> +                       }
>
> -               /*
> -                * Bring nr_pages down to the number of pages we actually used,
> -                * and free any pages that we didn't use.
> -                */
> -               for ( ; nr_pages > num_pages; nr_pages--)
> -                       put_page(wdata->pages[nr_pages - 1]);
> +                       rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
> +                       if (rc) {
> +                               kfree(wdata);
> +                               add_credits_and_wake_if(server, credits, 0);
> +                               break;
> +                       }
> +
> +                       num_pages = nr_pages;
> +                       rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
> +                       if (rc) {
> +                               for (i = 0; i < nr_pages; i++)
> +                                       put_page(wdata->pages[i]);
> +                               kfree(wdata);
> +                               add_credits_and_wake_if(server, credits, 0);
> +                               break;
> +                       }
> +
> +                       /*
> +                        * Bring nr_pages down to the number of pages we actually used,
> +                        * and free any pages that we didn't use.
> +                        */
> +                       for ( ; nr_pages > num_pages; nr_pages--)
> +                               put_page(wdata->pages[nr_pages - 1]);
> +
> +                       wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
> +               }
>
>                 wdata->sync_mode = WB_SYNC_ALL;
>                 wdata->nr_pages = nr_pages;
> @@ -2593,7 +2680,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
>                 wdata->pid = pid;
>                 wdata->bytes = cur_len;
>                 wdata->pagesz = PAGE_SIZE;
> -               wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
>                 wdata->credits = credits;
>                 wdata->ctx = ctx;
>                 kref_get(&ctx->refcount);
> @@ -2668,13 +2754,17 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
>                                 INIT_LIST_HEAD(&tmp_list);
>                                 list_del_init(&wdata->list);
>
> -                               iov_iter_advance(&tmp_from,
> +                               if (ctx->direct_io)
> +                                       rc = cifs_resend_wdata(wdata, &tmp_list, ctx);
> +                               else {
> +                                       iov_iter_advance(&tmp_from,
>                                                  wdata->offset - ctx->pos);
>
> -                               rc = cifs_write_from_iter(wdata->offset,
> +                                       rc = cifs_write_from_iter(wdata->offset,
>                                                 wdata->bytes, &tmp_from,
>                                                 ctx->cfile, cifs_sb, &tmp_list,
>                                                 ctx);
> +                               }
>
>                                 list_splice(&tmp_list, &ctx->list);
>
> @@ -2687,8 +2777,9 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
>                 kref_put(&wdata->refcount, cifs_uncached_writedata_release);
>         }
>
> -       for (i = 0; i < ctx->npages; i++)
> -               put_page(ctx->bv[i].bv_page);
> +       if (!ctx->direct_io)
> +               for (i = 0; i < ctx->npages; i++)
> +                       put_page(ctx->bv[i].bv_page);
>
>         cifs_stats_bytes_written(tcon, ctx->total_len);
>         set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
> @@ -2703,7 +2794,7 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
>                 complete(&ctx->done);
>  }
>
> -ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
> +static ssize_t __cifs_writev(struct kiocb *iocb, struct iov_iter *from, bool direct)
>  {
>         struct file *file = iocb->ki_filp;
>         ssize_t total_written = 0;
> @@ -2712,13 +2803,18 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
>         struct cifs_sb_info *cifs_sb;
>         struct cifs_aio_ctx *ctx;
>         struct iov_iter saved_from = *from;
> +       size_t len = iov_iter_count(from);
>         int rc;
>
>         /*
> -        * BB - optimize the way when signing is disabled. We can drop this
> -        * extra memory-to-memory copying and use iovec buffers for constructing
> -        * write request.
> +        * iov_iter_get_pages_alloc doesn't work with ITER_KVEC.
> +        * In this case, fall back to non-direct write function.
> +        * this could be improved by getting pages directly in ITER_KVEC
>          */
> +       if (direct && from->type & ITER_KVEC) {
> +               cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n");
> +               direct = false;
> +       }
>
>         rc = generic_write_checks(iocb, from);
>         if (rc <= 0)
> @@ -2742,10 +2838,16 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
>
>         ctx->pos = iocb->ki_pos;
>
> -       rc = setup_aio_ctx_iter(ctx, from, WRITE);
> -       if (rc) {
> -               kref_put(&ctx->refcount, cifs_aio_ctx_release);
> -               return rc;
> +       if (direct) {
> +               ctx->direct_io = true;
> +               ctx->iter = *from;
> +               ctx->len = len;
> +       } else {
> +               rc = setup_aio_ctx_iter(ctx, from, WRITE);
> +               if (rc) {
> +                       kref_put(&ctx->refcount, cifs_aio_ctx_release);
> +                       return rc;
> +               }
>         }
>
>         /* grab a lock here due to read response handlers can access ctx */
> @@ -2795,6 +2897,16 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
>         return total_written;
>  }
>
> +ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
> +{
> +       return __cifs_writev(iocb, from, true);
> +}
> +
> +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
> +{
> +       return __cifs_writev(iocb, from, false);
> +}
> +
>  static ssize_t
>  cifs_writev(struct kiocb *iocb, struct iov_iter *from)
>  {
> --
> 2.7.4
>

Did you measure the performance benefit of using O_DIRECT with your
patches for non-RDMA mode?

--
Best regards,
Pavel Shilovsky
Long Li Nov. 29, 2018, 2:19 a.m. | #2
> Subject: Re: [Patch v4 2/3] CIFS: Add support for direct I/O write
> 
> ср, 31 окт. 2018 г. в 15:26, Long Li <longli@linuxonhyperv.com>:
> >
> > From: Long Li <longli@microsoft.com>
> >
> > With direct I/O write, user supplied buffers are pinned to the memory
> > and data are transferred directly from user buffers to the transport layer.
> >
> > Change in v3: add support for kernel AIO
> >
> > Change in v4:
> > Refactor common write code to __cifs_writev for direct and non-direct I/O.
> > Retry on direct I/O failure.
> >
> > Signed-off-by: Long Li <longli@microsoft.com>
> > ---
> >  fs/cifs/cifsfs.h |   1 +
> >  fs/cifs/file.c   | 194 +++++++++++++++++++++++++++++++++++++++++++----
> --------
> >  2 files changed, 154 insertions(+), 41 deletions(-)
> >
> > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > 7fba9aa..e9c5103 100644
> > --- a/fs/cifs/cifsfs.h
> > +++ b/fs/cifs/cifsfs.h
> > @@ -105,6 +105,7 @@ extern ssize_t cifs_user_readv(struct kiocb *iocb,
> > struct iov_iter *to);  extern ssize_t cifs_direct_readv(struct kiocb
> > *iocb, struct iov_iter *to);  extern ssize_t cifs_strict_readv(struct
> > kiocb *iocb, struct iov_iter *to);  extern ssize_t
> > cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
> > +extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter
> > +*from);
> >  extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter
> > *from);  extern int cifs_lock(struct file *, int, struct file_lock *);
> > extern int cifs_fsync(struct file *, loff_t, loff_t, int); diff --git
> > a/fs/cifs/file.c b/fs/cifs/file.c index daab878..1a41c04 100644
> > --- a/fs/cifs/file.c
> > +++ b/fs/cifs/file.c
> > @@ -2524,6 +2524,55 @@ wdata_fill_from_iovec(struct cifs_writedata
> > *wdata, struct iov_iter *from,  }
> >
> >  static int
> > +cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head
> > +*wdata_list, struct cifs_aio_ctx *ctx) {
> > +       int wait_retry = 0;
> > +       unsigned int wsize, credits;
> > +       int rc;
> > +       struct TCP_Server_Info *server =
> > +tlink_tcon(wdata->cfile->tlink)->ses->server;
> > +
> > +       /*
> > +        * Try to resend this wdata, waiting for credits up to 3 seconds.
> > +        * Note: we are attempting to resend the whole wdata not in
> segments
> > +        */
> > +       do {
> > +               rc = server->ops->wait_mtu_credits(server,
> > + wdata->bytes, &wsize, &credits);
> > +
> > +               if (rc)
> > +                       break;
> > +
> > +               if (wsize < wdata->bytes) {
> > +                       add_credits_and_wake_if(server, credits, 0);
> > +                       msleep(1000);
> > +                       wait_retry++;
> > +               }
> > +       } while (wsize < wdata->bytes && wait_retry < 3);
> > +
> > +       if (wsize < wdata->bytes) {
> > +               rc = -EBUSY;
> > +               goto out;
> > +       }
> > +
> > +       rc = -EAGAIN;
> > +       while (rc == -EAGAIN)
> > +               if (!wdata->cfile->invalidHandle ||
> > +                   !(rc = cifs_reopen_file(wdata->cfile, false)))
> > +                       rc = server->ops->async_writev(wdata,
> > +
> > + cifs_uncached_writedata_release);
> > +
> > +       if (!rc) {
> > +               list_add_tail(&wdata->list, wdata_list);
> > +               return 0;
> > +       }
> > +
> > +       add_credits_and_wake_if(server, wdata->credits, 0);
> > +out:
> > +       kref_put(&wdata->refcount, cifs_uncached_writedata_release);
> > +
> > +       return rc;
> > +}
> > +
> > +static int
> >  cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
> >                      struct cifsFileInfo *open_file,
> >                      struct cifs_sb_info *cifs_sb, struct list_head
> > *wdata_list, @@ -2537,6 +2586,8 @@ cifs_write_from_iter(loff_t offset,
> size_t len, struct iov_iter *from,
> >         loff_t saved_offset = offset;
> >         pid_t pid;
> >         struct TCP_Server_Info *server;
> > +       struct page **pagevec;
> > +       size_t start;
> >
> >         if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
> >                 pid = open_file->pid;
> > @@ -2553,38 +2604,74 @@ cifs_write_from_iter(loff_t offset, size_t len,
> struct iov_iter *from,
> >                 if (rc)
> >                         break;
> >
> > -               nr_pages = get_numpages(wsize, len, &cur_len);
> > -               wdata = cifs_writedata_alloc(nr_pages,
> > +               if (ctx->direct_io) {
> > +                       cur_len = iov_iter_get_pages_alloc(
> > +                               from, &pagevec, wsize, &start);
> > +                       if (cur_len < 0) {
> > +                               cifs_dbg(VFS,
> > +                                       "direct_writev couldn't get user pages "
> > +                                       "(rc=%zd) iter type %d iov_offset %zd count"
> > +                                       " %zd\n",
> > +                                       cur_len, from->type,
> > +                                       from->iov_offset, from->count);
> > +                               dump_stack();
> > +                               break;
> > +                       }
> > +                       iov_iter_advance(from, cur_len);
> > +
> > +                       nr_pages = (cur_len + start + PAGE_SIZE - 1) /
> > + PAGE_SIZE;
> > +
> > +                       wdata = cifs_writedata_direct_alloc(pagevec,
> >                                              cifs_uncached_writev_complete);
> > -               if (!wdata) {
> > -                       rc = -ENOMEM;
> > -                       add_credits_and_wake_if(server, credits, 0);
> > -                       break;
> > -               }
> > +                       if (!wdata) {
> > +                               rc = -ENOMEM;
> > +                               add_credits_and_wake_if(server, credits, 0);
> > +                               break;
> > +                       }
> >
> > -               rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
> > -               if (rc) {
> > -                       kfree(wdata);
> > -                       add_credits_and_wake_if(server, credits, 0);
> > -                       break;
> > -               }
> >
> > -               num_pages = nr_pages;
> > -               rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
> > -               if (rc) {
> > -                       for (i = 0; i < nr_pages; i++)
> > -                               put_page(wdata->pages[i]);
> > -                       kfree(wdata);
> > -                       add_credits_and_wake_if(server, credits, 0);
> > -                       break;
> > -               }
> > +                       wdata->page_offset = start;
> > +                       wdata->tailsz =
> > +                               nr_pages > 1 ?
> > +                                       cur_len - (PAGE_SIZE - start) -
> > +                                       (nr_pages - 2) * PAGE_SIZE :
> > +                                       cur_len;
> > +               } else {
> > +                       nr_pages = get_numpages(wsize, len, &cur_len);
> > +                       wdata = cifs_writedata_alloc(nr_pages,
> > +                                            cifs_uncached_writev_complete);
> > +                       if (!wdata) {
> > +                               rc = -ENOMEM;
> > +                               add_credits_and_wake_if(server, credits, 0);
> > +                               break;
> > +                       }
> >
> > -               /*
> > -                * Bring nr_pages down to the number of pages we actually used,
> > -                * and free any pages that we didn't use.
> > -                */
> > -               for ( ; nr_pages > num_pages; nr_pages--)
> > -                       put_page(wdata->pages[nr_pages - 1]);
> > +                       rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
> > +                       if (rc) {
> > +                               kfree(wdata);
> > +                               add_credits_and_wake_if(server, credits, 0);
> > +                               break;
> > +                       }
> > +
> > +                       num_pages = nr_pages;
> > +                       rc = wdata_fill_from_iovec(wdata, from, &cur_len,
> &num_pages);
> > +                       if (rc) {
> > +                               for (i = 0; i < nr_pages; i++)
> > +                                       put_page(wdata->pages[i]);
> > +                               kfree(wdata);
> > +                               add_credits_and_wake_if(server, credits, 0);
> > +                               break;
> > +                       }
> > +
> > +                       /*
> > +                        * Bring nr_pages down to the number of pages we actually
> used,
> > +                        * and free any pages that we didn't use.
> > +                        */
> > +                       for ( ; nr_pages > num_pages; nr_pages--)
> > +                               put_page(wdata->pages[nr_pages - 1]);
> > +
> > +                       wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
> > +               }
> >
> >                 wdata->sync_mode = WB_SYNC_ALL;
> >                 wdata->nr_pages = nr_pages; @@ -2593,7 +2680,6 @@
> > cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
> >                 wdata->pid = pid;
> >                 wdata->bytes = cur_len;
> >                 wdata->pagesz = PAGE_SIZE;
> > -               wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
> >                 wdata->credits = credits;
> >                 wdata->ctx = ctx;
> >                 kref_get(&ctx->refcount); @@ -2668,13 +2754,17 @@
> > static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
> >                                 INIT_LIST_HEAD(&tmp_list);
> >                                 list_del_init(&wdata->list);
> >
> > -                               iov_iter_advance(&tmp_from,
> > +                               if (ctx->direct_io)
> > +                                       rc = cifs_resend_wdata(wdata, &tmp_list, ctx);
> > +                               else {
> > +                                       iov_iter_advance(&tmp_from,
> >                                                  wdata->offset -
> > ctx->pos);
> >
> > -                               rc = cifs_write_from_iter(wdata->offset,
> > +                                       rc =
> > + cifs_write_from_iter(wdata->offset,
> >                                                 wdata->bytes, &tmp_from,
> >                                                 ctx->cfile, cifs_sb, &tmp_list,
> >                                                 ctx);
> > +                               }
> >
> >                                 list_splice(&tmp_list, &ctx->list);
> >
> > @@ -2687,8 +2777,9 @@ static void collect_uncached_write_data(struct
> cifs_aio_ctx *ctx)
> >                 kref_put(&wdata->refcount, cifs_uncached_writedata_release);
> >         }
> >
> > -       for (i = 0; i < ctx->npages; i++)
> > -               put_page(ctx->bv[i].bv_page);
> > +       if (!ctx->direct_io)
> > +               for (i = 0; i < ctx->npages; i++)
> > +                       put_page(ctx->bv[i].bv_page);
> >
> >         cifs_stats_bytes_written(tcon, ctx->total_len);
> >         set_bit(CIFS_INO_INVALID_MAPPING,
> > &CIFS_I(dentry->d_inode)->flags); @@ -2703,7 +2794,7 @@ static void
> collect_uncached_write_data(struct cifs_aio_ctx *ctx)
> >                 complete(&ctx->done);
> >  }
> >
> > -ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
> > +static ssize_t __cifs_writev(struct kiocb *iocb, struct iov_iter
> > +*from, bool direct)
> >  {
> >         struct file *file = iocb->ki_filp;
> >         ssize_t total_written = 0;
> > @@ -2712,13 +2803,18 @@ ssize_t cifs_user_writev(struct kiocb *iocb,
> struct iov_iter *from)
> >         struct cifs_sb_info *cifs_sb;
> >         struct cifs_aio_ctx *ctx;
> >         struct iov_iter saved_from = *from;
> > +       size_t len = iov_iter_count(from);
> >         int rc;
> >
> >         /*
> > -        * BB - optimize the way when signing is disabled. We can drop this
> > -        * extra memory-to-memory copying and use iovec buffers for
> constructing
> > -        * write request.
> > +        * iov_iter_get_pages_alloc doesn't work with ITER_KVEC.
> > +        * In this case, fall back to non-direct write function.
> > +        * this could be improved by getting pages directly in
> > + ITER_KVEC
> >          */
> > +       if (direct && from->type & ITER_KVEC) {
> > +               cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n");
> > +               direct = false;
> > +       }
> >
> >         rc = generic_write_checks(iocb, from);
> >         if (rc <= 0)
> > @@ -2742,10 +2838,16 @@ ssize_t cifs_user_writev(struct kiocb *iocb,
> > struct iov_iter *from)
> >
> >         ctx->pos = iocb->ki_pos;
> >
> > -       rc = setup_aio_ctx_iter(ctx, from, WRITE);
> > -       if (rc) {
> > -               kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > -               return rc;
> > +       if (direct) {
> > +               ctx->direct_io = true;
> > +               ctx->iter = *from;
> > +               ctx->len = len;
> > +       } else {
> > +               rc = setup_aio_ctx_iter(ctx, from, WRITE);
> > +               if (rc) {
> > +                       kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > +                       return rc;
> > +               }
> >         }
> >
> >         /* grab a lock here due to read response handlers can access
> > ctx */ @@ -2795,6 +2897,16 @@ ssize_t cifs_user_writev(struct kiocb
> *iocb, struct iov_iter *from)
> >         return total_written;
> >  }
> >
> > +ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
> > +{
> > +       return __cifs_writev(iocb, from, true); }
> > +
> > +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) {
> > +       return __cifs_writev(iocb, from, false); }
> > +
> >  static ssize_t
> >  cifs_writev(struct kiocb *iocb, struct iov_iter *from)  {
> > --
> > 2.7.4
> >
> 
> Did you measure the performance benefit of using O_DIRECT with your
> patches for non-RDMA mode?

Here are some of the results measured on TCP/IP over IPoIB 40G Infiniband. Please note that IPoIB is done in software so it's slower than a 40G Ethernet NIC.

All servers are running 2 X Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz. Tested on FIO 64k read or write with queue depths 1 or 16. All the tests are running with 1 FIO job, and with "direct=1".

Without direct I/O:
read 64k d1	113669KB/s
read 64k d16	618428KB/s
write 64k d1	134911KB/s
write 64k d16	457005KB/s

With direct I/O:
read 64k d1	127134KB/s
read 64k d16	714629KB/s
write 64k d1	129268KB/s
write 64k d16	461054KB/s

Direct I/O improvement:
read 64k d1	11%
read 64k d16	15%
write 64k d1	-5%
write 64k d16	1%



> 
> --
> Best regards,
> Pavel Shilovsky
Pavel Shilovsky Nov. 29, 2018, 6:31 p.m. | #3
ср, 28 нояб. 2018 г. в 18:20, Long Li <longli@microsoft.com>:
>
> > Subject: Re: [Patch v4 2/3] CIFS: Add support for direct I/O write
> >
> > ср, 31 окт. 2018 г. в 15:26, Long Li <longli@linuxonhyperv.com>:
> > >
> > > From: Long Li <longli@microsoft.com>
> > >
> > > With direct I/O write, user supplied buffers are pinned to the memory
> > > and data are transferred directly from user buffers to the transport layer.
> > >
> > > Change in v3: add support for kernel AIO
> > >
> > > Change in v4:
> > > Refactor common write code to __cifs_writev for direct and non-direct I/O.
> > > Retry on direct I/O failure.
> > >
> > > Signed-off-by: Long Li <longli@microsoft.com>
> > > ---
> > >  fs/cifs/cifsfs.h |   1 +
> > >  fs/cifs/file.c   | 194 +++++++++++++++++++++++++++++++++++++++++++----
> > --------
> > >  2 files changed, 154 insertions(+), 41 deletions(-)
> > >
> > > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > > 7fba9aa..e9c5103 100644
> > > --- a/fs/cifs/cifsfs.h
> > > +++ b/fs/cifs/cifsfs.h
> > > @@ -105,6 +105,7 @@ extern ssize_t cifs_user_readv(struct kiocb *iocb,
> > > struct iov_iter *to);  extern ssize_t cifs_direct_readv(struct kiocb
> > > *iocb, struct iov_iter *to);  extern ssize_t cifs_strict_readv(struct
> > > kiocb *iocb, struct iov_iter *to);  extern ssize_t
> > > cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
> > > +extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter
> > > +*from);
> > >  extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter
> > > *from);  extern int cifs_lock(struct file *, int, struct file_lock *);
> > > extern int cifs_fsync(struct file *, loff_t, loff_t, int); diff --git
> > > a/fs/cifs/file.c b/fs/cifs/file.c index daab878..1a41c04 100644
> > > --- a/fs/cifs/file.c
> > > +++ b/fs/cifs/file.c
> > > @@ -2524,6 +2524,55 @@ wdata_fill_from_iovec(struct cifs_writedata
> > > *wdata, struct iov_iter *from,  }
> > >
> > >  static int
> > > +cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head
> > > +*wdata_list, struct cifs_aio_ctx *ctx) {
> > > +       int wait_retry = 0;
> > > +       unsigned int wsize, credits;
> > > +       int rc;
> > > +       struct TCP_Server_Info *server =
> > > +tlink_tcon(wdata->cfile->tlink)->ses->server;
> > > +
> > > +       /*
> > > +        * Try to resend this wdata, waiting for credits up to 3 seconds.
> > > +        * Note: we are attempting to resend the whole wdata not in
> > segments
> > > +        */
> > > +       do {
> > > +               rc = server->ops->wait_mtu_credits(server,
> > > + wdata->bytes, &wsize, &credits);
> > > +
> > > +               if (rc)
> > > +                       break;
> > > +
> > > +               if (wsize < wdata->bytes) {
> > > +                       add_credits_and_wake_if(server, credits, 0);
> > > +                       msleep(1000);
> > > +                       wait_retry++;
> > > +               }
> > > +       } while (wsize < wdata->bytes && wait_retry < 3);
> > > +
> > > +       if (wsize < wdata->bytes) {
> > > +               rc = -EBUSY;
> > > +               goto out;
> > > +       }
> > > +
> > > +       rc = -EAGAIN;
> > > +       while (rc == -EAGAIN)
> > > +               if (!wdata->cfile->invalidHandle ||
> > > +                   !(rc = cifs_reopen_file(wdata->cfile, false)))
> > > +                       rc = server->ops->async_writev(wdata,
> > > +
> > > + cifs_uncached_writedata_release);
> > > +
> > > +       if (!rc) {
> > > +               list_add_tail(&wdata->list, wdata_list);
> > > +               return 0;
> > > +       }
> > > +
> > > +       add_credits_and_wake_if(server, wdata->credits, 0);
> > > +out:
> > > +       kref_put(&wdata->refcount, cifs_uncached_writedata_release);
> > > +
> > > +       return rc;
> > > +}
> > > +
> > > +static int
> > >  cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
> > >                      struct cifsFileInfo *open_file,
> > >                      struct cifs_sb_info *cifs_sb, struct list_head
> > > *wdata_list, @@ -2537,6 +2586,8 @@ cifs_write_from_iter(loff_t offset,
> > size_t len, struct iov_iter *from,
> > >         loff_t saved_offset = offset;
> > >         pid_t pid;
> > >         struct TCP_Server_Info *server;
> > > +       struct page **pagevec;
> > > +       size_t start;
> > >
> > >         if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
> > >                 pid = open_file->pid;
> > > @@ -2553,38 +2604,74 @@ cifs_write_from_iter(loff_t offset, size_t len,
> > struct iov_iter *from,
> > >                 if (rc)
> > >                         break;
> > >
> > > -               nr_pages = get_numpages(wsize, len, &cur_len);
> > > -               wdata = cifs_writedata_alloc(nr_pages,
> > > +               if (ctx->direct_io) {
> > > +                       cur_len = iov_iter_get_pages_alloc(
> > > +                               from, &pagevec, wsize, &start);
> > > +                       if (cur_len < 0) {
> > > +                               cifs_dbg(VFS,
> > > +                                       "direct_writev couldn't get user pages "
> > > +                                       "(rc=%zd) iter type %d iov_offset %zd count"
> > > +                                       " %zd\n",
> > > +                                       cur_len, from->type,
> > > +                                       from->iov_offset, from->count);
> > > +                               dump_stack();
> > > +                               break;
> > > +                       }
> > > +                       iov_iter_advance(from, cur_len);
> > > +
> > > +                       nr_pages = (cur_len + start + PAGE_SIZE - 1) /
> > > + PAGE_SIZE;
> > > +
> > > +                       wdata = cifs_writedata_direct_alloc(pagevec,
> > >                                              cifs_uncached_writev_complete);
> > > -               if (!wdata) {
> > > -                       rc = -ENOMEM;
> > > -                       add_credits_and_wake_if(server, credits, 0);
> > > -                       break;
> > > -               }
> > > +                       if (!wdata) {
> > > +                               rc = -ENOMEM;
> > > +                               add_credits_and_wake_if(server, credits, 0);
> > > +                               break;
> > > +                       }
> > >
> > > -               rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
> > > -               if (rc) {
> > > -                       kfree(wdata);
> > > -                       add_credits_and_wake_if(server, credits, 0);
> > > -                       break;
> > > -               }
> > >
> > > -               num_pages = nr_pages;
> > > -               rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
> > > -               if (rc) {
> > > -                       for (i = 0; i < nr_pages; i++)
> > > -                               put_page(wdata->pages[i]);
> > > -                       kfree(wdata);
> > > -                       add_credits_and_wake_if(server, credits, 0);
> > > -                       break;
> > > -               }
> > > +                       wdata->page_offset = start;
> > > +                       wdata->tailsz =
> > > +                               nr_pages > 1 ?
> > > +                                       cur_len - (PAGE_SIZE - start) -
> > > +                                       (nr_pages - 2) * PAGE_SIZE :
> > > +                                       cur_len;
> > > +               } else {
> > > +                       nr_pages = get_numpages(wsize, len, &cur_len);
> > > +                       wdata = cifs_writedata_alloc(nr_pages,
> > > +                                            cifs_uncached_writev_complete);
> > > +                       if (!wdata) {
> > > +                               rc = -ENOMEM;
> > > +                               add_credits_and_wake_if(server, credits, 0);
> > > +                               break;
> > > +                       }
> > >
> > > -               /*
> > > -                * Bring nr_pages down to the number of pages we actually used,
> > > -                * and free any pages that we didn't use.
> > > -                */
> > > -               for ( ; nr_pages > num_pages; nr_pages--)
> > > -                       put_page(wdata->pages[nr_pages - 1]);
> > > +                       rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
> > > +                       if (rc) {
> > > +                               kfree(wdata);
> > > +                               add_credits_and_wake_if(server, credits, 0);
> > > +                               break;
> > > +                       }
> > > +
> > > +                       num_pages = nr_pages;
> > > +                       rc = wdata_fill_from_iovec(wdata, from, &cur_len,
> > &num_pages);
> > > +                       if (rc) {
> > > +                               for (i = 0; i < nr_pages; i++)
> > > +                                       put_page(wdata->pages[i]);
> > > +                               kfree(wdata);
> > > +                               add_credits_and_wake_if(server, credits, 0);
> > > +                               break;
> > > +                       }
> > > +
> > > +                       /*
> > > +                        * Bring nr_pages down to the number of pages we actually
> > used,
> > > +                        * and free any pages that we didn't use.
> > > +                        */
> > > +                       for ( ; nr_pages > num_pages; nr_pages--)
> > > +                               put_page(wdata->pages[nr_pages - 1]);
> > > +
> > > +                       wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
> > > +               }
> > >
> > >                 wdata->sync_mode = WB_SYNC_ALL;
> > >                 wdata->nr_pages = nr_pages; @@ -2593,7 +2680,6 @@
> > > cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
> > >                 wdata->pid = pid;
> > >                 wdata->bytes = cur_len;
> > >                 wdata->pagesz = PAGE_SIZE;
> > > -               wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
> > >                 wdata->credits = credits;
> > >                 wdata->ctx = ctx;
> > >                 kref_get(&ctx->refcount); @@ -2668,13 +2754,17 @@
> > > static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
> > >                                 INIT_LIST_HEAD(&tmp_list);
> > >                                 list_del_init(&wdata->list);
> > >
> > > -                               iov_iter_advance(&tmp_from,
> > > +                               if (ctx->direct_io)
> > > +                                       rc = cifs_resend_wdata(wdata, &tmp_list, ctx);
> > > +                               else {
> > > +                                       iov_iter_advance(&tmp_from,
> > >                                                  wdata->offset -
> > > ctx->pos);
> > >
> > > -                               rc = cifs_write_from_iter(wdata->offset,
> > > +                                       rc =
> > > + cifs_write_from_iter(wdata->offset,
> > >                                                 wdata->bytes, &tmp_from,
> > >                                                 ctx->cfile, cifs_sb, &tmp_list,
> > >                                                 ctx);
> > > +                               }
> > >
> > >                                 list_splice(&tmp_list, &ctx->list);
> > >
> > > @@ -2687,8 +2777,9 @@ static void collect_uncached_write_data(struct
> > cifs_aio_ctx *ctx)
> > >                 kref_put(&wdata->refcount, cifs_uncached_writedata_release);
> > >         }
> > >
> > > -       for (i = 0; i < ctx->npages; i++)
> > > -               put_page(ctx->bv[i].bv_page);
> > > +       if (!ctx->direct_io)
> > > +               for (i = 0; i < ctx->npages; i++)
> > > +                       put_page(ctx->bv[i].bv_page);
> > >
> > >         cifs_stats_bytes_written(tcon, ctx->total_len);
> > >         set_bit(CIFS_INO_INVALID_MAPPING,
> > > &CIFS_I(dentry->d_inode)->flags); @@ -2703,7 +2794,7 @@ static void
> > collect_uncached_write_data(struct cifs_aio_ctx *ctx)
> > >                 complete(&ctx->done);
> > >  }
> > >
> > > -ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
> > > +static ssize_t __cifs_writev(struct kiocb *iocb, struct iov_iter
> > > +*from, bool direct)
> > >  {
> > >         struct file *file = iocb->ki_filp;
> > >         ssize_t total_written = 0;
> > > @@ -2712,13 +2803,18 @@ ssize_t cifs_user_writev(struct kiocb *iocb,
> > struct iov_iter *from)
> > >         struct cifs_sb_info *cifs_sb;
> > >         struct cifs_aio_ctx *ctx;
> > >         struct iov_iter saved_from = *from;
> > > +       size_t len = iov_iter_count(from);
> > >         int rc;
> > >
> > >         /*
> > > -        * BB - optimize the way when signing is disabled. We can drop this
> > > -        * extra memory-to-memory copying and use iovec buffers for
> > constructing
> > > -        * write request.
> > > +        * iov_iter_get_pages_alloc doesn't work with ITER_KVEC.
> > > +        * In this case, fall back to non-direct write function.
> > > +        * this could be improved by getting pages directly in
> > > + ITER_KVEC
> > >          */
> > > +       if (direct && from->type & ITER_KVEC) {
> > > +               cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n");
> > > +               direct = false;
> > > +       }
> > >
> > >         rc = generic_write_checks(iocb, from);
> > >         if (rc <= 0)
> > > @@ -2742,10 +2838,16 @@ ssize_t cifs_user_writev(struct kiocb *iocb,
> > > struct iov_iter *from)
> > >
> > >         ctx->pos = iocb->ki_pos;
> > >
> > > -       rc = setup_aio_ctx_iter(ctx, from, WRITE);
> > > -       if (rc) {
> > > -               kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > -               return rc;
> > > +       if (direct) {
> > > +               ctx->direct_io = true;
> > > +               ctx->iter = *from;
> > > +               ctx->len = len;
> > > +       } else {
> > > +               rc = setup_aio_ctx_iter(ctx, from, WRITE);
> > > +               if (rc) {
> > > +                       kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > +                       return rc;
> > > +               }
> > >         }
> > >
> > >         /* grab a lock here due to read response handlers can access
> > > ctx */ @@ -2795,6 +2897,16 @@ ssize_t cifs_user_writev(struct kiocb
> > *iocb, struct iov_iter *from)
> > >         return total_written;
> > >  }
> > >
> > > +ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
> > > +{
> > > +       return __cifs_writev(iocb, from, true); }
> > > +
> > > +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) {
> > > +       return __cifs_writev(iocb, from, false); }
> > > +
> > >  static ssize_t
> > >  cifs_writev(struct kiocb *iocb, struct iov_iter *from)  {
> > > --
> > > 2.7.4
> > >
> >
> > Did you measure the performance benefit of using O_DIRECT with your
> > patches for non-RDMA mode?
>
> Here are some of the results measured on TCP/IP over IPoIB 40G Infiniband. Please note that IPoIB is done in software so it's slower than a 40G Ethernet NIC.
>
> All servers are running 2 X Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz. Tested on FIO 64k read or write with queue depths 1 or 16. All the tests are running with 1 FIO job, and with "direct=1".
>
> Without direct I/O:
> read 64k d1     113669KB/s
> read 64k d16    618428KB/s
> write 64k d1    134911KB/s
> write 64k d16   457005KB/s
>
> With direct I/O:
> read 64k d1     127134KB/s
> read 64k d16    714629KB/s
> write 64k d1    129268KB/s
> write 64k d16   461054KB/s
>
> Direct I/O improvement:
> read 64k d1     11%
> read 64k d16    15%
> write 64k d1    -5%
                      ^^^
This is strange. Is it consistent across multiple runs?

> write 64k d16   1%

Good read performance improvements overall and it seems write
performance results need more investigations.

--
Best regards,
Pavel Shilovsky
Long Li Nov. 29, 2018, 9:29 p.m. | #4
> Subject: Re: [Patch v4 2/3] CIFS: Add support for direct I/O write
> 
> ср, 28 нояб. 2018 г. в 18:20, Long Li <longli@microsoft.com>:
> >
> > > Subject: Re: [Patch v4 2/3] CIFS: Add support for direct I/O write
> > >
> > > ср, 31 окт. 2018 г. в 15:26, Long Li <longli@linuxonhyperv.com>:
> > > >
> > > > From: Long Li <longli@microsoft.com>
> > > >
> > > > With direct I/O write, user supplied buffers are pinned to the
> > > > memory and data are transferred directly from user buffers to the
> transport layer.
> > > >
> > > > Change in v3: add support for kernel AIO
> > > >
> > > > Change in v4:
> > > > Refactor common write code to __cifs_writev for direct and non-direct
> I/O.
> > > > Retry on direct I/O failure.
> > > >
> > > > Signed-off-by: Long Li <longli@microsoft.com>
> > > > ---
> > > >  fs/cifs/cifsfs.h |   1 +
> > > >  fs/cifs/file.c   | 194
> +++++++++++++++++++++++++++++++++++++++++++----
> > > --------
> > > >  2 files changed, 154 insertions(+), 41 deletions(-)
> > > >
> > > > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > > > 7fba9aa..e9c5103 100644
> > > > --- a/fs/cifs/cifsfs.h
> > > > +++ b/fs/cifs/cifsfs.h
> > > > @@ -105,6 +105,7 @@ extern ssize_t cifs_user_readv(struct kiocb
> > > > *iocb, struct iov_iter *to);  extern ssize_t
> > > > cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
> > > > extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct
> > > > iov_iter *to);  extern ssize_t cifs_user_writev(struct kiocb
> > > > *iocb, struct iov_iter *from);
> > > > +extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct
> > > > +iov_iter *from);
> > > >  extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct
> > > > iov_iter *from);  extern int cifs_lock(struct file *, int, struct
> > > > file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t,
> > > > int); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index
> > > > daab878..1a41c04 100644
> > > > --- a/fs/cifs/file.c
> > > > +++ b/fs/cifs/file.c
> > > > @@ -2524,6 +2524,55 @@ wdata_fill_from_iovec(struct cifs_writedata
> > > > *wdata, struct iov_iter *from,  }
> > > >
> > > >  static int
> > > > +cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head
> > > > +*wdata_list, struct cifs_aio_ctx *ctx) {
> > > > +       int wait_retry = 0;
> > > > +       unsigned int wsize, credits;
> > > > +       int rc;
> > > > +       struct TCP_Server_Info *server =
> > > > +tlink_tcon(wdata->cfile->tlink)->ses->server;
> > > > +
> > > > +       /*
> > > > +        * Try to resend this wdata, waiting for credits up to 3 seconds.
> > > > +        * Note: we are attempting to resend the whole wdata not
> > > > + in
> > > segments
> > > > +        */
> > > > +       do {
> > > > +               rc = server->ops->wait_mtu_credits(server,
> > > > + wdata->bytes, &wsize, &credits);
> > > > +
> > > > +               if (rc)
> > > > +                       break;
> > > > +
> > > > +               if (wsize < wdata->bytes) {
> > > > +                       add_credits_and_wake_if(server, credits, 0);
> > > > +                       msleep(1000);
> > > > +                       wait_retry++;
> > > > +               }
> > > > +       } while (wsize < wdata->bytes && wait_retry < 3);
> > > > +
> > > > +       if (wsize < wdata->bytes) {
> > > > +               rc = -EBUSY;
> > > > +               goto out;
> > > > +       }
> > > > +
> > > > +       rc = -EAGAIN;
> > > > +       while (rc == -EAGAIN)
> > > > +               if (!wdata->cfile->invalidHandle ||
> > > > +                   !(rc = cifs_reopen_file(wdata->cfile, false)))
> > > > +                       rc = server->ops->async_writev(wdata,
> > > > +
> > > > + cifs_uncached_writedata_release);
> > > > +
> > > > +       if (!rc) {
> > > > +               list_add_tail(&wdata->list, wdata_list);
> > > > +               return 0;
> > > > +       }
> > > > +
> > > > +       add_credits_and_wake_if(server, wdata->credits, 0);
> > > > +out:
> > > > +       kref_put(&wdata->refcount,
> > > > +cifs_uncached_writedata_release);
> > > > +
> > > > +       return rc;
> > > > +}
> > > > +
> > > > +static int
> > > >  cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
> > > >                      struct cifsFileInfo *open_file,
> > > >                      struct cifs_sb_info *cifs_sb, struct
> > > > list_head *wdata_list, @@ -2537,6 +2586,8 @@
> > > > cifs_write_from_iter(loff_t offset,
> > > size_t len, struct iov_iter *from,
> > > >         loff_t saved_offset = offset;
> > > >         pid_t pid;
> > > >         struct TCP_Server_Info *server;
> > > > +       struct page **pagevec;
> > > > +       size_t start;
> > > >
> > > >         if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
> > > >                 pid = open_file->pid; @@ -2553,38 +2604,74 @@
> > > > cifs_write_from_iter(loff_t offset, size_t len,
> > > struct iov_iter *from,
> > > >                 if (rc)
> > > >                         break;
> > > >
> > > > -               nr_pages = get_numpages(wsize, len, &cur_len);
> > > > -               wdata = cifs_writedata_alloc(nr_pages,
> > > > +               if (ctx->direct_io) {
> > > > +                       cur_len = iov_iter_get_pages_alloc(
> > > > +                               from, &pagevec, wsize, &start);
> > > > +                       if (cur_len < 0) {
> > > > +                               cifs_dbg(VFS,
> > > > +                                       "direct_writev couldn't get user pages "
> > > > +                                       "(rc=%zd) iter type %d iov_offset %zd count"
> > > > +                                       " %zd\n",
> > > > +                                       cur_len, from->type,
> > > > +                                       from->iov_offset, from->count);
> > > > +                               dump_stack();
> > > > +                               break;
> > > > +                       }
> > > > +                       iov_iter_advance(from, cur_len);
> > > > +
> > > > +                       nr_pages = (cur_len + start + PAGE_SIZE -
> > > > + 1) / PAGE_SIZE;
> > > > +
> > > > +                       wdata =
> > > > + cifs_writedata_direct_alloc(pagevec,
> > > >                                              cifs_uncached_writev_complete);
> > > > -               if (!wdata) {
> > > > -                       rc = -ENOMEM;
> > > > -                       add_credits_and_wake_if(server, credits, 0);
> > > > -                       break;
> > > > -               }
> > > > +                       if (!wdata) {
> > > > +                               rc = -ENOMEM;
> > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > +                               break;
> > > > +                       }
> > > >
> > > > -               rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
> > > > -               if (rc) {
> > > > -                       kfree(wdata);
> > > > -                       add_credits_and_wake_if(server, credits, 0);
> > > > -                       break;
> > > > -               }
> > > >
> > > > -               num_pages = nr_pages;
> > > > -               rc = wdata_fill_from_iovec(wdata, from, &cur_len,
> &num_pages);
> > > > -               if (rc) {
> > > > -                       for (i = 0; i < nr_pages; i++)
> > > > -                               put_page(wdata->pages[i]);
> > > > -                       kfree(wdata);
> > > > -                       add_credits_and_wake_if(server, credits, 0);
> > > > -                       break;
> > > > -               }
> > > > +                       wdata->page_offset = start;
> > > > +                       wdata->tailsz =
> > > > +                               nr_pages > 1 ?
> > > > +                                       cur_len - (PAGE_SIZE - start) -
> > > > +                                       (nr_pages - 2) * PAGE_SIZE :
> > > > +                                       cur_len;
> > > > +               } else {
> > > > +                       nr_pages = get_numpages(wsize, len, &cur_len);
> > > > +                       wdata = cifs_writedata_alloc(nr_pages,
> > > > +                                            cifs_uncached_writev_complete);
> > > > +                       if (!wdata) {
> > > > +                               rc = -ENOMEM;
> > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > +                               break;
> > > > +                       }
> > > >
> > > > -               /*
> > > > -                * Bring nr_pages down to the number of pages we actually
> used,
> > > > -                * and free any pages that we didn't use.
> > > > -                */
> > > > -               for ( ; nr_pages > num_pages; nr_pages--)
> > > > -                       put_page(wdata->pages[nr_pages - 1]);
> > > > +                       rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
> > > > +                       if (rc) {
> > > > +                               kfree(wdata);
> > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > +                               break;
> > > > +                       }
> > > > +
> > > > +                       num_pages = nr_pages;
> > > > +                       rc = wdata_fill_from_iovec(wdata, from,
> > > > + &cur_len,
> > > &num_pages);
> > > > +                       if (rc) {
> > > > +                               for (i = 0; i < nr_pages; i++)
> > > > +                                       put_page(wdata->pages[i]);
> > > > +                               kfree(wdata);
> > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > +                               break;
> > > > +                       }
> > > > +
> > > > +                       /*
> > > > +                        * Bring nr_pages down to the number of
> > > > + pages we actually
> > > used,
> > > > +                        * and free any pages that we didn't use.
> > > > +                        */
> > > > +                       for ( ; nr_pages > num_pages; nr_pages--)
> > > > +                               put_page(wdata->pages[nr_pages -
> > > > + 1]);
> > > > +
> > > > +                       wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
> > > > +               }
> > > >
> > > >                 wdata->sync_mode = WB_SYNC_ALL;
> > > >                 wdata->nr_pages = nr_pages; @@ -2593,7 +2680,6 @@
> > > > cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
> > > >                 wdata->pid = pid;
> > > >                 wdata->bytes = cur_len;
> > > >                 wdata->pagesz = PAGE_SIZE;
> > > > -               wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
> > > >                 wdata->credits = credits;
> > > >                 wdata->ctx = ctx;
> > > >                 kref_get(&ctx->refcount); @@ -2668,13 +2754,17 @@
> > > > static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
> > > >                                 INIT_LIST_HEAD(&tmp_list);
> > > >                                 list_del_init(&wdata->list);
> > > >
> > > > -                               iov_iter_advance(&tmp_from,
> > > > +                               if (ctx->direct_io)
> > > > +                                       rc = cifs_resend_wdata(wdata, &tmp_list, ctx);
> > > > +                               else {
> > > > +
> > > > + iov_iter_advance(&tmp_from,
> > > >                                                  wdata->offset -
> > > > ctx->pos);
> > > >
> > > > -                               rc = cifs_write_from_iter(wdata->offset,
> > > > +                                       rc =
> > > > + cifs_write_from_iter(wdata->offset,
> > > >                                                 wdata->bytes, &tmp_from,
> > > >                                                 ctx->cfile, cifs_sb, &tmp_list,
> > > >                                                 ctx);
> > > > +                               }
> > > >
> > > >                                 list_splice(&tmp_list,
> > > > &ctx->list);
> > > >
> > > > @@ -2687,8 +2777,9 @@ static void
> > > > collect_uncached_write_data(struct
> > > cifs_aio_ctx *ctx)
> > > >                 kref_put(&wdata->refcount,
> cifs_uncached_writedata_release);
> > > >         }
> > > >
> > > > -       for (i = 0; i < ctx->npages; i++)
> > > > -               put_page(ctx->bv[i].bv_page);
> > > > +       if (!ctx->direct_io)
> > > > +               for (i = 0; i < ctx->npages; i++)
> > > > +                       put_page(ctx->bv[i].bv_page);
> > > >
> > > >         cifs_stats_bytes_written(tcon, ctx->total_len);
> > > >         set_bit(CIFS_INO_INVALID_MAPPING,
> > > > &CIFS_I(dentry->d_inode)->flags); @@ -2703,7 +2794,7 @@ static
> > > > void
> > > collect_uncached_write_data(struct cifs_aio_ctx *ctx)
> > > >                 complete(&ctx->done);  }
> > > >
> > > > -ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter
> > > > *from)
> > > > +static ssize_t __cifs_writev(struct kiocb *iocb, struct iov_iter
> > > > +*from, bool direct)
> > > >  {
> > > >         struct file *file = iocb->ki_filp;
> > > >         ssize_t total_written = 0; @@ -2712,13 +2803,18 @@ ssize_t
> > > > cifs_user_writev(struct kiocb *iocb,
> > > struct iov_iter *from)
> > > >         struct cifs_sb_info *cifs_sb;
> > > >         struct cifs_aio_ctx *ctx;
> > > >         struct iov_iter saved_from = *from;
> > > > +       size_t len = iov_iter_count(from);
> > > >         int rc;
> > > >
> > > >         /*
> > > > -        * BB - optimize the way when signing is disabled. We can drop this
> > > > -        * extra memory-to-memory copying and use iovec buffers for
> > > constructing
> > > > -        * write request.
> > > > +        * iov_iter_get_pages_alloc doesn't work with ITER_KVEC.
> > > > +        * In this case, fall back to non-direct write function.
> > > > +        * this could be improved by getting pages directly in
> > > > + ITER_KVEC
> > > >          */
> > > > +       if (direct && from->type & ITER_KVEC) {
> > > > +               cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n");
> > > > +               direct = false;
> > > > +       }
> > > >
> > > >         rc = generic_write_checks(iocb, from);
> > > >         if (rc <= 0)
> > > > @@ -2742,10 +2838,16 @@ ssize_t cifs_user_writev(struct kiocb
> > > > *iocb, struct iov_iter *from)
> > > >
> > > >         ctx->pos = iocb->ki_pos;
> > > >
> > > > -       rc = setup_aio_ctx_iter(ctx, from, WRITE);
> > > > -       if (rc) {
> > > > -               kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > > -               return rc;
> > > > +       if (direct) {
> > > > +               ctx->direct_io = true;
> > > > +               ctx->iter = *from;
> > > > +               ctx->len = len;
> > > > +       } else {
> > > > +               rc = setup_aio_ctx_iter(ctx, from, WRITE);
> > > > +               if (rc) {
> > > > +                       kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > > +                       return rc;
> > > > +               }
> > > >         }
> > > >
> > > >         /* grab a lock here due to read response handlers can
> > > > access ctx */ @@ -2795,6 +2897,16 @@ ssize_t
> > > > cifs_user_writev(struct kiocb
> > > *iocb, struct iov_iter *from)
> > > >         return total_written;
> > > >  }
> > > >
> > > > +ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter
> > > > +*from) {
> > > > +       return __cifs_writev(iocb, from, true); }
> > > > +
> > > > +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) {
> > > > +       return __cifs_writev(iocb, from, false); }
> > > > +
> > > >  static ssize_t
> > > >  cifs_writev(struct kiocb *iocb, struct iov_iter *from)  {
> > > > --
> > > > 2.7.4
> > > >
> > >
> > > Did you measure the performance benefit of using O_DIRECT with your
> > > patches for non-RDMA mode?
> >
> > Here are some of the results measured on TCP/IP over IPoIB 40G Infiniband.
> Please note that IPoIB is done in software so it's slower than a 40G Ethernet
> NIC.
> >
> > All servers are running 2 X Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz.
> Tested on FIO 64k read or write with queue depths 1 or 16. All the tests are
> running with 1 FIO job, and with "direct=1".
> >
> > Without direct I/O:
> > read 64k d1     113669KB/s
> > read 64k d16    618428KB/s
> > write 64k d1    134911KB/s
> > write 64k d16   457005KB/s
> >
> > With direct I/O:
> > read 64k d1     127134KB/s
> > read 64k d16    714629KB/s
> > write 64k d1    129268KB/s
> > write 64k d16   461054KB/s
> >
> > Direct I/O improvement:
> > read 64k d1     11%
> > read 64k d16    15%
> > write 64k d1    -5%
>                       ^^^
> This is strange. Is it consistent across multiple runs?
> 
> > write 64k d16   1%
> 
> Good read performance improvements overall and it seems write
> performance results need more investigations.

My apologies, I found out that I was doing it wrong for comparing write tests. They were actually all direct I/O, 5% difference is due to benchmark variation at the time of tests.

Here are the re-run (3 times, each time for 90 seconds) for 64k write. This time, Linux client, Windows server and 40G IB switch are rebooted prior to tests.

direct I/O

64k write d16
470m/s
616m/s
479m/s

64k write d1
176m/s
160m/s
156m/s


without direct I/O

64k write d16
534m/s
405m/s
406m/s

64k write d1
146m/s
147m/s
147m/s

I'm not sure why test results at depth 16 are jumpy, but overall there are improvements in direct I/O for 64k writes.
Tom Talpey Nov. 29, 2018, 9:45 p.m. | #5
> -----Original Message-----
> From: linux-cifs-owner@vger.kernel.org <linux-cifs-owner@vger.kernel.org> On
> Behalf Of Long Li
> Sent: Thursday, November 29, 2018 4:30 PM
> To: Pavel Shilovsky <piastryyy@gmail.com>
> Cc: Steve French <sfrench@samba.org>; linux-cifs <linux-cifs@vger.kernel.org>;
> samba-technical <samba-technical@lists.samba.org>; Kernel Mailing List
> <linux-kernel@vger.kernel.org>
> Subject: RE: [Patch v4 2/3] CIFS: Add support for direct I/O write
> 
> > Subject: Re: [Patch v4 2/3] CIFS: Add support for direct I/O write
> >
> > ср, 28 нояб. 2018 г. в 18:20, Long Li <longli@microsoft.com>:
> > >
> > > > Subject: Re: [Patch v4 2/3] CIFS: Add support for direct I/O write
> > > >
> > > > ср, 31 окт. 2018 г. в 15:26, Long Li <longli@linuxonhyperv.com>:
> > > > >
> > > > > From: Long Li <longli@microsoft.com>
> > > > >
> > > > > With direct I/O write, user supplied buffers are pinned to the
> > > > > memory and data are transferred directly from user buffers to the
> > transport layer.
> > > > >
> > > > > Change in v3: add support for kernel AIO
> > > > >
> > > > > Change in v4:
> > > > > Refactor common write code to __cifs_writev for direct and non-direct
> > I/O.
> > > > > Retry on direct I/O failure.
> > > > >
> > > > > Signed-off-by: Long Li <longli@microsoft.com>
> > > > > ---
> > > > >  fs/cifs/cifsfs.h |   1 +
> > > > >  fs/cifs/file.c   | 194
> > +++++++++++++++++++++++++++++++++++++++++++----
> > > > --------
> > > > >  2 files changed, 154 insertions(+), 41 deletions(-)
> > > > >
> > > > > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > > > > 7fba9aa..e9c5103 100644
> > > > > --- a/fs/cifs/cifsfs.h
> > > > > +++ b/fs/cifs/cifsfs.h
> > > > > @@ -105,6 +105,7 @@ extern ssize_t cifs_user_readv(struct kiocb
> > > > > *iocb, struct iov_iter *to);  extern ssize_t
> > > > > cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
> > > > > extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct
> > > > > iov_iter *to);  extern ssize_t cifs_user_writev(struct kiocb
> > > > > *iocb, struct iov_iter *from);
> > > > > +extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct
> > > > > +iov_iter *from);
> > > > >  extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct
> > > > > iov_iter *from);  extern int cifs_lock(struct file *, int, struct
> > > > > file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t,
> > > > > int); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index
> > > > > daab878..1a41c04 100644
> > > > > --- a/fs/cifs/file.c
> > > > > +++ b/fs/cifs/file.c
> > > > > @@ -2524,6 +2524,55 @@ wdata_fill_from_iovec(struct cifs_writedata
> > > > > *wdata, struct iov_iter *from,  }
> > > > >
> > > > >  static int
> > > > > +cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head
> > > > > +*wdata_list, struct cifs_aio_ctx *ctx) {
> > > > > +       int wait_retry = 0;
> > > > > +       unsigned int wsize, credits;
> > > > > +       int rc;
> > > > > +       struct TCP_Server_Info *server =
> > > > > +tlink_tcon(wdata->cfile->tlink)->ses->server;
> > > > > +
> > > > > +       /*
> > > > > +        * Try to resend this wdata, waiting for credits up to 3 seconds.
> > > > > +        * Note: we are attempting to resend the whole wdata not
> > > > > + in
> > > > segments
> > > > > +        */
> > > > > +       do {
> > > > > +               rc = server->ops->wait_mtu_credits(server,
> > > > > + wdata->bytes, &wsize, &credits);
> > > > > +
> > > > > +               if (rc)
> > > > > +                       break;
> > > > > +
> > > > > +               if (wsize < wdata->bytes) {
> > > > > +                       add_credits_and_wake_if(server, credits, 0);
> > > > > +                       msleep(1000);
> > > > > +                       wait_retry++;
> > > > > +               }
> > > > > +       } while (wsize < wdata->bytes && wait_retry < 3);
> > > > > +
> > > > > +       if (wsize < wdata->bytes) {
> > > > > +               rc = -EBUSY;
> > > > > +               goto out;
> > > > > +       }
> > > > > +
> > > > > +       rc = -EAGAIN;
> > > > > +       while (rc == -EAGAIN)
> > > > > +               if (!wdata->cfile->invalidHandle ||
> > > > > +                   !(rc = cifs_reopen_file(wdata->cfile, false)))
> > > > > +                       rc = server->ops->async_writev(wdata,
> > > > > +
> > > > > + cifs_uncached_writedata_release);
> > > > > +
> > > > > +       if (!rc) {
> > > > > +               list_add_tail(&wdata->list, wdata_list);
> > > > > +               return 0;
> > > > > +       }
> > > > > +
> > > > > +       add_credits_and_wake_if(server, wdata->credits, 0);
> > > > > +out:
> > > > > +       kref_put(&wdata->refcount,
> > > > > +cifs_uncached_writedata_release);
> > > > > +
> > > > > +       return rc;
> > > > > +}
> > > > > +
> > > > > +static int
> > > > >  cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
> > > > >                      struct cifsFileInfo *open_file,
> > > > >                      struct cifs_sb_info *cifs_sb, struct
> > > > > list_head *wdata_list, @@ -2537,6 +2586,8 @@
> > > > > cifs_write_from_iter(loff_t offset,
> > > > size_t len, struct iov_iter *from,
> > > > >         loff_t saved_offset = offset;
> > > > >         pid_t pid;
> > > > >         struct TCP_Server_Info *server;
> > > > > +       struct page **pagevec;
> > > > > +       size_t start;
> > > > >
> > > > >         if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
> > > > >                 pid = open_file->pid; @@ -2553,38 +2604,74 @@
> > > > > cifs_write_from_iter(loff_t offset, size_t len,
> > > > struct iov_iter *from,
> > > > >                 if (rc)
> > > > >                         break;
> > > > >
> > > > > -               nr_pages = get_numpages(wsize, len, &cur_len);
> > > > > -               wdata = cifs_writedata_alloc(nr_pages,
> > > > > +               if (ctx->direct_io) {
> > > > > +                       cur_len = iov_iter_get_pages_alloc(
> > > > > +                               from, &pagevec, wsize, &start);
> > > > > +                       if (cur_len < 0) {
> > > > > +                               cifs_dbg(VFS,
> > > > > +                                       "direct_writev couldn't get user pages "
> > > > > +                                       "(rc=%zd) iter type %d iov_offset %zd count"
> > > > > +                                       " %zd\n",
> > > > > +                                       cur_len, from->type,
> > > > > +                                       from->iov_offset, from->count);
> > > > > +                               dump_stack();
> > > > > +                               break;
> > > > > +                       }
> > > > > +                       iov_iter_advance(from, cur_len);
> > > > > +
> > > > > +                       nr_pages = (cur_len + start + PAGE_SIZE -
> > > > > + 1) / PAGE_SIZE;
> > > > > +
> > > > > +                       wdata =
> > > > > + cifs_writedata_direct_alloc(pagevec,
> > > > >                                              cifs_uncached_writev_complete);
> > > > > -               if (!wdata) {
> > > > > -                       rc = -ENOMEM;
> > > > > -                       add_credits_and_wake_if(server, credits, 0);
> > > > > -                       break;
> > > > > -               }
> > > > > +                       if (!wdata) {
> > > > > +                               rc = -ENOMEM;
> > > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > > +                               break;
> > > > > +                       }
> > > > >
> > > > > -               rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
> > > > > -               if (rc) {
> > > > > -                       kfree(wdata);
> > > > > -                       add_credits_and_wake_if(server, credits, 0);
> > > > > -                       break;
> > > > > -               }
> > > > >
> > > > > -               num_pages = nr_pages;
> > > > > -               rc = wdata_fill_from_iovec(wdata, from, &cur_len,
> > &num_pages);
> > > > > -               if (rc) {
> > > > > -                       for (i = 0; i < nr_pages; i++)
> > > > > -                               put_page(wdata->pages[i]);
> > > > > -                       kfree(wdata);
> > > > > -                       add_credits_and_wake_if(server, credits, 0);
> > > > > -                       break;
> > > > > -               }
> > > > > +                       wdata->page_offset = start;
> > > > > +                       wdata->tailsz =
> > > > > +                               nr_pages > 1 ?
> > > > > +                                       cur_len - (PAGE_SIZE - start) -
> > > > > +                                       (nr_pages - 2) * PAGE_SIZE :
> > > > > +                                       cur_len;
> > > > > +               } else {
> > > > > +                       nr_pages = get_numpages(wsize, len, &cur_len);
> > > > > +                       wdata = cifs_writedata_alloc(nr_pages,
> > > > > +                                            cifs_uncached_writev_complete);
> > > > > +                       if (!wdata) {
> > > > > +                               rc = -ENOMEM;
> > > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > > +                               break;
> > > > > +                       }
> > > > >
> > > > > -               /*
> > > > > -                * Bring nr_pages down to the number of pages we actually
> > used,
> > > > > -                * and free any pages that we didn't use.
> > > > > -                */
> > > > > -               for ( ; nr_pages > num_pages; nr_pages--)
> > > > > -                       put_page(wdata->pages[nr_pages - 1]);
> > > > > +                       rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
> > > > > +                       if (rc) {
> > > > > +                               kfree(wdata);
> > > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > > +                               break;
> > > > > +                       }
> > > > > +
> > > > > +                       num_pages = nr_pages;
> > > > > +                       rc = wdata_fill_from_iovec(wdata, from,
> > > > > + &cur_len,
> > > > &num_pages);
> > > > > +                       if (rc) {
> > > > > +                               for (i = 0; i < nr_pages; i++)
> > > > > +                                       put_page(wdata->pages[i]);
> > > > > +                               kfree(wdata);
> > > > > +                               add_credits_and_wake_if(server, credits, 0);
> > > > > +                               break;
> > > > > +                       }
> > > > > +
> > > > > +                       /*
> > > > > +                        * Bring nr_pages down to the number of
> > > > > + pages we actually
> > > > used,
> > > > > +                        * and free any pages that we didn't use.
> > > > > +                        */
> > > > > +                       for ( ; nr_pages > num_pages; nr_pages--)
> > > > > +                               put_page(wdata->pages[nr_pages -
> > > > > + 1]);
> > > > > +
> > > > > +                       wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
> > > > > +               }
> > > > >
> > > > >                 wdata->sync_mode = WB_SYNC_ALL;
> > > > >                 wdata->nr_pages = nr_pages; @@ -2593,7 +2680,6 @@
> > > > > cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
> > > > >                 wdata->pid = pid;
> > > > >                 wdata->bytes = cur_len;
> > > > >                 wdata->pagesz = PAGE_SIZE;
> > > > > -               wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
> > > > >                 wdata->credits = credits;
> > > > >                 wdata->ctx = ctx;
> > > > >                 kref_get(&ctx->refcount); @@ -2668,13 +2754,17 @@
> > > > > static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
> > > > >                                 INIT_LIST_HEAD(&tmp_list);
> > > > >                                 list_del_init(&wdata->list);
> > > > >
> > > > > -                               iov_iter_advance(&tmp_from,
> > > > > +                               if (ctx->direct_io)
> > > > > +                                       rc = cifs_resend_wdata(wdata, &tmp_list, ctx);
> > > > > +                               else {
> > > > > +
> > > > > + iov_iter_advance(&tmp_from,
> > > > >                                                  wdata->offset -
> > > > > ctx->pos);
> > > > >
> > > > > -                               rc = cifs_write_from_iter(wdata->offset,
> > > > > +                                       rc =
> > > > > + cifs_write_from_iter(wdata->offset,
> > > > >                                                 wdata->bytes, &tmp_from,
> > > > >                                                 ctx->cfile, cifs_sb, &tmp_list,
> > > > >                                                 ctx);
> > > > > +                               }
> > > > >
> > > > >                                 list_splice(&tmp_list,
> > > > > &ctx->list);
> > > > >
> > > > > @@ -2687,8 +2777,9 @@ static void
> > > > > collect_uncached_write_data(struct
> > > > cifs_aio_ctx *ctx)
> > > > >                 kref_put(&wdata->refcount,
> > cifs_uncached_writedata_release);
> > > > >         }
> > > > >
> > > > > -       for (i = 0; i < ctx->npages; i++)
> > > > > -               put_page(ctx->bv[i].bv_page);
> > > > > +       if (!ctx->direct_io)
> > > > > +               for (i = 0; i < ctx->npages; i++)
> > > > > +                       put_page(ctx->bv[i].bv_page);
> > > > >
> > > > >         cifs_stats_bytes_written(tcon, ctx->total_len);
> > > > >         set_bit(CIFS_INO_INVALID_MAPPING,
> > > > > &CIFS_I(dentry->d_inode)->flags); @@ -2703,7 +2794,7 @@ static
> > > > > void
> > > > collect_uncached_write_data(struct cifs_aio_ctx *ctx)
> > > > >                 complete(&ctx->done);  }
> > > > >
> > > > > -ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter
> > > > > *from)
> > > > > +static ssize_t __cifs_writev(struct kiocb *iocb, struct iov_iter
> > > > > +*from, bool direct)
> > > > >  {
> > > > >         struct file *file = iocb->ki_filp;
> > > > >         ssize_t total_written = 0; @@ -2712,13 +2803,18 @@ ssize_t
> > > > > cifs_user_writev(struct kiocb *iocb,
> > > > struct iov_iter *from)
> > > > >         struct cifs_sb_info *cifs_sb;
> > > > >         struct cifs_aio_ctx *ctx;
> > > > >         struct iov_iter saved_from = *from;
> > > > > +       size_t len = iov_iter_count(from);
> > > > >         int rc;
> > > > >
> > > > >         /*
> > > > > -        * BB - optimize the way when signing is disabled. We can drop this
> > > > > -        * extra memory-to-memory copying and use iovec buffers for
> > > > constructing
> > > > > -        * write request.
> > > > > +        * iov_iter_get_pages_alloc doesn't work with ITER_KVEC.
> > > > > +        * In this case, fall back to non-direct write function.
> > > > > +        * this could be improved by getting pages directly in
> > > > > + ITER_KVEC
> > > > >          */
> > > > > +       if (direct && from->type & ITER_KVEC) {
> > > > > +               cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n");
> > > > > +               direct = false;
> > > > > +       }
> > > > >
> > > > >         rc = generic_write_checks(iocb, from);
> > > > >         if (rc <= 0)
> > > > > @@ -2742,10 +2838,16 @@ ssize_t cifs_user_writev(struct kiocb
> > > > > *iocb, struct iov_iter *from)
> > > > >
> > > > >         ctx->pos = iocb->ki_pos;
> > > > >
> > > > > -       rc = setup_aio_ctx_iter(ctx, from, WRITE);
> > > > > -       if (rc) {
> > > > > -               kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > > > -               return rc;
> > > > > +       if (direct) {
> > > > > +               ctx->direct_io = true;
> > > > > +               ctx->iter = *from;
> > > > > +               ctx->len = len;
> > > > > +       } else {
> > > > > +               rc = setup_aio_ctx_iter(ctx, from, WRITE);
> > > > > +               if (rc) {
> > > > > +                       kref_put(&ctx->refcount, cifs_aio_ctx_release);
> > > > > +                       return rc;
> > > > > +               }
> > > > >         }
> > > > >
> > > > >         /* grab a lock here due to read response handlers can
> > > > > access ctx */ @@ -2795,6 +2897,16 @@ ssize_t
> > > > > cifs_user_writev(struct kiocb
> > > > *iocb, struct iov_iter *from)
> > > > >         return total_written;
> > > > >  }
> > > > >
> > > > > +ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter
> > > > > +*from) {
> > > > > +       return __cifs_writev(iocb, from, true); }
> > > > > +
> > > > > +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) {
> > > > > +       return __cifs_writev(iocb, from, false); }
> > > > > +
> > > > >  static ssize_t
> > > > >  cifs_writev(struct kiocb *iocb, struct iov_iter *from)  {
> > > > > --
> > > > > 2.7.4
> > > > >
> > > >
> > > > Did you measure the performance benefit of using O_DIRECT with your
> > > > patches for non-RDMA mode?
> > >
> > > Here are some of the results measured on TCP/IP over IPoIB 40G Infiniband.
> > Please note that IPoIB is done in software so it's slower than a 40G Ethernet
> > NIC.
> > >
> > > All servers are running 2 X Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz.
> > Tested on FIO 64k read or write with queue depths 1 or 16. All the tests are
> > running with 1 FIO job, and with "direct=1".
> > >
> > > Without direct I/O:
> > > read 64k d1     113669KB/s
> > > read 64k d16    618428KB/s
> > > write 64k d1    134911KB/s
> > > write 64k d16   457005KB/s
> > >
> > > With direct I/O:
> > > read 64k d1     127134KB/s
> > > read 64k d16    714629KB/s
> > > write 64k d1    129268KB/s
> > > write 64k d16   461054KB/s
> > >
> > > Direct I/O improvement:
> > > read 64k d1     11%
> > > read 64k d16    15%
> > > write 64k d1    -5%
> >                       ^^^
> > This is strange. Is it consistent across multiple runs?
> >
> > > write 64k d16   1%
> >
> > Good read performance improvements overall and it seems write
> > performance results need more investigations.
> 
> My apologies, I found out that I was doing it wrong for comparing write tests.
> They were actually all direct I/O, 5% difference is due to benchmark variation at
> the time of tests.
> 
> Here are the re-run (3 times, each time for 90 seconds) for 64k write. This time,
> Linux client, Windows server and 40G IB switch are rebooted prior to tests.
> 
> direct I/O
> 
> 64k write d16
> 470m/s
> 616m/s
> 479m/s
> 
> 64k write d1
> 176m/s
> 160m/s
> 156m/s
> 
> 
> without direct I/O
> 
> 64k write d16
> 534m/s
> 405m/s
> 406m/s
> 
> 64k write d1
> 146m/s
> 147m/s
> 147m/s
> 
> I'm not sure why test results at depth 16 are jumpy, but overall there are
> improvements in direct I/O for 64k writes.

Better! Regarding the "jumpy" results, it's likely from queuing effects creeping in.
You might want to try doing a sweep of write queue depth, say 1, 4, 8, 16, and more,
then comparing the results across them all, and ignoring the jumpy ones - assuming
the jumpiness appears at about the same point. The idea being to compare the
improvement at the maximum stable point. Getting onto a decent 10GbE NIC would
also help. Still, it's clear there's a win here.

I'll chat with you offline about testing with John Hubbard's get_user_pages() RFC.
Would be very interesting to compare using that, too.

Tom.

Patch

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7fba9aa..e9c5103 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -105,6 +105,7 @@  extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
 extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, loff_t, loff_t, int);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index daab878..1a41c04 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2524,6 +2524,55 @@  wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
 }
 
 static int
+cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, struct cifs_aio_ctx *ctx)
+{
+	int wait_retry = 0;
+	unsigned int wsize, credits;
+	int rc;
+	struct TCP_Server_Info *server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+
+	/*
+	 * Try to resend this wdata, waiting for credits up to 3 seconds.
+	 * Note: we are attempting to resend the whole wdata not in segments
+	 */
+	do {
+		rc = server->ops->wait_mtu_credits(server, wdata->bytes, &wsize, &credits);
+
+		if (rc)
+			break;
+
+		if (wsize < wdata->bytes) {
+			add_credits_and_wake_if(server, credits, 0);
+			msleep(1000);
+			wait_retry++;
+		}
+	} while (wsize < wdata->bytes && wait_retry < 3);
+
+	if (wsize < wdata->bytes) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	rc = -EAGAIN;
+	while (rc == -EAGAIN)
+		if (!wdata->cfile->invalidHandle ||
+		    !(rc = cifs_reopen_file(wdata->cfile, false)))
+			rc = server->ops->async_writev(wdata,
+					cifs_uncached_writedata_release);
+
+	if (!rc) {
+		list_add_tail(&wdata->list, wdata_list);
+		return 0;
+	}
+
+	add_credits_and_wake_if(server, wdata->credits, 0);
+out:
+	kref_put(&wdata->refcount, cifs_uncached_writedata_release);
+
+	return rc;
+}
+
+static int
 cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 		     struct cifsFileInfo *open_file,
 		     struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
@@ -2537,6 +2586,8 @@  cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 	loff_t saved_offset = offset;
 	pid_t pid;
 	struct TCP_Server_Info *server;
+	struct page **pagevec;
+	size_t start;
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
 		pid = open_file->pid;
@@ -2553,38 +2604,74 @@  cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 		if (rc)
 			break;
 
-		nr_pages = get_numpages(wsize, len, &cur_len);
-		wdata = cifs_writedata_alloc(nr_pages,
+		if (ctx->direct_io) {
+			cur_len = iov_iter_get_pages_alloc(
+				from, &pagevec, wsize, &start);
+			if (cur_len < 0) {
+				cifs_dbg(VFS,
+					"direct_writev couldn't get user pages "
+					"(rc=%zd) iter type %d iov_offset %zd count"
+					" %zd\n",
+					cur_len, from->type,
+					from->iov_offset, from->count);
+				dump_stack();
+				break;
+			}
+			iov_iter_advance(from, cur_len);
+
+			nr_pages = (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+			wdata = cifs_writedata_direct_alloc(pagevec,
 					     cifs_uncached_writev_complete);
-		if (!wdata) {
-			rc = -ENOMEM;
-			add_credits_and_wake_if(server, credits, 0);
-			break;
-		}
+			if (!wdata) {
+				rc = -ENOMEM;
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
 
-		rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
-		if (rc) {
-			kfree(wdata);
-			add_credits_and_wake_if(server, credits, 0);
-			break;
-		}
 
-		num_pages = nr_pages;
-		rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
-		if (rc) {
-			for (i = 0; i < nr_pages; i++)
-				put_page(wdata->pages[i]);
-			kfree(wdata);
-			add_credits_and_wake_if(server, credits, 0);
-			break;
-		}
+			wdata->page_offset = start;
+			wdata->tailsz =
+				nr_pages > 1 ?
+					cur_len - (PAGE_SIZE - start) -
+					(nr_pages - 2) * PAGE_SIZE :
+					cur_len;
+		} else {
+			nr_pages = get_numpages(wsize, len, &cur_len);
+			wdata = cifs_writedata_alloc(nr_pages,
+					     cifs_uncached_writev_complete);
+			if (!wdata) {
+				rc = -ENOMEM;
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
 
-		/*
-		 * Bring nr_pages down to the number of pages we actually used,
-		 * and free any pages that we didn't use.
-		 */
-		for ( ; nr_pages > num_pages; nr_pages--)
-			put_page(wdata->pages[nr_pages - 1]);
+			rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
+			if (rc) {
+				kfree(wdata);
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
+
+			num_pages = nr_pages;
+			rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
+			if (rc) {
+				for (i = 0; i < nr_pages; i++)
+					put_page(wdata->pages[i]);
+				kfree(wdata);
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
+
+			/*
+			 * Bring nr_pages down to the number of pages we actually used,
+			 * and free any pages that we didn't use.
+			 */
+			for ( ; nr_pages > num_pages; nr_pages--)
+				put_page(wdata->pages[nr_pages - 1]);
+
+			wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
+		}
 
 		wdata->sync_mode = WB_SYNC_ALL;
 		wdata->nr_pages = nr_pages;
@@ -2593,7 +2680,6 @@  cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 		wdata->pid = pid;
 		wdata->bytes = cur_len;
 		wdata->pagesz = PAGE_SIZE;
-		wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
 		wdata->credits = credits;
 		wdata->ctx = ctx;
 		kref_get(&ctx->refcount);
@@ -2668,13 +2754,17 @@  static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
 				INIT_LIST_HEAD(&tmp_list);
 				list_del_init(&wdata->list);
 
-				iov_iter_advance(&tmp_from,
+				if (ctx->direct_io)
+					rc = cifs_resend_wdata(wdata, &tmp_list, ctx);
+				else {
+					iov_iter_advance(&tmp_from,
 						 wdata->offset - ctx->pos);
 
-				rc = cifs_write_from_iter(wdata->offset,
+					rc = cifs_write_from_iter(wdata->offset,
 						wdata->bytes, &tmp_from,
 						ctx->cfile, cifs_sb, &tmp_list,
 						ctx);
+				}
 
 				list_splice(&tmp_list, &ctx->list);
 
@@ -2687,8 +2777,9 @@  static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
 		kref_put(&wdata->refcount, cifs_uncached_writedata_release);
 	}
 
-	for (i = 0; i < ctx->npages; i++)
-		put_page(ctx->bv[i].bv_page);
+	if (!ctx->direct_io)
+		for (i = 0; i < ctx->npages; i++)
+			put_page(ctx->bv[i].bv_page);
 
 	cifs_stats_bytes_written(tcon, ctx->total_len);
 	set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
@@ -2703,7 +2794,7 @@  static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
 		complete(&ctx->done);
 }
 
-ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t __cifs_writev(struct kiocb *iocb, struct iov_iter *from, bool direct)
 {
 	struct file *file = iocb->ki_filp;
 	ssize_t total_written = 0;
@@ -2712,13 +2803,18 @@  ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
 	struct cifs_sb_info *cifs_sb;
 	struct cifs_aio_ctx *ctx;
 	struct iov_iter saved_from = *from;
+	size_t len = iov_iter_count(from);
 	int rc;
 
 	/*
-	 * BB - optimize the way when signing is disabled. We can drop this
-	 * extra memory-to-memory copying and use iovec buffers for constructing
-	 * write request.
+	 * iov_iter_get_pages_alloc doesn't work with ITER_KVEC.
+	 * In this case, fall back to non-direct write function.
+	 * this could be improved by getting pages directly in ITER_KVEC
 	 */
+	if (direct && from->type & ITER_KVEC) {
+		cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n");
+		direct = false;
+	}
 
 	rc = generic_write_checks(iocb, from);
 	if (rc <= 0)
@@ -2742,10 +2838,16 @@  ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
 
 	ctx->pos = iocb->ki_pos;
 
-	rc = setup_aio_ctx_iter(ctx, from, WRITE);
-	if (rc) {
-		kref_put(&ctx->refcount, cifs_aio_ctx_release);
-		return rc;
+	if (direct) {
+		ctx->direct_io = true;
+		ctx->iter = *from;
+		ctx->len = len;
+	} else {
+		rc = setup_aio_ctx_iter(ctx, from, WRITE);
+		if (rc) {
+			kref_put(&ctx->refcount, cifs_aio_ctx_release);
+			return rc;
+		}
 	}
 
 	/* grab a lock here due to read response handlers can access ctx */
@@ -2795,6 +2897,16 @@  ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
 	return total_written;
 }
 
+ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+	return __cifs_writev(iocb, from, true);
+}
+
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+	return __cifs_writev(iocb, from, false);
+}
+
 static ssize_t
 cifs_writev(struct kiocb *iocb, struct iov_iter *from)
 {