diff mbox series

[026/104] virtiofsd: Fast path for virtio read

Message ID 20191212163904.159893-27-dgilbert@redhat.com
State New
Headers show
Series virtiofs daemon [all] | expand

Commit Message

Dr. David Alan Gilbert Dec. 12, 2019, 4:37 p.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

Readv the data straight into the guest's buffer.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
With fix by:
Signed-off-by: Eryu Guan <eguan@linux.alibaba.com>
---
 tools/virtiofsd/fuse_lowlevel.c |   5 +
 tools/virtiofsd/fuse_virtio.c   | 159 ++++++++++++++++++++++++++++++++
 tools/virtiofsd/fuse_virtio.h   |   4 +
 3 files changed, 168 insertions(+)

Comments

Masayoshi Mizuma Jan. 17, 2020, 6:54 p.m. UTC | #1
On Thu, Dec 12, 2019 at 04:37:46PM +0000, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> Readv the data straight into the guests buffer.
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> With fix by:
> Signed-off-by: Eryu Guan <eguan@linux.alibaba.com>
> ---
>  tools/virtiofsd/fuse_lowlevel.c |   5 +
>  tools/virtiofsd/fuse_virtio.c   | 159 ++++++++++++++++++++++++++++++++
>  tools/virtiofsd/fuse_virtio.h   |   4 +
>  3 files changed, 168 insertions(+)
> 
> diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
> index c2b114cf5b..5f80625652 100644
> --- a/tools/virtiofsd/fuse_lowlevel.c
> +++ b/tools/virtiofsd/fuse_lowlevel.c
> @@ -475,6 +475,11 @@ static int fuse_send_data_iov_fallback(struct fuse_session *se,
>          return fuse_send_msg(se, ch, iov, iov_count);
>      }
>  
> +    if (fuse_lowlevel_is_virtio(se) && buf->count == 1 &&
> +        buf->buf[0].flags == (FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK)) {
> +        return virtio_send_data_iov(se, ch, iov, iov_count, buf, len);
> +    }
> +
>      abort(); /* Will have taken vhost path */
>      return 0;
>  }
> diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
> index c33e0f7e8c..146cd3f702 100644
> --- a/tools/virtiofsd/fuse_virtio.c
> +++ b/tools/virtiofsd/fuse_virtio.c
> @@ -230,6 +230,165 @@ err:
>      return ret;
>  }
>  
> +/*
> + * Callback from fuse_send_data_iov_* when it's virtio and the buffer
> + * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
> + * We need send the iov and then the buffer.
> + * Return 0 on success
> + */
> +int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
> +                         struct iovec *iov, int count, struct fuse_bufvec *buf,
> +                         size_t len)
> +{
> +    int ret = 0;
> +    VuVirtqElement *elem;
> +    VuVirtq *q;
> +
> +    assert(count >= 1);
> +    assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
> +
> +    struct fuse_out_header *out = iov[0].iov_base;
> +    /* TODO: Endianness! */
> +
> +    size_t iov_len = iov_size(iov, count);
> +    size_t tosend_len = iov_len + len;
> +
> +    out->len = tosend_len;
> +
> +    fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
> +             count, len, iov_len);
> +
> +    /* unique == 0 is notification which we don't support */
> +    assert(out->unique);
> +
> +    /* For virtio we always have ch */
> +    assert(ch);
> +    assert(!ch->qi->reply_sent);
> +    elem = ch->qi->qe;
> +    q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx];
> +
> +    /* The 'in' part of the elem is to qemu */
> +    unsigned int in_num = elem->in_num;
> +    struct iovec *in_sg = elem->in_sg;
> +    size_t in_len = iov_size(in_sg, in_num);
> +    fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
> +             __func__, elem->index, in_num, in_len);
> +
> +    /*
> +     * The elem should have room for a 'fuse_out_header' (out from fuse)
> +     * plus the data based on the len in the header.
> +     */
> +    if (in_len < sizeof(struct fuse_out_header)) {
> +        fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
> +                 __func__, elem->index);

> +        ret = -E2BIG;

ret should be a positive value here, right?

           ret = E2BIG;

> +        goto err;
> +    }
> +    if (in_len < tosend_len) {
> +        fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
> +                 __func__, elem->index, tosend_len);

> +        ret = -E2BIG;

           ret = E2BIG;

> +        goto err;
> +    }
> +
> +    /* TODO: Limit to 'len' */
> +
> +    /* First copy the header data from iov->in_sg */
> +    copy_iov(iov, count, in_sg, in_num, iov_len);
> +
> +    /*
> +     * Build a copy of the the in_sg iov so we can skip bits in it,
> +     * including changing the offsets
> +     */

> +    struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num);

       Should there be an assert(in_sg_cpy) here, in case calloc() fails?

> +    memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
> +    /* These get updated as we skip */
> +    struct iovec *in_sg_ptr = in_sg_cpy;
> +    int in_sg_cpy_count = in_num;
> +
> +    /* skip over parts of in_sg that contained the header iov */
> +    size_t skip_size = iov_len;
> +
> +    size_t in_sg_left = 0;
> +    do {
> +        while (skip_size != 0 && in_sg_cpy_count) {
> +            if (skip_size >= in_sg_ptr[0].iov_len) {
> +                skip_size -= in_sg_ptr[0].iov_len;
> +                in_sg_ptr++;
> +                in_sg_cpy_count--;
> +            } else {
> +                in_sg_ptr[0].iov_len -= skip_size;
> +                in_sg_ptr[0].iov_base += skip_size;
> +                break;
> +            }
> +        }
> +
> +        int i;
> +        for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) {
> +            in_sg_left += in_sg_ptr[i].iov_len;
> +        }
> +        fuse_log(FUSE_LOG_DEBUG,
> +                 "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
> +                 "in_sg_left=%zd\n",
> +                 __func__, skip_size, in_sg_cpy_count, in_sg_left);
> +        ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
> +                     buf->buf[0].pos);
> +

> +        fuse_log(FUSE_LOG_DEBUG, "%s: preadv_res=%d(%m) len=%zd\n",
> +                 __func__, ret, len);

"%m" should be removed? because it may show the previous errno even if preadv()
succeeds. For example:

[ID: 00000079] virtio_send_data_iov: after skip skip_size=0 in_sg_cpy_count=1 in_sg_left=65536
[ID: 00000079] virtio_send_data_iov: preadv_res=16000(No such file or directory) len=65536

Otherwise, looks good to me:

Reviewed-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>

Thanks,
Masa

> +        if (ret == -1) {
> +            ret = errno;
> +            free(in_sg_cpy);
> +            goto err;
> +        }
> +        if (ret < len && ret) {
> +            fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
> +            /* Skip over this much next time around */
> +            skip_size = ret;
> +            buf->buf[0].pos += ret;
> +            len -= ret;
> +
> +            /* Lets do another read */
> +            continue;
> +        }
> +        if (!ret) {
> +            /* EOF case? */
> +            fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__,
> +                     in_sg_left);
> +            break;
> +        }
> +        if (ret != len) {
> +            fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__);
> +            ret = EIO;
> +            free(in_sg_cpy);
> +            goto err;
> +        }
> +        in_sg_left -= ret;
> +        len -= ret;
> +    } while (in_sg_left);
> +    free(in_sg_cpy);
> +
> +    /* Need to fix out->len on EOF */
> +    if (len) {
> +        struct fuse_out_header *out_sg = in_sg[0].iov_base;
> +
> +        tosend_len -= len;
> +        out_sg->len = tosend_len;
> +    }
> +
> +    ret = 0;
> +
> +    vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len);
> +    vu_queue_notify(&se->virtio_dev->dev, q);
> +
> +err:
> +    if (ret == 0) {
> +        ch->qi->reply_sent = true;
> +    }
> +
> +    return ret;
> +}
> +
>  /* Thread function for individual queues, created when a queue is 'started' */
>  static void *fv_queue_thread(void *opaque)
>  {
> diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h
> index 135a14875a..cc676b9193 100644
> --- a/tools/virtiofsd/fuse_virtio.h
> +++ b/tools/virtiofsd/fuse_virtio.h
> @@ -26,4 +26,8 @@ int virtio_loop(struct fuse_session *se);
>  int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
>                      struct iovec *iov, int count);
>  
> +int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
> +                         struct iovec *iov, int count,
> +                         struct fuse_bufvec *buf, size_t len);
> +
>  #endif
> -- 
> 2.23.0
> 
>
Dr. David Alan Gilbert Jan. 20, 2020, 12:32 p.m. UTC | #2
* Masayoshi Mizuma (msys.mizuma@gmail.com) wrote:
> On Thu, Dec 12, 2019 at 04:37:46PM +0000, Dr. David Alan Gilbert (git) wrote:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > Readv the data straight into the guests buffer.
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > With fix by:
> > Signed-off-by: Eryu Guan <eguan@linux.alibaba.com>
> > ---
> >  tools/virtiofsd/fuse_lowlevel.c |   5 +
> >  tools/virtiofsd/fuse_virtio.c   | 159 ++++++++++++++++++++++++++++++++
> >  tools/virtiofsd/fuse_virtio.h   |   4 +
> >  3 files changed, 168 insertions(+)
> > 
> > diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
> > index c2b114cf5b..5f80625652 100644
> > --- a/tools/virtiofsd/fuse_lowlevel.c
> > +++ b/tools/virtiofsd/fuse_lowlevel.c
> > @@ -475,6 +475,11 @@ static int fuse_send_data_iov_fallback(struct fuse_session *se,
> >          return fuse_send_msg(se, ch, iov, iov_count);
> >      }
> >  
> > +    if (fuse_lowlevel_is_virtio(se) && buf->count == 1 &&
> > +        buf->buf[0].flags == (FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK)) {
> > +        return virtio_send_data_iov(se, ch, iov, iov_count, buf, len);
> > +    }
> > +
> >      abort(); /* Will have taken vhost path */
> >      return 0;
> >  }
> > diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
> > index c33e0f7e8c..146cd3f702 100644
> > --- a/tools/virtiofsd/fuse_virtio.c
> > +++ b/tools/virtiofsd/fuse_virtio.c
> > @@ -230,6 +230,165 @@ err:
> >      return ret;
> >  }
> >  
> > +/*
> > + * Callback from fuse_send_data_iov_* when it's virtio and the buffer
> > + * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
> > + * We need send the iov and then the buffer.
> > + * Return 0 on success
> > + */
> > +int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
> > +                         struct iovec *iov, int count, struct fuse_bufvec *buf,
> > +                         size_t len)
> > +{
> > +    int ret = 0;
> > +    VuVirtqElement *elem;
> > +    VuVirtq *q;
> > +
> > +    assert(count >= 1);
> > +    assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
> > +
> > +    struct fuse_out_header *out = iov[0].iov_base;
> > +    /* TODO: Endianness! */
> > +
> > +    size_t iov_len = iov_size(iov, count);
> > +    size_t tosend_len = iov_len + len;
> > +
> > +    out->len = tosend_len;
> > +
> > +    fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
> > +             count, len, iov_len);
> > +
> > +    /* unique == 0 is notification which we don't support */
> > +    assert(out->unique);
> > +
> > +    /* For virtio we always have ch */
> > +    assert(ch);
> > +    assert(!ch->qi->reply_sent);
> > +    elem = ch->qi->qe;
> > +    q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx];
> > +
> > +    /* The 'in' part of the elem is to qemu */
> > +    unsigned int in_num = elem->in_num;
> > +    struct iovec *in_sg = elem->in_sg;
> > +    size_t in_len = iov_size(in_sg, in_num);
> > +    fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
> > +             __func__, elem->index, in_num, in_len);
> > +
> > +    /*
> > +     * The elem should have room for a 'fuse_out_header' (out from fuse)
> > +     * plus the data based on the len in the header.
> > +     */
> > +    if (in_len < sizeof(struct fuse_out_header)) {
> > +        fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
> > +                 __func__, elem->index);
> 
> > +        ret = -E2BIG;
> 
> The ret should be positive value, right?
> 
>            ret = E2BIG;

Yes, I think so.

> > +        goto err;
> > +    }
> > +    if (in_len < tosend_len) {
> > +        fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
> > +                 __func__, elem->index, tosend_len);
> 
> > +        ret = -E2BIG;
> 
>            ret = E2BIG;
> 
> > +        goto err;
> > +    }
> > +
> > +    /* TODO: Limit to 'len' */
> > +
> > +    /* First copy the header data from iov->in_sg */
> > +    copy_iov(iov, count, in_sg, in_num, iov_len);
> > +
> > +    /*
> > +     * Build a copy of the the in_sg iov so we can skip bits in it,
> > +     * including changing the offsets
> > +     */
> 
> > +    struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num);
> 
>        assert(in_sg_cpy) should be here? in case calloc() fails...

Thanks, added.

> > +    memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
> > +    /* These get updated as we skip */
> > +    struct iovec *in_sg_ptr = in_sg_cpy;
> > +    int in_sg_cpy_count = in_num;
> > +
> > +    /* skip over parts of in_sg that contained the header iov */
> > +    size_t skip_size = iov_len;
> > +
> > +    size_t in_sg_left = 0;
> > +    do {
> > +        while (skip_size != 0 && in_sg_cpy_count) {
> > +            if (skip_size >= in_sg_ptr[0].iov_len) {
> > +                skip_size -= in_sg_ptr[0].iov_len;
> > +                in_sg_ptr++;
> > +                in_sg_cpy_count--;
> > +            } else {
> > +                in_sg_ptr[0].iov_len -= skip_size;
> > +                in_sg_ptr[0].iov_base += skip_size;
> > +                break;
> > +            }
> > +        }
> > +
> > +        int i;
> > +        for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) {
> > +            in_sg_left += in_sg_ptr[i].iov_len;
> > +        }
> > +        fuse_log(FUSE_LOG_DEBUG,
> > +                 "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
> > +                 "in_sg_left=%zd\n",
> > +                 __func__, skip_size, in_sg_cpy_count, in_sg_left);
> > +        ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
> > +                     buf->buf[0].pos);
> > +
> 
> > +        fuse_log(FUSE_LOG_DEBUG, "%s: preadv_res=%d(%m) len=%zd\n",
> > +                 __func__, ret, len);
> 
> "%m" should be removed? because it may show the previous errno even if preadv()
> is succsess. Like as:
> 
> [ID: 00000079] virtio_send_data_iov: after skip skip_size=0 in_sg_cpy_count=1 in_sg_left=65536
> [ID: 00000079] virtio_send_data_iov: preadv_res=16000(No such file or directory) len=65536

I think there's another problem: fuse_log() might clobber errno, so
we could return a bad errno below it.
So I'll split it into two separate fuse_log() calls - one inside the
(ret == -1) block with the %m, and one after it without.

> Otherwise, looks good to me:
> 
> Reviewed-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>

Thanks

> 
> Thanks,
> Masa
> 
> > +        if (ret == -1) {
> > +            ret = errno;
> > +            free(in_sg_cpy);
> > +            goto err;
> > +        }
> > +        if (ret < len && ret) {
> > +            fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
> > +            /* Skip over this much next time around */
> > +            skip_size = ret;
> > +            buf->buf[0].pos += ret;
> > +            len -= ret;
> > +
> > +            /* Lets do another read */
> > +            continue;
> > +        }
> > +        if (!ret) {
> > +            /* EOF case? */
> > +            fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__,
> > +                     in_sg_left);
> > +            break;
> > +        }
> > +        if (ret != len) {
> > +            fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__);
> > +            ret = EIO;
> > +            free(in_sg_cpy);
> > +            goto err;
> > +        }
> > +        in_sg_left -= ret;
> > +        len -= ret;
> > +    } while (in_sg_left);
> > +    free(in_sg_cpy);
> > +
> > +    /* Need to fix out->len on EOF */
> > +    if (len) {
> > +        struct fuse_out_header *out_sg = in_sg[0].iov_base;
> > +
> > +        tosend_len -= len;
> > +        out_sg->len = tosend_len;
> > +    }
> > +
> > +    ret = 0;
> > +
> > +    vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len);
> > +    vu_queue_notify(&se->virtio_dev->dev, q);
> > +
> > +err:
> > +    if (ret == 0) {
> > +        ch->qi->reply_sent = true;
> > +    }
> > +
> > +    return ret;
> > +}
> > +
> >  /* Thread function for individual queues, created when a queue is 'started' */
> >  static void *fv_queue_thread(void *opaque)
> >  {
> > diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h
> > index 135a14875a..cc676b9193 100644
> > --- a/tools/virtiofsd/fuse_virtio.h
> > +++ b/tools/virtiofsd/fuse_virtio.h
> > @@ -26,4 +26,8 @@ int virtio_loop(struct fuse_session *se);
> >  int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
> >                      struct iovec *iov, int count);
> >  
> > +int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
> > +                         struct iovec *iov, int count,
> > +                         struct fuse_bufvec *buf, size_t len);
> > +
> >  #endif
> > -- 
> > 2.23.0
> > 
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox series

Patch

diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index c2b114cf5b..5f80625652 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -475,6 +475,11 @@  static int fuse_send_data_iov_fallback(struct fuse_session *se,
         return fuse_send_msg(se, ch, iov, iov_count);
     }
 
+    if (fuse_lowlevel_is_virtio(se) && buf->count == 1 &&
+        buf->buf[0].flags == (FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK)) {
+        return virtio_send_data_iov(se, ch, iov, iov_count, buf, len);
+    }
+
     abort(); /* Will have taken vhost path */
     return 0;
 }
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index c33e0f7e8c..146cd3f702 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -230,6 +230,165 @@  err:
     return ret;
 }
 
+/*
+ * Called from fuse_send_data_iov_* for virtio when the buffer is a single
+ * FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK: copies the reply iov into the
+ * element's in_sg, then preadv()s the file data in directly after it.
+ * Returns 0 on success; failures return errno, EIO or -E2BIG (sign differs).
+ */
+int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
+                         struct iovec *iov, int count, struct fuse_bufvec *buf,
+                         size_t len)
+{
+    int ret = 0;
+    VuVirtqElement *elem;
+    VuVirtq *q;
+
+    assert(count >= 1);
+    assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
+
+    struct fuse_out_header *out = iov[0].iov_base;
+    /* TODO: Endianness! out->len below is stored without conversion */
+
+    size_t iov_len = iov_size(iov, count);
+    size_t tosend_len = iov_len + len;
+
+    out->len = tosend_len;
+
+    fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
+             count, len, iov_len);
+
+    /* unique == 0 is a notification, which we don't support */
+    assert(out->unique);
+
+    /* For virtio we always have ch */
+    assert(ch);
+    assert(!ch->qi->reply_sent);
+    elem = ch->qi->qe;
+    q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx];
+
+    /* The 'in' part of the elem is to qemu */
+    unsigned int in_num = elem->in_num;
+    struct iovec *in_sg = elem->in_sg;
+    size_t in_len = iov_size(in_sg, in_num);
+    fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
+             __func__, elem->index, in_num, in_len);
+
+    /*
+     * The elem should have room for a 'fuse_out_header' (out from fuse)
+     * plus the data based on the len in the header.
+     */
+    if (in_len < sizeof(struct fuse_out_header)) {
+        fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
+                 __func__, elem->index);
+        ret = -E2BIG; /* NOTE(review): negative; other paths return +errno */
+        goto err;
+    }
+    if (in_len < tosend_len) {
+        fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
+                 __func__, elem->index, tosend_len);
+        ret = -E2BIG; /* NOTE(review): negative; other paths return +errno */
+        goto err;
+    }
+
+    /* TODO: Limit to 'len' */
+
+    /* First copy the header data from iov into in_sg */
+    copy_iov(iov, count, in_sg, in_num, iov_len);
+
+    /*
+     * Build a copy of the in_sg iov so we can skip bits in it, including
+     * changing the offsets.  NOTE(review): the calloc result is unchecked.
+     */
+    struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num);
+    memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
+    /* These get updated as we skip; in_sg_cpy itself is kept for free() */
+    struct iovec *in_sg_ptr = in_sg_cpy;
+    int in_sg_cpy_count = in_num;
+
+    /* Skip over the parts of in_sg that now contain the header iov */
+    size_t skip_size = iov_len;
+
+    size_t in_sg_left = 0;
+    do {
+        while (skip_size != 0 && in_sg_cpy_count) {
+            if (skip_size >= in_sg_ptr[0].iov_len) {
+                skip_size -= in_sg_ptr[0].iov_len;
+                in_sg_ptr++;
+                in_sg_cpy_count--;
+            } else {
+                in_sg_ptr[0].iov_len -= skip_size;
+                in_sg_ptr[0].iov_base += skip_size;
+                break;
+            }
+        }
+        /* Recompute how much room remains in the trimmed in_sg copy */
+        int i;
+        for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) {
+            in_sg_left += in_sg_ptr[i].iov_len;
+        }
+        fuse_log(FUSE_LOG_DEBUG,
+                 "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
+                 "in_sg_left=%zd\n",
+                 __func__, skip_size, in_sg_cpy_count, in_sg_left);
+        ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
+                     buf->buf[0].pos);
+        /* NOTE(review): %m below is stale unless preadv returned -1 */
+        fuse_log(FUSE_LOG_DEBUG, "%s: preadv_res=%d(%m) len=%zd\n",
+                 __func__, ret, len);
+        if (ret == -1) {
+            ret = errno; /* NOTE(review): fuse_log above may clobber errno */
+            free(in_sg_cpy);
+            goto err;
+        }
+        if (ret < len && ret) {
+            fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
+            /* Short read: skip over this much next time around */
+            skip_size = ret;
+            buf->buf[0].pos += ret;
+            len -= ret;
+
+            /* Let's do another read */
+            continue;
+        }
+        if (!ret) {
+            /* EOF case? Leave the loop with len still non-zero */
+            fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__,
+                     in_sg_left);
+            break;
+        }
+        if (ret != len) { /* only reachable when ret > len */
+            fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__);
+            ret = EIO;
+            free(in_sg_cpy);
+            goto err;
+        }
+        in_sg_left -= ret;
+        len -= ret;
+    } while (in_sg_left);
+    free(in_sg_cpy);
+
+    /* Need to fix out->len on EOF: patch the header copy held in in_sg */
+    if (len) {
+        struct fuse_out_header *out_sg = in_sg[0].iov_base;
+
+        tosend_len -= len;
+        out_sg->len = tosend_len;
+    }
+
+    ret = 0;
+
+    vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len);
+    vu_queue_notify(&se->virtio_dev->dev, q);
+
+err:
+    if (ret == 0) {
+        ch->qi->reply_sent = true;
+    }
+
+    return ret;
+}
+
 /* Thread function for individual queues, created when a queue is 'started' */
 static void *fv_queue_thread(void *opaque)
 {
diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h
index 135a14875a..cc676b9193 100644
--- a/tools/virtiofsd/fuse_virtio.h
+++ b/tools/virtiofsd/fuse_virtio.h
@@ -26,4 +26,8 @@  int virtio_loop(struct fuse_session *se);
 int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
                     struct iovec *iov, int count);
 
+int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
+                         struct iovec *iov, int count,
+                         struct fuse_bufvec *buf, size_t len);
+
 #endif