Patchwork [2/2] dataplane: change vring API to use VirtQueueElement

login
register
mail settings
Submitter Paolo Bonzini
Date May 10, 2013, 12:27 p.m.
Message ID <1368188849-5647-3-git-send-email-pbonzini@redhat.com>
Download mbox | patch
Permalink /patch/242976/
State New
Headers show

Comments

Paolo Bonzini - May 10, 2013, 12:27 p.m.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/block/dataplane/virtio-blk.c     | 83 ++++++++++++++-----------------------
 hw/virtio/dataplane/vring.c         | 46 +++++++++++---------
 include/hw/virtio/dataplane/vring.h |  6 +--
 3 files changed, 59 insertions(+), 76 deletions(-)
Stefan Hajnoczi - May 13, 2013, 11:48 a.m.
On Fri, May 10, 2013 at 02:27:29PM +0200, Paolo Bonzini wrote:
> @@ -305,29 +285,28 @@ static void handle_notify(EventNotifier *e)
>          vring_disable_notification(s->vdev, &s->vring);
>  
>          for (;;) {
> -            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
> -            if (head < 0) {
> +            ret = vring_pop(s->vdev, &s->vring, &elem);
> +            if (ret < 0) {
>                  break; /* no more requests */

elem is leaked - we don't vring_push(elem)!

>              }
>  
> -            trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
> -                                                        head);
> +            trace_virtio_blk_data_plane_process_request(s, elem->out_num,
> +                                                        elem->in_num, elem->index);
>  
> -            if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
> +            if (process_request(&s->ioqueue, elem) < 0) {
>                  vring_set_broken(&s->vring);
>                  break;

elem is leaked on -EFAULT.

> @@ -268,6 +270,10 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
>       * the index we've seen. */
>      head = vring->vr.avail->ring[last_avail_idx % num];
>  
> +    elem = *p_elem = g_slice_new(VirtQueueElement);
> +    memset(elem, 0, sizeof(*elem));

VirtQueueElement is 48 KB.  We cannot affort to memset, see
de6c8042ec55da18702fa51f09072fcaa315edc3.  At that time dd if=/dev/vda
of=/dev/null iflag=direct bs=8k resulted in 10% host CPU utilization in
memset.

Patch

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 0356665..a665115 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -36,7 +36,7 @@  enum {
 typedef struct {
     struct iocb iocb;               /* Linux AIO control block */
     QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
-    unsigned int head;              /* vring descriptor index */
+    VirtQueueElement *elem;         /* saved data from the virtqueue */
     struct iovec *bounce_iov;       /* used if guest buffers are unaligned */
     QEMUIOVector *read_qiov;        /* for read completion /w bounce buffer */
 } VirtIOBlockRequest;
@@ -98,7 +98,7 @@  static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
         len = 0;
     }
 
-    trace_virtio_blk_data_plane_complete_request(s, req->head, ret);
+    trace_virtio_blk_data_plane_complete_request(s, req->elem->index, ret);
 
     if (req->read_qiov) {
         assert(req->bounce_iov);
@@ -120,12 +120,12 @@  static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
      * written to, but for virtio-blk it seems to be the number of bytes
      * transferred plus the status bytes.
      */
-    vring_push(&s->vring, req->head, len + sizeof(hdr));
-
+    vring_push(&s->vring, req->elem, len + sizeof(hdr));
+    req->elem = NULL;
     s->num_reqs--;
 }
 
-static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
+static void complete_request_early(VirtIOBlockDataPlane *s, VirtQueueElement *elem,
                                    QEMUIOVector *inhdr, unsigned char status)
 {
     struct virtio_blk_inhdr hdr = {
@@ -136,26 +136,26 @@  static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
     qemu_iovec_destroy(inhdr);
     g_slice_free(QEMUIOVector, inhdr);
 
-    vring_push(&s->vring, head, sizeof(hdr));
+    vring_push(&s->vring, elem, sizeof(hdr));
     notify_guest(s);
 }
 
 /* Get disk serial number */
 static void do_get_id_cmd(VirtIOBlockDataPlane *s,
                           struct iovec *iov, unsigned int iov_cnt,
-                          unsigned int head, QEMUIOVector *inhdr)
+                          VirtQueueElement *elem, QEMUIOVector *inhdr)
 {
     char id[VIRTIO_BLK_ID_BYTES];
 
     /* Serial number not NUL-terminated when shorter than buffer */
     strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id));
     iov_from_buf(iov, iov_cnt, 0, id, sizeof(id));
-    complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
+    complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_OK);
 }
 
 static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
-                       struct iovec *iov, unsigned int iov_cnt,
-                       long long offset, unsigned int head,
+                       struct iovec *iov, unsigned iov_cnt,
+                       long long offset, VirtQueueElement *elem,
                        QEMUIOVector *inhdr)
 {
     struct iocb *iocb;
@@ -188,19 +188,20 @@  static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
 
     /* Fill in virtio block metadata needed for completion */
     VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
-    req->head = head;
+    req->elem = elem;
     req->inhdr = inhdr;
     req->bounce_iov = bounce_iov;
     req->read_qiov = read_qiov;
     return 0;
 }
 
-static int process_request(IOQueue *ioq, struct iovec iov[],
-                           unsigned int out_num, unsigned int in_num,
-                           unsigned int head)
+static int process_request(IOQueue *ioq, VirtQueueElement *elem)
 {
     VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
-    struct iovec *in_iov = &iov[out_num];
+    struct iovec *iov = elem->out_sg;
+    struct iovec *in_iov = elem->in_sg;
+    unsigned out_num = elem->out_num;
+    unsigned in_num = elem->in_num;
     struct virtio_blk_outhdr outhdr;
     QEMUIOVector *inhdr;
     size_t in_size;
@@ -231,29 +232,29 @@  static int process_request(IOQueue *ioq, struct iovec iov[],
 
     switch (outhdr.type) {
     case VIRTIO_BLK_T_IN:
-        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr);
+        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, elem, inhdr);
         return 0;
 
     case VIRTIO_BLK_T_OUT:
-        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr);
+        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, elem, inhdr);
         return 0;
 
     case VIRTIO_BLK_T_SCSI_CMD:
         /* TODO support SCSI commands */
-        complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
+        complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_UNSUPP);
         return 0;
 
     case VIRTIO_BLK_T_FLUSH:
         /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
         if (qemu_fdatasync(s->fd) < 0) {
-            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR);
+            complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_IOERR);
         } else {
-            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
+            complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_OK);
         }
         return 0;
 
     case VIRTIO_BLK_T_GET_ID:
-        do_get_id_cmd(s, in_iov, in_num, head, inhdr);
+        do_get_id_cmd(s, in_iov, in_num, elem, inhdr);
         return 0;
 
     default:
@@ -274,29 +275,8 @@  static void handle_notify(EventNotifier *e)
     VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane,
                                            host_notifier);
 
-    /* There is one array of iovecs into which all new requests are extracted
-     * from the vring.  Requests are read from the vring and the translated
-     * descriptors are written to the iovecs array.  The iovecs do not have to
-     * persist across handle_notify() calls because the kernel copies the
-     * iovecs on io_submit().
-     *
-     * Handling io_submit() EAGAIN may require storing the requests across
-     * handle_notify() calls until the kernel has sufficient resources to
-     * accept more I/O.  This is not implemented yet.
-     */
-    struct iovec iovec[VRING_MAX];
-    struct iovec *end = &iovec[VRING_MAX];
-    struct iovec *iov = iovec;
-
-    /* When a request is read from the vring, the index of the first descriptor
-     * (aka head) is returned so that the completed request can be pushed onto
-     * the vring later.
-     *
-     * The number of hypervisor read-only iovecs is out_num.  The number of
-     * hypervisor write-only iovecs is in_num.
-     */
-    int head;
-    unsigned int out_num = 0, in_num = 0;
+    VirtQueueElement *elem;
+    int ret;
     unsigned int num_queued;
 
     event_notifier_test_and_clear(&s->host_notifier);
@@ -305,29 +285,28 @@  static void handle_notify(EventNotifier *e)
         vring_disable_notification(s->vdev, &s->vring);
 
         for (;;) {
-            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
-            if (head < 0) {
+            ret = vring_pop(s->vdev, &s->vring, &elem);
+            if (ret < 0) {
                 break; /* no more requests */
             }
 
-            trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
-                                                        head);
+            trace_virtio_blk_data_plane_process_request(s, elem->out_num,
+                                                        elem->in_num, elem->index);
 
-            if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
+            if (process_request(&s->ioqueue, elem) < 0) {
                 vring_set_broken(&s->vring);
                 break;
             }
-            iov += out_num + in_num;
         }
 
-        if (likely(head == -EAGAIN)) { /* vring emptied */
+        if (likely(ret == -EAGAIN)) { /* vring emptied */
             /* Re-enable guest->host notifies and stop processing the vring.
              * But if the guest has snuck in more descriptors, keep processing.
              */
             if (vring_enable_notification(s->vdev, &s->vring)) {
                 break;
             }
-        } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */
+        } else { /* ret == -ENOBUFS or fatal error, iovecs[] is depleted */
             /* Since there are no iovecs[] left, stop processing for now.  Do
              * not re-enable guest->host notifies since the I/O completion
              * handler knows to check for more vring descriptors anyway.
diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index bd1ff25..a358dd5 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -108,29 +108,32 @@  bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
 }
 
 
-static int get_desc(Vring *vring,
-                    struct iovec iov[], struct iovec *iov_end,
-                    unsigned int *out_num, unsigned int *in_num,
+static int get_desc(Vring *vring, VirtQueueElement *elem,
                     struct vring_desc *desc)
 {
     unsigned *num;
+    struct iovec *iov;
+    hwaddr *addr;
 
     if (desc->flags & VRING_DESC_F_WRITE) {
-        num = in_num;
+        num = &elem->in_num;
+        iov = &elem->in_sg[*num];
+        addr = &elem->in_addr[*num];
     } else {
-        num = out_num;
+        num = &elem->out_num;
+        iov = &elem->out_sg[*num];
+        addr = &elem->out_addr[*num];
 
         /* If it's an output descriptor, they're all supposed
          * to come before any input descriptors. */
-        if (unlikely(*in_num)) {
+        if (unlikely(elem->in_num)) {
             error_report("Descriptor has out after in");
             return -EFAULT;
         }
     }
 
     /* Stop for now if there are not enough iovecs available. */
-    iov += *in_num + *out_num;
-    if (iov >= iov_end) {
+    if (*num >= VIRTQUEUE_MAX_SIZE) {
         return -ENOBUFS;
     }
 
@@ -144,14 +147,13 @@  static int get_desc(Vring *vring,
     }
 
     iov->iov_len = desc->len;
+    *addr = desc->addr;
     *num += 1;
     return 0;
 }
 
 /* This is stolen from linux/drivers/vhost/vhost.c. */
-static int get_indirect(Vring *vring,
-                        struct iovec iov[], struct iovec *iov_end,
-                        unsigned int *out_num, unsigned int *in_num,
+static int get_indirect(Vring *vring, VirtQueueElement *elem,
                         struct vring_desc *indirect)
 {
     struct vring_desc desc;
@@ -209,7 +211,7 @@  static int get_indirect(Vring *vring,
             return -EFAULT;
         }
 
-        ret = get_desc(vring, iov, iov_end, out_num, in_num, &desc);
+        ret = get_desc(vring, elem, &desc);
         if (ret < 0) {
             vring->broken |= (ret == -EFAULT);
             return ret;
@@ -231,12 +233,12 @@  static int get_indirect(Vring *vring,
  * Stolen from linux/drivers/vhost/vhost.c.
  */
 int vring_pop(VirtIODevice *vdev, Vring *vring,
-              struct iovec iov[], struct iovec *iov_end,
-              unsigned int *out_num, unsigned int *in_num)
+              VirtQueueElement **p_elem)
 {
     struct vring_desc desc;
     unsigned int i, head, found = 0, num = vring->vr.num;
     uint16_t avail_idx, last_avail_idx;
+    VirtQueueElement *elem;
     int ret;
 
     /* If there was a fatal error then refuse operation */
@@ -268,6 +270,10 @@  int vring_pop(VirtIODevice *vdev, Vring *vring,
      * the index we've seen. */
     head = vring->vr.avail->ring[last_avail_idx % num];
 
+    elem = *p_elem = g_slice_new(VirtQueueElement);
+    memset(elem, 0, sizeof(*elem));
+    elem->index = head;
+    
     /* If their number is silly, that's an error. */
     if (unlikely(head >= num)) {
         error_report("Guest says index %u > %u is available", head, num);
@@ -279,9 +285,6 @@  int vring_pop(VirtIODevice *vdev, Vring *vring,
         vring_avail_event(&vring->vr) = vring->vr.avail->idx;
     }
 
-    /* When we start there are none of either input nor output. */
-    *out_num = *in_num = 0;
-
     i = head;
     do {
         if (unlikely(i >= num)) {
@@ -301,14 +304,14 @@  int vring_pop(VirtIODevice *vdev, Vring *vring,
         barrier();
 
         if (desc.flags & VRING_DESC_F_INDIRECT) {
-            int ret = get_indirect(vring, iov, iov_end, out_num, in_num, &desc);
+            int ret = get_indirect(vring, elem, &desc);
             if (ret < 0) {
                 return ret;
             }
             continue;
         }
 
-        ret = get_desc(vring, iov, iov_end, out_num, in_num, &desc);
+        ret = get_desc(vring, elem, &desc);
         if (ret < 0) {
             vring->broken |= (ret == -EFAULT);
             return ret;
@@ -326,11 +329,14 @@  int vring_pop(VirtIODevice *vdev, Vring *vring,
  *
  * Stolen from linux/drivers/vhost/vhost.c.
  */
-void vring_push(Vring *vring, unsigned int head, int len)
+void vring_push(Vring *vring, VirtQueueElement *elem, int len)
 {
     struct vring_used_elem *used;
+    unsigned int head = elem->index;
     uint16_t new;
 
+    g_slice_free(VirtQueueElement, elem);
+
     /* Don't touch vring if a fatal error occurred */
     if (vring->broken) {
         return;
diff --git a/include/hw/virtio/dataplane/vring.h b/include/hw/virtio/dataplane/vring.h
index 9380cb5..d9c9f8f 100644
--- a/include/hw/virtio/dataplane/vring.h
+++ b/include/hw/virtio/dataplane/vring.h
@@ -54,9 +54,7 @@  void vring_teardown(Vring *vring);
 void vring_disable_notification(VirtIODevice *vdev, Vring *vring);
 bool vring_enable_notification(VirtIODevice *vdev, Vring *vring);
 bool vring_should_notify(VirtIODevice *vdev, Vring *vring);
-int vring_pop(VirtIODevice *vdev, Vring *vring,
-              struct iovec iov[], struct iovec *iov_end,
-              unsigned int *out_num, unsigned int *in_num);
-void vring_push(Vring *vring, unsigned int head, int len);
+int vring_pop(VirtIODevice *vdev, Vring *vring, VirtQueueElement **elem);
+void vring_push(Vring *vring, VirtQueueElement *elem, int len);
 
 #endif /* VRING_H */