diff mbox

[v2,2/2] virtio-scsi: Optimize virtio_scsi_init_req

Message ID 1410758605-29375-3-git-send-email-famz@redhat.com
State New
Headers show

Commit Message

Fam Zheng Sept. 15, 2014, 5:23 a.m. UTC
The VirtQueueElement is a very big structure (>48k!). Since it will be
initialized by virtqueue_pop, we can save the expensive zeroing here.

This saves a few microseconds per request in my test:

[fio-test]      rw         bs         iodepth    jobs       bw         iops       latency
--------------------------------------------------------------------------------------------
Before          read       4k         1          1          110        28269      34
After           read       4k         1          1          131        33745      28

Whereas,

virtio-blk      read       4k         1          1          217        55673      16

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 hw/scsi/virtio-scsi.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

Comments

Paolo Bonzini Sept. 15, 2014, 10:17 a.m. UTC | #1
Il 15/09/2014 07:23, Fam Zheng ha scritto:
>      SCSIRequest *sreq;
>      size_t resp_size;
>      enum SCSIXferMode mode;
> -    QEMUIOVector resp_iov;
>      union {
>          VirtIOSCSICmdResp     cmd;
>          VirtIOSCSICtrlTMFResp tmf;
> @@ -68,23 +75,27 @@ static inline SCSIDevice *virtio_scsi_device_find(VirtIOSCSI *s, uint8_t *lun)
>  static VirtIOSCSIReq *virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq)
>  {
>      VirtIOSCSIReq *req;
> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
> -
> -    req = g_malloc0(sizeof(*req) + vs->cdb_size);
> +    VirtIOSCSICommon *vs = (VirtIOSCSICommon *)s;
> +    const size_t zero_skip = offsetof(VirtIOSCSIReq, elem)
> +                             + sizeof(VirtQueueElement);
>  
> +    req = g_slice_alloc(sizeof(*req) + vs->cdb_size);

Looks good, but why do you need to zero the union?  You only need to
zero sreq, resp_size and mode, don't you (and at this point, memset
becomes superfluous)?

Paolo
Fam Zheng Sept. 16, 2014, 7:16 a.m. UTC | #2
On Mon, 09/15 12:17, Paolo Bonzini wrote:
> Il 15/09/2014 07:23, Fam Zheng ha scritto:
> >      SCSIRequest *sreq;
> >      size_t resp_size;
> >      enum SCSIXferMode mode;
> > -    QEMUIOVector resp_iov;
> >      union {
> >          VirtIOSCSICmdResp     cmd;
> >          VirtIOSCSICtrlTMFResp tmf;
> > @@ -68,23 +75,27 @@ static inline SCSIDevice *virtio_scsi_device_find(VirtIOSCSI *s, uint8_t *lun)
> >  static VirtIOSCSIReq *virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq)
> >  {
> >      VirtIOSCSIReq *req;
> > -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
> > -
> > -    req = g_malloc0(sizeof(*req) + vs->cdb_size);
> > +    VirtIOSCSICommon *vs = (VirtIOSCSICommon *)s;
> > +    const size_t zero_skip = offsetof(VirtIOSCSIReq, elem)
> > +                             + sizeof(VirtQueueElement);
> >  
> > +    req = g_slice_alloc(sizeof(*req) + vs->cdb_size);
> 
> Looks good, but why do you need to zero the union?  You only need to
> zero sreq, resp_size and mode, don't you (and at this point, memset
> becomes superfluous)?
> 

The structures in the union are not zeroed by the caller; also, leaving them
un-zeroed breaks virtio-scsi in my test.

FWIW, I will remove the "req->sreq = NULL;" two lines below in v3. At this
point, tuning these small fields is a subtle optimization compared to the
arrays, so I say let's simply keep the memset so that adding more fields in
the future is also safe.

Fam
Paolo Bonzini Sept. 16, 2014, 8:15 a.m. UTC | #3
Il 16/09/2014 09:16, Fam Zheng ha scritto:
> On Mon, 09/15 12:17, Paolo Bonzini wrote:
>> Il 15/09/2014 07:23, Fam Zheng ha scritto:
>>>      SCSIRequest *sreq;
>>>      size_t resp_size;
>>>      enum SCSIXferMode mode;
>>> -    QEMUIOVector resp_iov;
>>>      union {
>>>          VirtIOSCSICmdResp     cmd;
>>>          VirtIOSCSICtrlTMFResp tmf;
>>> @@ -68,23 +75,27 @@ static inline SCSIDevice *virtio_scsi_device_find(VirtIOSCSI *s, uint8_t *lun)
>>>  static VirtIOSCSIReq *virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq)
>>>  {
>>>      VirtIOSCSIReq *req;
>>> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
>>> -
>>> -    req = g_malloc0(sizeof(*req) + vs->cdb_size);
>>> +    VirtIOSCSICommon *vs = (VirtIOSCSICommon *)s;
>>> +    const size_t zero_skip = offsetof(VirtIOSCSIReq, elem)
>>> +                             + sizeof(VirtQueueElement);
>>>  
>>> +    req = g_slice_alloc(sizeof(*req) + vs->cdb_size);
>>
>> Looks good, but why do you need to zero the union?  You only need to
>> zero sreq, resp_size and mode, don't you (and at this point, memset
>> becomes superfluous)?
>>
> 
> The structures in unions are not zeroed by caller, also leaving them breaks
> virtio-scsi in my test.
> 
> FWIW, I will remove the "req->sreq = NULL;" two lines below in v3. At this
> point tuning these small fields are subtle optimization compared to the arrays,
> I say let's just simply keep the memset so that adding more fields in the
> future are also safe.

Perhaps the response fields have to be zeroed?  The request shouldn't
need it.  It can be done separately though---the VirtQueueElement is the
big one that we have to fix.

Paolo
diff mbox

Patch

diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 86aba88..7bf03c4 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -24,12 +24,19 @@ 
 typedef struct VirtIOSCSIReq {
     VirtIOSCSI *dev;
     VirtQueue *vq;
-    VirtQueueElement elem;
     QEMUSGList qsgl;
+    QEMUIOVector resp_iov;
+
+    /* Note:
+     * - fields before elem are initialized by virtio_scsi_init_req;
+     * - elem is uninitialized at the time of allocation.
+     * - fields after elem are zeroed by virtio_scsi_init_req.
+     * */
+
+    VirtQueueElement elem;
     SCSIRequest *sreq;
     size_t resp_size;
     enum SCSIXferMode mode;
-    QEMUIOVector resp_iov;
     union {
         VirtIOSCSICmdResp     cmd;
         VirtIOSCSICtrlTMFResp tmf;
@@ -68,23 +75,27 @@  static inline SCSIDevice *virtio_scsi_device_find(VirtIOSCSI *s, uint8_t *lun)
 static VirtIOSCSIReq *virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq)
 {
     VirtIOSCSIReq *req;
-    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
-
-    req = g_malloc0(sizeof(*req) + vs->cdb_size);
+    VirtIOSCSICommon *vs = (VirtIOSCSICommon *)s;
+    const size_t zero_skip = offsetof(VirtIOSCSIReq, elem)
+                             + sizeof(VirtQueueElement);
 
+    req = g_slice_alloc(sizeof(*req) + vs->cdb_size);
     req->vq = vq;
     req->dev = s;
     req->sreq = NULL;
     qemu_sglist_init(&req->qsgl, DEVICE(s), 8, &address_space_memory);
     qemu_iovec_init(&req->resp_iov, 1);
+    memset((uint8_t *)req + zero_skip, 0, sizeof(*req) - zero_skip);
     return req;
 }
 
 static void virtio_scsi_free_req(VirtIOSCSIReq *req)
 {
+    VirtIOSCSICommon *vs = (VirtIOSCSICommon *)req->dev;
+
     qemu_iovec_destroy(&req->resp_iov);
     qemu_sglist_destroy(&req->qsgl);
-    g_free(req);
+    g_slice_free1(sizeof(*req) + vs->cdb_size, req);
 }
 
 static void virtio_scsi_complete_req(VirtIOSCSIReq *req)