diff mbox

[07/10] virtio: combine the read of a descriptor

Message ID 1454236146-23293-8-git-send-email-pbonzini@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini Jan. 31, 2016, 10:29 a.m. UTC
Compared to vring, virtio has a performance penalty of 10%.  Fix it
by combining all the reads for a descriptor in a single address_space_read
call.  This also simplifies the code nicely.

Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/virtio/virtio.c | 86 ++++++++++++++++++++++--------------------------------
 1 file changed, 35 insertions(+), 51 deletions(-)

Comments

Gonglei (Arei) Feb. 3, 2016, 12:34 p.m. UTC | #1
Hi,

> Subject: [Qemu-devel] [PATCH 07/10] virtio: combine the read of a descriptor
> 
> Compared to vring, virtio has a performance penalty of 10%.  Fix it
> by combining all the reads for a descriptor in a single address_space_read
> call.  This also simplifies the code nicely.
> 
> Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  hw/virtio/virtio.c | 86 ++++++++++++++++++++++--------------------------------
>  1 file changed, 35 insertions(+), 51 deletions(-)
> 

Unbelievable! After applying this patch, the virtio-crypto speed can reach 74 MB/sec; host
CPU overhead is 180% (the main thread 100% and vcpu threads 80%)

Testing AES-128-CBC cipher: 
        Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs: 74.12 MiB/sec (1523475 packets)
        Encrypting in chunks of 256 bytes: done. 369.85 MiB in 5.01 secs: 73.88 MiB/sec (1514900 packets)
        Encrypting in chunks of 256 bytes: done. 371.07 MiB in 5.02 secs: 73.97 MiB/sec (1519914 packets)
        Encrypting in chunks of 256 bytes: done. 371.66 MiB in 5.02 secs: 74.09 MiB/sec (1522309 packets)
        Encrypting in chunks of 256 bytes: done. 371.79 MiB in 5.02 secs: 74.12 MiB/sec (1522868 packets)
        Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs: 74.15 MiB/sec (1523457 packets)
        Encrypting in chunks of 256 bytes: done. 371.90 MiB in 5.02 secs: 74.14 MiB/sec (1523317 packets)
        Encrypting in chunks of 256 bytes: done. 371.71 MiB in 5.02 secs: 74.10 MiB/sec (1522522 packets)

15.95%  qemu-kvm                 [.] address_space_translate
  6.98%  qemu-kvm                 [.] qemu_get_ram_ptr
  4.87%  libpthread-2.19.so       [.] __pthread_mutex_unlock_usercnt
  4.40%  qemu-kvm                 [.] qemu_ram_addr_from_host
  3.79%  qemu-kvm                 [.] address_space_map
  3.41%  libc-2.19.so             [.] _int_malloc
  3.29%  libc-2.19.so             [.] _int_free
  3.07%  libc-2.19.so             [.] malloc
  2.95%  libpthread-2.19.so       [.] pthread_mutex_lock
  2.94%  qemu-kvm                 [.] phys_page_find
  2.73%  qemu-kvm                 [.] address_space_translate_internal
  2.65%  libc-2.19.so             [.] malloc_consolidate
  2.35%  libc-2.19.so             [.] __memcpy_sse2_unaligned
  1.72%  qemu-kvm                 [.] find_next_zero_bit
  1.38%  qemu-kvm                 [.] address_space_rw
  1.34%  qemu-kvm                 [.] object_unref
  1.30%  qemu-kvm                 [.] object_ref
  1.28%  qemu-kvm                 [.] virtqueue_pop
  1.20%  libc-2.19.so             [.] memset
  1.11%  qemu-kvm                 [.] virtio_notify

Thank you so much!

Regards,
-Gonglei

> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 79a635f..2433866 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -107,35 +107,15 @@ void virtio_queue_update_rings(VirtIODevice *vdev,
> int n)
>                                vring->align);
>  }
> 
> -static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa,
> -                                       int i)
> +static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc,
> +                            hwaddr desc_pa, int i)
>  {
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
> -    return virtio_ldq_phys(vdev, pa);
> -}
> -
> -static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa, int
> i)
> -{
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
> -    return virtio_ldl_phys(vdev, pa);
> -}
> -
> -static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa,
> -                                        int i)
> -{
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
> -    return virtio_lduw_phys(vdev, pa);
> -}
> -
> -static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa,
> -                                       int i)
> -{
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
> -    return virtio_lduw_phys(vdev, pa);
> +    address_space_read(&address_space_memory, desc_pa + i *
> sizeof(VRingDesc),
> +                       MEMTXATTRS_UNSPECIFIED, (void *)desc,
> sizeof(VRingDesc));
> +    virtio_tswap64s(vdev, &desc->addr);
> +    virtio_tswap32s(vdev, &desc->len);
> +    virtio_tswap16s(vdev, &desc->flags);
> +    virtio_tswap16s(vdev, &desc->next);
>  }
> 
>  static inline uint16_t vring_avail_flags(VirtQueue *vq)
> @@ -345,18 +325,18 @@ static unsigned int virtqueue_get_head(VirtQueue
> *vq, unsigned int idx)
>      return head;
>  }
> 
> -static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
> -                                    unsigned int i, unsigned int max)
> +static unsigned virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc
> *desc,
> +                                         hwaddr desc_pa, unsigned
> int max)
>  {
>      unsigned int next;
> 
>      /* If this descriptor says it doesn't chain, we're done. */
> -    if (!(vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_NEXT)) {
> +    if (!(desc->flags & VRING_DESC_F_NEXT)) {
>          return max;
>      }
> 
>      /* Check they're not leading us off end of descriptors. */
> -    next = vring_desc_next(vdev, desc_pa, i);
> +    next = desc->next;
>      /* Make sure compiler knows to grab that: we don't want it changing! */
>      smp_wmb();
> 
> @@ -365,6 +345,7 @@ static unsigned virtqueue_next_desc(VirtIODevice
> *vdev, hwaddr desc_pa,
>          exit(1);
>      }
> 
> +    vring_desc_read(vdev, desc, desc_pa, next);
>      return next;
>  }
> 
> @@ -381,6 +362,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
> unsigned int *in_bytes,
>      while (virtqueue_num_heads(vq, idx)) {
>          VirtIODevice *vdev = vq->vdev;
>          unsigned int max, num_bufs, indirect = 0;
> +        VRingDesc desc;
>          hwaddr desc_pa;
>          int i;
> 
> @@ -388,9 +370,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
> unsigned int *in_bytes,
>          num_bufs = total_bufs;
>          i = virtqueue_get_head(vq, idx++);
>          desc_pa = vq->vring.desc;
> +        vring_desc_read(vdev, &desc, desc_pa, i);
> 
> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
> -            if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
> +        if (desc.flags & VRING_DESC_F_INDIRECT) {
> +            if (desc.len % sizeof(VRingDesc)) {
>                  error_report("Invalid size for indirect buffer table");
>                  exit(1);
>              }
> @@ -403,9 +386,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
> unsigned int *in_bytes,
> 
>              /* loop over the indirect descriptor table */
>              indirect = 1;
> -            max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
> -            desc_pa = vring_desc_addr(vdev, desc_pa, i);
> +            max = desc.len / sizeof(VRingDesc);
> +            desc_pa = desc.addr;
>              num_bufs = i = 0;
> +            vring_desc_read(vdev, &desc, desc_pa, i);
>          }
> 
>          do {
> @@ -415,15 +399,15 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
> unsigned int *in_bytes,
>                  exit(1);
>              }
> 
> -            if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE)
> {
> -                in_total += vring_desc_len(vdev, desc_pa, i);
> +            if (desc.flags & VRING_DESC_F_WRITE) {
> +                in_total += desc.len;
>              } else {
> -                out_total += vring_desc_len(vdev, desc_pa, i);
> +                out_total += desc.len;
>              }
>              if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
>                  goto done;
>              }
> -        } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
> +        } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa,
> max)) != max);
> 
>          if (!indirect)
>              total_bufs = num_bufs;
> @@ -545,6 +529,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>      unsigned out_num, in_num;
>      hwaddr addr[VIRTQUEUE_MAX_SIZE];
>      struct iovec iov[VIRTQUEUE_MAX_SIZE];
> +    VRingDesc desc;
> 
>      if (!virtqueue_num_heads(vq, vq->last_avail_idx)) {
>          return NULL;
> @@ -560,33 +545,32 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>          vring_set_avail_event(vq, vq->last_avail_idx);
>      }
> 
> -    if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
> -        if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
> +    vring_desc_read(vdev, &desc, desc_pa, i);
> +    if (desc.flags & VRING_DESC_F_INDIRECT) {
> +        if (desc.len % sizeof(VRingDesc)) {
>              error_report("Invalid size for indirect buffer table");
>              exit(1);
>          }
> 
>          /* loop over the indirect descriptor table */
> -        max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
> -        desc_pa = vring_desc_addr(vdev, desc_pa, i);
> +        max = desc.len / sizeof(VRingDesc);
> +        desc_pa = desc.addr;
>          i = 0;
> +        vring_desc_read(vdev, &desc, desc_pa, i);
>      }
> 
>      /* Collect all the descriptors */
>      do {
> -        hwaddr pa = vring_desc_addr(vdev, desc_pa, i);
> -        size_t len = vring_desc_len(vdev, desc_pa, i);
> -
> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
> +        if (desc.flags & VRING_DESC_F_WRITE) {
>              virtqueue_map_desc(&in_num, addr + out_num, iov +
> out_num,
> -                               VIRTQUEUE_MAX_SIZE - out_num, true,
> pa, len);
> +                               VIRTQUEUE_MAX_SIZE - out_num, true,
> desc.addr, desc.len);
>          } else {
>              if (in_num) {
>                  error_report("Incorrect order for descriptors");
>                  exit(1);
>              }
>              virtqueue_map_desc(&out_num, addr, iov,
> -                               VIRTQUEUE_MAX_SIZE, false, pa, len);
> +                               VIRTQUEUE_MAX_SIZE, false,
> desc.addr, desc.len);
>          }
> 
>          /* If we've got too many, that implies a descriptor loop. */
> @@ -594,7 +578,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>              error_report("Looped descriptor");
>              exit(1);
>          }
> -    } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
> +    } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) !=
> max);
> 
>      /* Now copy what we have collected and mapped */
>      elem = virtqueue_alloc_element(sz, out_num, in_num);
> --
> 2.5.0
> 
>
Paolo Bonzini Feb. 3, 2016, 1:40 p.m. UTC | #2
On 03/02/2016 13:34, Gonglei (Arei) wrote:
> Hi,
> 
>> Subject: [Qemu-devel] [PATCH 07/10] virtio: combine the read of a descriptor
>>
>> Compared to vring, virtio has a performance penalty of 10%.  Fix it
>> by combining all the reads for a descriptor in a single address_space_read
>> call.  This also simplifies the code nicely.
>>
>> Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> ---
>>  hw/virtio/virtio.c | 86 ++++++++++++++++++++++--------------------------------
>>  1 file changed, 35 insertions(+), 51 deletions(-)
>>
> 
> Unbelievable! After applying this patch, the virtio-crypto speed can reach 74 MB/sec; host
> CPU overhead is 180% (the main thread 100% and vcpu threads 80%)

The three patches from Vincenzo will help too.  What was it like before?

Also, are you using ioeventfd or dataplane?  virtio-crypto sounds like
something that could be very easily run outside the "big QEMU lock".

Paolo

> Testing AES-128-CBC cipher: 
>         Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs: 74.12 MiB/sec (1523475 packets)
>         Encrypting in chunks of 256 bytes: done. 369.85 MiB in 5.01 secs: 73.88 MiB/sec (1514900 packets)
>         Encrypting in chunks of 256 bytes: done. 371.07 MiB in 5.02 secs: 73.97 MiB/sec (1519914 packets)
>         Encrypting in chunks of 256 bytes: done. 371.66 MiB in 5.02 secs: 74.09 MiB/sec (1522309 packets)
>         Encrypting in chunks of 256 bytes: done. 371.79 MiB in 5.02 secs: 74.12 MiB/sec (1522868 packets)
>         Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs: 74.15 MiB/sec (1523457 packets)
>         Encrypting in chunks of 256 bytes: done. 371.90 MiB in 5.02 secs: 74.14 MiB/sec (1523317 packets)
>         Encrypting in chunks of 256 bytes: done. 371.71 MiB in 5.02 secs: 74.10 MiB/sec (1522522 packets)
> 
> 15.95%  qemu-kvm                 [.] address_space_translate
>   6.98%  qemu-kvm                 [.] qemu_get_ram_ptr
>   4.87%  libpthread-2.19.so       [.] __pthread_mutex_unlock_usercnt
>   4.40%  qemu-kvm                 [.] qemu_ram_addr_from_host
>   3.79%  qemu-kvm                 [.] address_space_map
>   3.41%  libc-2.19.so             [.] _int_malloc
>   3.29%  libc-2.19.so             [.] _int_free
>   3.07%  libc-2.19.so             [.] malloc
>   2.95%  libpthread-2.19.so       [.] pthread_mutex_lock
>   2.94%  qemu-kvm                 [.] phys_page_find
>   2.73%  qemu-kvm                 [.] address_space_translate_internal
>   2.65%  libc-2.19.so             [.] malloc_consolidate
>   2.35%  libc-2.19.so             [.] __memcpy_sse2_unaligned
>   1.72%  qemu-kvm                 [.] find_next_zero_bit
>   1.38%  qemu-kvm                 [.] address_space_rw
>   1.34%  qemu-kvm                 [.] object_unref
>   1.30%  qemu-kvm                 [.] object_ref
>   1.28%  qemu-kvm                 [.] virtqueue_pop
>   1.20%  libc-2.19.so             [.] memset
>   1.11%  qemu-kvm                 [.] virtio_notify
> 
> Thank you so much!
> 
> Regards,
> -Gonglei
> 
>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>> index 79a635f..2433866 100644
>> --- a/hw/virtio/virtio.c
>> +++ b/hw/virtio/virtio.c
>> @@ -107,35 +107,15 @@ void virtio_queue_update_rings(VirtIODevice *vdev,
>> int n)
>>                                vring->align);
>>  }
>>
>> -static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa,
>> -                                       int i)
>> +static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc,
>> +                            hwaddr desc_pa, int i)
>>  {
>> -    hwaddr pa;
>> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
>> -    return virtio_ldq_phys(vdev, pa);
>> -}
>> -
>> -static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa, int
>> i)
>> -{
>> -    hwaddr pa;
>> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
>> -    return virtio_ldl_phys(vdev, pa);
>> -}
>> -
>> -static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa,
>> -                                        int i)
>> -{
>> -    hwaddr pa;
>> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
>> -    return virtio_lduw_phys(vdev, pa);
>> -}
>> -
>> -static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa,
>> -                                       int i)
>> -{
>> -    hwaddr pa;
>> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
>> -    return virtio_lduw_phys(vdev, pa);
>> +    address_space_read(&address_space_memory, desc_pa + i *
>> sizeof(VRingDesc),
>> +                       MEMTXATTRS_UNSPECIFIED, (void *)desc,
>> sizeof(VRingDesc));
>> +    virtio_tswap64s(vdev, &desc->addr);
>> +    virtio_tswap32s(vdev, &desc->len);
>> +    virtio_tswap16s(vdev, &desc->flags);
>> +    virtio_tswap16s(vdev, &desc->next);
>>  }
>>
>>  static inline uint16_t vring_avail_flags(VirtQueue *vq)
>> @@ -345,18 +325,18 @@ static unsigned int virtqueue_get_head(VirtQueue
>> *vq, unsigned int idx)
>>      return head;
>>  }
>>
>> -static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
>> -                                    unsigned int i, unsigned int max)
>> +static unsigned virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc
>> *desc,
>> +                                         hwaddr desc_pa, unsigned
>> int max)
>>  {
>>      unsigned int next;
>>
>>      /* If this descriptor says it doesn't chain, we're done. */
>> -    if (!(vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_NEXT)) {
>> +    if (!(desc->flags & VRING_DESC_F_NEXT)) {
>>          return max;
>>      }
>>
>>      /* Check they're not leading us off end of descriptors. */
>> -    next = vring_desc_next(vdev, desc_pa, i);
>> +    next = desc->next;
>>      /* Make sure compiler knows to grab that: we don't want it changing! */
>>      smp_wmb();
>>
>> @@ -365,6 +345,7 @@ static unsigned virtqueue_next_desc(VirtIODevice
>> *vdev, hwaddr desc_pa,
>>          exit(1);
>>      }
>>
>> +    vring_desc_read(vdev, desc, desc_pa, next);
>>      return next;
>>  }
>>
>> @@ -381,6 +362,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
>> unsigned int *in_bytes,
>>      while (virtqueue_num_heads(vq, idx)) {
>>          VirtIODevice *vdev = vq->vdev;
>>          unsigned int max, num_bufs, indirect = 0;
>> +        VRingDesc desc;
>>          hwaddr desc_pa;
>>          int i;
>>
>> @@ -388,9 +370,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
>> unsigned int *in_bytes,
>>          num_bufs = total_bufs;
>>          i = virtqueue_get_head(vq, idx++);
>>          desc_pa = vq->vring.desc;
>> +        vring_desc_read(vdev, &desc, desc_pa, i);
>>
>> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
>> -            if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
>> +        if (desc.flags & VRING_DESC_F_INDIRECT) {
>> +            if (desc.len % sizeof(VRingDesc)) {
>>                  error_report("Invalid size for indirect buffer table");
>>                  exit(1);
>>              }
>> @@ -403,9 +386,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
>> unsigned int *in_bytes,
>>
>>              /* loop over the indirect descriptor table */
>>              indirect = 1;
>> -            max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
>> -            desc_pa = vring_desc_addr(vdev, desc_pa, i);
>> +            max = desc.len / sizeof(VRingDesc);
>> +            desc_pa = desc.addr;
>>              num_bufs = i = 0;
>> +            vring_desc_read(vdev, &desc, desc_pa, i);
>>          }
>>
>>          do {
>> @@ -415,15 +399,15 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
>> unsigned int *in_bytes,
>>                  exit(1);
>>              }
>>
>> -            if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE)
>> {
>> -                in_total += vring_desc_len(vdev, desc_pa, i);
>> +            if (desc.flags & VRING_DESC_F_WRITE) {
>> +                in_total += desc.len;
>>              } else {
>> -                out_total += vring_desc_len(vdev, desc_pa, i);
>> +                out_total += desc.len;
>>              }
>>              if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
>>                  goto done;
>>              }
>> -        } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
>> +        } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa,
>> max)) != max);
>>
>>          if (!indirect)
>>              total_bufs = num_bufs;
>> @@ -545,6 +529,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>>      unsigned out_num, in_num;
>>      hwaddr addr[VIRTQUEUE_MAX_SIZE];
>>      struct iovec iov[VIRTQUEUE_MAX_SIZE];
>> +    VRingDesc desc;
>>
>>      if (!virtqueue_num_heads(vq, vq->last_avail_idx)) {
>>          return NULL;
>> @@ -560,33 +545,32 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>>          vring_set_avail_event(vq, vq->last_avail_idx);
>>      }
>>
>> -    if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
>> -        if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
>> +    vring_desc_read(vdev, &desc, desc_pa, i);
>> +    if (desc.flags & VRING_DESC_F_INDIRECT) {
>> +        if (desc.len % sizeof(VRingDesc)) {
>>              error_report("Invalid size for indirect buffer table");
>>              exit(1);
>>          }
>>
>>          /* loop over the indirect descriptor table */
>> -        max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
>> -        desc_pa = vring_desc_addr(vdev, desc_pa, i);
>> +        max = desc.len / sizeof(VRingDesc);
>> +        desc_pa = desc.addr;
>>          i = 0;
>> +        vring_desc_read(vdev, &desc, desc_pa, i);
>>      }
>>
>>      /* Collect all the descriptors */
>>      do {
>> -        hwaddr pa = vring_desc_addr(vdev, desc_pa, i);
>> -        size_t len = vring_desc_len(vdev, desc_pa, i);
>> -
>> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
>> +        if (desc.flags & VRING_DESC_F_WRITE) {
>>              virtqueue_map_desc(&in_num, addr + out_num, iov +
>> out_num,
>> -                               VIRTQUEUE_MAX_SIZE - out_num, true,
>> pa, len);
>> +                               VIRTQUEUE_MAX_SIZE - out_num, true,
>> desc.addr, desc.len);
>>          } else {
>>              if (in_num) {
>>                  error_report("Incorrect order for descriptors");
>>                  exit(1);
>>              }
>>              virtqueue_map_desc(&out_num, addr, iov,
>> -                               VIRTQUEUE_MAX_SIZE, false, pa, len);
>> +                               VIRTQUEUE_MAX_SIZE, false,
>> desc.addr, desc.len);
>>          }
>>
>>          /* If we've got too many, that implies a descriptor loop. */
>> @@ -594,7 +578,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>>              error_report("Looped descriptor");
>>              exit(1);
>>          }
>> -    } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
>> +    } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) !=
>> max);
>>
>>      /* Now copy what we have collected and mapped */
>>      elem = virtqueue_alloc_element(sz, out_num, in_num);
>> --
>> 2.5.0
>>
>>
> 
> 
>
Gonglei (Arei) Feb. 4, 2016, 7:48 a.m. UTC | #3
Hi Paolo,

> From: Paolo Bonzini [mailto:paolo.bonzini@gmail.com] On Behalf Of Paolo
> Bonzini
>
> On 03/02/2016 13:34, Gonglei (Arei) wrote:
> > Hi,
> >
> >> Subject: [Qemu-devel] [PATCH 07/10] virtio: combine the read of a descriptor
> >>
> >> Compared to vring, virtio has a performance penalty of 10%.  Fix it
> >> by combining all the reads for a descriptor in a single address_space_read
> >> call.  This also simplifies the code nicely.
> >>
> >> Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
> >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> >> ---
> >>  hw/virtio/virtio.c | 86 ++++++++++++++++++++++--------------------------------
> >>  1 file changed, 35 insertions(+), 51 deletions(-)
> >>
> >
> > Unbelievable! After applying this patch, the virtio-crypto speed can reach
> 74 MB/sec; host
> > CPU overhead is 180% (the main thread 100% and vcpu threads 80%)
> 
> The three patches from Vincenzo will help too.  What was it like before?
> 
That's true, and I replied in the cover letter.

> Also, are you using ioeventfd or dataplane?  virtio-crypto sounds like
> something that could be very easily run outside the "big QEMU lock".
> 

Yes, all testing results are based on having ioeventfd enabled.  I have
also implemented the virtio-crypto-dataplane scheme, and got the results below:

Testing AES-128-CBC cipher: 
        Encrypting in chunks of 256 bytes: done. 370.81 MiB in 5.02 secs: 73.92 MiB/sec (1518832 packets)
        Encrypting in chunks of 256 bytes: done. 374.09 MiB in 5.02 secs: 74.51 MiB/sec (1532272 packets)
        Encrypting in chunks of 256 bytes: done. 372.05 MiB in 5.02 secs: 74.11 MiB/sec (1523920 packets)
        Encrypting in chunks of 256 bytes: done. 365.77 MiB in 5.02 secs: 72.86 MiB/sec (1498200 packets)
        Encrypting in chunks of 256 bytes: done. 379.63 MiB in 5.02 secs: 75.62 MiB/sec (1554976 packets)
        Encrypting in chunks of 256 bytes: done. 374.61 MiB in 5.02 secs: 74.62 MiB/sec (1534384 packets)
        Encrypting in chunks of 256 bytes: done. 372.13 MiB in 5.02 secs: 74.12 MiB/sec (1524226 packets)
        Encrypting in chunks of 256 bytes: done. 374.18 MiB in 5.02 secs: 74.53 MiB/sec (1532656 packets)


11.44%  qemu-kvm                 [.] memory_region_find
  6.31%  qemu-kvm                 [.] qemu_get_ram_ptr
  4.61%  libpthread-2.19.so       [.] __pthread_mutex_unlock_usercnt
  3.54%  qemu-kvm                 [.] qemu_ram_addr_from_host
  2.80%  libpthread-2.19.so       [.] pthread_mutex_lock
  2.55%  qemu-kvm                 [.] object_unref
  2.49%  libc-2.19.so             [.] malloc
  2.47%  libc-2.19.so             [.] _int_malloc
  2.34%  libc-2.19.so             [.] _int_free
  2.18%  qemu-kvm                 [.] object_ref
  2.18%  qemu-kvm                 [.] address_space_translate
  2.03%  libc-2.19.so             [.] __memcpy_sse2_unaligned
  1.76%  libc-2.19.so             [.] malloc_consolidate
  1.56%  qemu-kvm                 [.] addrrange_intersection
  1.52%  qemu-kvm                 [.] vring_pop
  1.36%  qemu-kvm                 [.] find_next_zero_bit
  1.30%  [kernel]                 [k] native_write_msr_safe
  1.29%  qemu-kvm                 [.] addrrange_intersects
  1.21%  qemu-kvm                 [.] vring_map
  0.93%  qemu-kvm                 [.] virtio_notify

Do you have any thoughts on how to decrease the CPU overhead and get higher throughput? Thanks!

Regards,
-Gonglei

> Paolo
> 
> > Testing AES-128-CBC cipher:
> >         Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs:
> 74.12 MiB/sec (1523475 packets)
> >         Encrypting in chunks of 256 bytes: done. 369.85 MiB in 5.01 secs:
> 73.88 MiB/sec (1514900 packets)
> >         Encrypting in chunks of 256 bytes: done. 371.07 MiB in 5.02 secs:
> 73.97 MiB/sec (1519914 packets)
> >         Encrypting in chunks of 256 bytes: done. 371.66 MiB in 5.02 secs:
> 74.09 MiB/sec (1522309 packets)
> >         Encrypting in chunks of 256 bytes: done. 371.79 MiB in 5.02 secs:
> 74.12 MiB/sec (1522868 packets)
> >         Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs:
> 74.15 MiB/sec (1523457 packets)
> >         Encrypting in chunks of 256 bytes: done. 371.90 MiB in 5.02 secs:
> 74.14 MiB/sec (1523317 packets)
> >         Encrypting in chunks of 256 bytes: done. 371.71 MiB in 5.02 secs:
> 74.10 MiB/sec (1522522 packets)
> >
> > 15.95%  qemu-kvm                 [.] address_space_translate
> >   6.98%  qemu-kvm                 [.] qemu_get_ram_ptr
> >   4.87%  libpthread-2.19.so       [.] __pthread_mutex_unlock_usercnt
> >   4.40%  qemu-kvm                 [.] qemu_ram_addr_from_host
> >   3.79%  qemu-kvm                 [.] address_space_map
> >   3.41%  libc-2.19.so             [.] _int_malloc
> >   3.29%  libc-2.19.so             [.] _int_free
> >   3.07%  libc-2.19.so             [.] malloc
> >   2.95%  libpthread-2.19.so       [.] pthread_mutex_lock
> >   2.94%  qemu-kvm                 [.] phys_page_find
> >   2.73%  qemu-kvm                 [.]
> address_space_translate_internal
> >   2.65%  libc-2.19.so             [.] malloc_consolidate
> >   2.35%  libc-2.19.so             [.] __memcpy_sse2_unaligned
> >   1.72%  qemu-kvm                 [.] find_next_zero_bit
> >   1.38%  qemu-kvm                 [.] address_space_rw
> >   1.34%  qemu-kvm                 [.] object_unref
> >   1.30%  qemu-kvm                 [.] object_ref
> >   1.28%  qemu-kvm                 [.] virtqueue_pop
> >   1.20%  libc-2.19.so             [.] memset
> >   1.11%  qemu-kvm                 [.] virtio_notify
> >
> > Thank you so much!
> >
> > Regards,
> > -Gonglei
> >
> >> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> >> index 79a635f..2433866 100644
> >> --- a/hw/virtio/virtio.c
> >> +++ b/hw/virtio/virtio.c
> >> @@ -107,35 +107,15 @@ void virtio_queue_update_rings(VirtIODevice
> *vdev,
> >> int n)
> >>                                vring->align);
> >>  }
> >>
> >> -static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa,
> >> -                                       int i)
> >> +static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc,
> >> +                            hwaddr desc_pa, int i)
> >>  {
> >> -    hwaddr pa;
> >> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
> >> -    return virtio_ldq_phys(vdev, pa);
> >> -}
> >> -
> >> -static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa,
> int
> >> i)
> >> -{
> >> -    hwaddr pa;
> >> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
> >> -    return virtio_ldl_phys(vdev, pa);
> >> -}
> >> -
> >> -static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr
> desc_pa,
> >> -                                        int i)
> >> -{
> >> -    hwaddr pa;
> >> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
> >> -    return virtio_lduw_phys(vdev, pa);
> >> -}
> >> -
> >> -static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa,
> >> -                                       int i)
> >> -{
> >> -    hwaddr pa;
> >> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
> >> -    return virtio_lduw_phys(vdev, pa);
> >> +    address_space_read(&address_space_memory, desc_pa + i *
> >> sizeof(VRingDesc),
> >> +                       MEMTXATTRS_UNSPECIFIED, (void *)desc,
> >> sizeof(VRingDesc));
> >> +    virtio_tswap64s(vdev, &desc->addr);
> >> +    virtio_tswap32s(vdev, &desc->len);
> >> +    virtio_tswap16s(vdev, &desc->flags);
> >> +    virtio_tswap16s(vdev, &desc->next);
> >>  }
> >>
> >>  static inline uint16_t vring_avail_flags(VirtQueue *vq)
> >> @@ -345,18 +325,18 @@ static unsigned int
> virtqueue_get_head(VirtQueue
> >> *vq, unsigned int idx)
> >>      return head;
> >>  }
> >>
> >> -static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
> >> -                                    unsigned int i, unsigned int
> max)
> >> +static unsigned virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc
> >> *desc,
> >> +                                         hwaddr desc_pa,
> unsigned
> >> int max)
> >>  {
> >>      unsigned int next;
> >>
> >>      /* If this descriptor says it doesn't chain, we're done. */
> >> -    if (!(vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_NEXT)) {
> >> +    if (!(desc->flags & VRING_DESC_F_NEXT)) {
> >>          return max;
> >>      }
> >>
> >>      /* Check they're not leading us off end of descriptors. */
> >> -    next = vring_desc_next(vdev, desc_pa, i);
> >> +    next = desc->next;
> >>      /* Make sure compiler knows to grab that: we don't want it changing!
> */
> >>      smp_wmb();
> >>
> >> @@ -365,6 +345,7 @@ static unsigned virtqueue_next_desc(VirtIODevice
> >> *vdev, hwaddr desc_pa,
> >>          exit(1);
> >>      }
> >>
> >> +    vring_desc_read(vdev, desc, desc_pa, next);
> >>      return next;
> >>  }
> >>
> >> @@ -381,6 +362,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
> >> unsigned int *in_bytes,
> >>      while (virtqueue_num_heads(vq, idx)) {
> >>          VirtIODevice *vdev = vq->vdev;
> >>          unsigned int max, num_bufs, indirect = 0;
> >> +        VRingDesc desc;
> >>          hwaddr desc_pa;
> >>          int i;
> >>
> >> @@ -388,9 +370,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
> >> unsigned int *in_bytes,
> >>          num_bufs = total_bufs;
> >>          i = virtqueue_get_head(vq, idx++);
> >>          desc_pa = vq->vring.desc;
> >> +        vring_desc_read(vdev, &desc, desc_pa, i);
> >>
> >> -        if (vring_desc_flags(vdev, desc_pa, i) &
> VRING_DESC_F_INDIRECT) {
> >> -            if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
> >> +        if (desc.flags & VRING_DESC_F_INDIRECT) {
> >> +            if (desc.len % sizeof(VRingDesc)) {
> >>                  error_report("Invalid size for indirect buffer table");
> >>                  exit(1);
> >>              }
> >> @@ -403,9 +386,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
> >> unsigned int *in_bytes,
> >>
> >>              /* loop over the indirect descriptor table */
> >>              indirect = 1;
> >> -            max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
> >> -            desc_pa = vring_desc_addr(vdev, desc_pa, i);
> >> +            max = desc.len / sizeof(VRingDesc);
> >> +            desc_pa = desc.addr;
> >>              num_bufs = i = 0;
> >> +            vring_desc_read(vdev, &desc, desc_pa, i);
> >>          }
> >>
> >>          do {
> >> @@ -415,15 +399,15 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
> >> unsigned int *in_bytes,
> >>                  exit(1);
> >>              }
> >>
> >> -            if (vring_desc_flags(vdev, desc_pa, i) &
> VRING_DESC_F_WRITE)
> >> {
> >> -                in_total += vring_desc_len(vdev, desc_pa, i);
> >> +            if (desc.flags & VRING_DESC_F_WRITE) {
> >> +                in_total += desc.len;
> >>              } else {
> >> -                out_total += vring_desc_len(vdev, desc_pa, i);
> >> +                out_total += desc.len;
> >>              }
> >>              if (in_total >= max_in_bytes && out_total >=
> max_out_bytes) {
> >>                  goto done;
> >>              }
> >> -        } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
> >> +        } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa,
> >> max)) != max);
> >>
> >>          if (!indirect)
> >>              total_bufs = num_bufs;
> >> @@ -545,6 +529,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
> >>      unsigned out_num, in_num;
> >>      hwaddr addr[VIRTQUEUE_MAX_SIZE];
> >>      struct iovec iov[VIRTQUEUE_MAX_SIZE];
> >> +    VRingDesc desc;
> >>
> >>      if (!virtqueue_num_heads(vq, vq->last_avail_idx)) {
> >>          return NULL;
> >> @@ -560,33 +545,32 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
> >>          vring_set_avail_event(vq, vq->last_avail_idx);
> >>      }
> >>
> >> -    if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
> >> -        if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
> >> +    vring_desc_read(vdev, &desc, desc_pa, i);
> >> +    if (desc.flags & VRING_DESC_F_INDIRECT) {
> >> +        if (desc.len % sizeof(VRingDesc)) {
> >>              error_report("Invalid size for indirect buffer table");
> >>              exit(1);
> >>          }
> >>
> >>          /* loop over the indirect descriptor table */
> >> -        max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
> >> -        desc_pa = vring_desc_addr(vdev, desc_pa, i);
> >> +        max = desc.len / sizeof(VRingDesc);
> >> +        desc_pa = desc.addr;
> >>          i = 0;
> >> +        vring_desc_read(vdev, &desc, desc_pa, i);
> >>      }
> >>
> >>      /* Collect all the descriptors */
> >>      do {
> >> -        hwaddr pa = vring_desc_addr(vdev, desc_pa, i);
> >> -        size_t len = vring_desc_len(vdev, desc_pa, i);
> >> -
> >> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
> >> +        if (desc.flags & VRING_DESC_F_WRITE) {
> >>              virtqueue_map_desc(&in_num, addr + out_num, iov +
> >> out_num,
> >> -                               VIRTQUEUE_MAX_SIZE - out_num,
> true,
> >> pa, len);
> >> +                               VIRTQUEUE_MAX_SIZE - out_num,
> true,
> >> desc.addr, desc.len);
> >>          } else {
> >>              if (in_num) {
> >>                  error_report("Incorrect order for descriptors");
> >>                  exit(1);
> >>              }
> >>              virtqueue_map_desc(&out_num, addr, iov,
> >> -                               VIRTQUEUE_MAX_SIZE, false, pa,
> len);
> >> +                               VIRTQUEUE_MAX_SIZE, false,
> >> desc.addr, desc.len);
> >>          }
> >>
> >>          /* If we've got too many, that implies a descriptor loop. */
> >> @@ -594,7 +578,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
> >>              error_report("Looped descriptor");
> >>              exit(1);
> >>          }
> >> -    } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
> >> +    } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) !=
> >> max);
> >>
> >>      /* Now copy what we have collected and mapped */
> >>      elem = virtqueue_alloc_element(sz, out_num, in_num);
> >> --
> >> 2.5.0
> >>
> >>
> >
> >
> >
Paolo Bonzini Feb. 4, 2016, 10:18 a.m. UTC | #4
On 04/02/2016 08:48, Gonglei (Arei) wrote:
> 11.44%  qemu-kvm                 [.] memory_region_find
>   6.31%  qemu-kvm                 [.] qemu_get_ram_ptr
>   4.61%  libpthread-2.19.so       [.] __pthread_mutex_unlock_usercnt
>   3.54%  qemu-kvm                 [.] qemu_ram_addr_from_host
>   2.80%  libpthread-2.19.so       [.] pthread_mutex_lock
>   2.55%  qemu-kvm                 [.] object_unref
>   2.49%  libc-2.19.so             [.] malloc
>   2.47%  libc-2.19.so             [.] _int_malloc
>   2.34%  libc-2.19.so             [.] _int_free
>   2.18%  qemu-kvm                 [.] object_ref
>   2.18%  qemu-kvm                 [.] address_space_translate
>   2.03%  libc-2.19.so             [.] __memcpy_sse2_unaligned
>   1.76%  libc-2.19.so             [.] malloc_consolidate
>   1.56%  qemu-kvm                 [.] addrrange_intersection
>   1.52%  qemu-kvm                 [.] vring_pop
>   1.36%  qemu-kvm                 [.] find_next_zero_bit
>   1.30%  [kernel]                 [k] native_write_msr_safe
>   1.29%  qemu-kvm                 [.] addrrange_intersects
>   1.21%  qemu-kvm                 [.] vring_map
>   0.93%  qemu-kvm                 [.] virtio_notify
> 
> > Do you have any thoughts to decrease the CPU overhead and get higher throughput? Thanks!

Using bigger chunks than 256 bytes will reduce the overhead in
memory_region_find and qemu_get_ram_ptr.  You could expect
 a further 10-12% improvement.

Paolo
Gonglei (Arei) Feb. 5, 2016, 6:16 a.m. UTC | #5
Regards,
-Gonglei


> -----Original Message-----
> From: Paolo Bonzini [mailto:pbonzini@redhat.com]
> Sent: Thursday, February 04, 2016 6:18 PM
> To: Gonglei (Arei); qemu-devel@nongnu.org
> Cc: cornelia.huck@de.ibm.com; mst@redhat.com
> Subject: Re: [PATCH 07/10] virtio: combine the read of a descriptor
> 
> 
> 
> On 04/02/2016 08:48, Gonglei (Arei) wrote:
> > 11.44%  qemu-kvm                 [.] memory_region_find
> >   6.31%  qemu-kvm                 [.] qemu_get_ram_ptr
> >   4.61%  libpthread-2.19.so       [.] __pthread_mutex_unlock_usercnt
> >   3.54%  qemu-kvm                 [.] qemu_ram_addr_from_host
> >   2.80%  libpthread-2.19.so       [.] pthread_mutex_lock
> >   2.55%  qemu-kvm                 [.] object_unref
> >   2.49%  libc-2.19.so             [.] malloc
> >   2.47%  libc-2.19.so             [.] _int_malloc
> >   2.34%  libc-2.19.so             [.] _int_free
> >   2.18%  qemu-kvm                 [.] object_ref
> >   2.18%  qemu-kvm                 [.] address_space_translate
> >   2.03%  libc-2.19.so             [.] __memcpy_sse2_unaligned
> >   1.76%  libc-2.19.so             [.] malloc_consolidate
> >   1.56%  qemu-kvm                 [.] addrrange_intersection
> >   1.52%  qemu-kvm                 [.] vring_pop
> >   1.36%  qemu-kvm                 [.] find_next_zero_bit
> >   1.30%  [kernel]                 [k] native_write_msr_safe
> >   1.29%  qemu-kvm                 [.] addrrange_intersects
> >   1.21%  qemu-kvm                 [.] vring_map
> >   0.93%  qemu-kvm                 [.] virtio_notify
> >
> > Do you have any thoughts to decrease the CPU overhead and get higher
> throughput? Thanks!
> 
> Using bigger chunks than 256 bytes will reduce the overhead in
> memory_region_find and qemu_get_ram_ptr.  You could expect
>  a further 10-12% improvement.
> 
Yes, you're right. This is the testing result:

        Encrypting in chunks of 256 bytes: done. 386.89 MiB in 5.02 secs: 77.13 MiB/sec (1584701 packets)
        Encrypting in chunks of 512 bytes: done. 756.80 MiB in 5.02 secs: 150.86 MiB/sec (1549918 packets)
        Encrypting in chunks of 1024 bytes: done. 1.30 GiB in 5.02 secs: 0.26 GiB/sec (1358614 packets)
        Encrypting in chunks of 2048 bytes: done. 2.42 GiB in 5.02 secs: 0.48 GiB/sec (1270223 packets)
        Encrypting in chunks of 4096 bytes: done. 3.99 GiB in 5.02 secs: 0.79 GiB/sec (1046680 packets)
        Encrypting in chunks of 8192 bytes: done. 6.12 GiB in 5.02 secs: 1.22 GiB/sec (802379 packets)
        Encrypting in chunks of 16384 bytes: done. 8.48 GiB in 5.04 secs: 1.68 GiB/sec (556046 packets)
        Encrypting in chunks of 32768 bytes: done. 10.42 GiB in 5.07 secs: 2.06 GiB/sec (341524 packets)

But 256-byte packets are the main chunk size in CT scenarios.

Regards,
-Gonglei
diff mbox

Patch

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 79a635f..2433866 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -107,35 +107,15 @@  void virtio_queue_update_rings(VirtIODevice *vdev, int n)
                               vring->align);
 }
 
-static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa,
-                                       int i)
+static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc,
+                            hwaddr desc_pa, int i)
 {
-    hwaddr pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
-    return virtio_ldq_phys(vdev, pa);
-}
-
-static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa, int i)
-{
-    hwaddr pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
-    return virtio_ldl_phys(vdev, pa);
-}
-
-static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa,
-                                        int i)
-{
-    hwaddr pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
-    return virtio_lduw_phys(vdev, pa);
-}
-
-static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa,
-                                       int i)
-{
-    hwaddr pa;
-    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
-    return virtio_lduw_phys(vdev, pa);
+    address_space_read(&address_space_memory, desc_pa + i * sizeof(VRingDesc),
+                       MEMTXATTRS_UNSPECIFIED, (void *)desc, sizeof(VRingDesc));
+    virtio_tswap64s(vdev, &desc->addr);
+    virtio_tswap32s(vdev, &desc->len);
+    virtio_tswap16s(vdev, &desc->flags);
+    virtio_tswap16s(vdev, &desc->next);
 }
 
 static inline uint16_t vring_avail_flags(VirtQueue *vq)
@@ -345,18 +325,18 @@  static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx)
     return head;
 }
 
-static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
-                                    unsigned int i, unsigned int max)
+static unsigned virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc,
+                                         hwaddr desc_pa, unsigned int max)
 {
     unsigned int next;
 
     /* If this descriptor says it doesn't chain, we're done. */
-    if (!(vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_NEXT)) {
+    if (!(desc->flags & VRING_DESC_F_NEXT)) {
         return max;
     }
 
     /* Check they're not leading us off end of descriptors. */
-    next = vring_desc_next(vdev, desc_pa, i);
+    next = desc->next;
     /* Make sure compiler knows to grab that: we don't want it changing! */
     smp_wmb();
 
@@ -365,6 +345,7 @@  static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
         exit(1);
     }
 
+    vring_desc_read(vdev, desc, desc_pa, next);
     return next;
 }
 
@@ -381,6 +362,7 @@  void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
     while (virtqueue_num_heads(vq, idx)) {
         VirtIODevice *vdev = vq->vdev;
         unsigned int max, num_bufs, indirect = 0;
+        VRingDesc desc;
         hwaddr desc_pa;
         int i;
 
@@ -388,9 +370,10 @@  void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
         num_bufs = total_bufs;
         i = virtqueue_get_head(vq, idx++);
         desc_pa = vq->vring.desc;
+        vring_desc_read(vdev, &desc, desc_pa, i);
 
-        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
-            if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
+        if (desc.flags & VRING_DESC_F_INDIRECT) {
+            if (desc.len % sizeof(VRingDesc)) {
                 error_report("Invalid size for indirect buffer table");
                 exit(1);
             }
@@ -403,9 +386,10 @@  void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
 
             /* loop over the indirect descriptor table */
             indirect = 1;
-            max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
-            desc_pa = vring_desc_addr(vdev, desc_pa, i);
+            max = desc.len / sizeof(VRingDesc);
+            desc_pa = desc.addr;
             num_bufs = i = 0;
+            vring_desc_read(vdev, &desc, desc_pa, i);
         }
 
         do {
@@ -415,15 +399,15 @@  void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
                 exit(1);
             }
 
-            if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
-                in_total += vring_desc_len(vdev, desc_pa, i);
+            if (desc.flags & VRING_DESC_F_WRITE) {
+                in_total += desc.len;
             } else {
-                out_total += vring_desc_len(vdev, desc_pa, i);
+                out_total += desc.len;
             }
             if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
                 goto done;
             }
-        } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
+        } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max);
 
         if (!indirect)
             total_bufs = num_bufs;
@@ -545,6 +529,7 @@  void *virtqueue_pop(VirtQueue *vq, size_t sz)
     unsigned out_num, in_num;
     hwaddr addr[VIRTQUEUE_MAX_SIZE];
     struct iovec iov[VIRTQUEUE_MAX_SIZE];
+    VRingDesc desc;
 
     if (!virtqueue_num_heads(vq, vq->last_avail_idx)) {
         return NULL;
@@ -560,33 +545,32 @@  void *virtqueue_pop(VirtQueue *vq, size_t sz)
         vring_set_avail_event(vq, vq->last_avail_idx);
     }
 
-    if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
-        if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
+    vring_desc_read(vdev, &desc, desc_pa, i);
+    if (desc.flags & VRING_DESC_F_INDIRECT) {
+        if (desc.len % sizeof(VRingDesc)) {
             error_report("Invalid size for indirect buffer table");
             exit(1);
         }
 
         /* loop over the indirect descriptor table */
-        max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
-        desc_pa = vring_desc_addr(vdev, desc_pa, i);
+        max = desc.len / sizeof(VRingDesc);
+        desc_pa = desc.addr;
         i = 0;
+        vring_desc_read(vdev, &desc, desc_pa, i);
     }
 
     /* Collect all the descriptors */
     do {
-        hwaddr pa = vring_desc_addr(vdev, desc_pa, i);
-        size_t len = vring_desc_len(vdev, desc_pa, i);
-
-        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
+        if (desc.flags & VRING_DESC_F_WRITE) {
             virtqueue_map_desc(&in_num, addr + out_num, iov + out_num,
-                               VIRTQUEUE_MAX_SIZE - out_num, true, pa, len);
+                               VIRTQUEUE_MAX_SIZE - out_num, true, desc.addr, desc.len);
         } else {
             if (in_num) {
                 error_report("Incorrect order for descriptors");
                 exit(1);
             }
             virtqueue_map_desc(&out_num, addr, iov,
-                               VIRTQUEUE_MAX_SIZE, false, pa, len);
+                               VIRTQUEUE_MAX_SIZE, false, desc.addr, desc.len);
         }
 
         /* If we've got too many, that implies a descriptor loop. */
@@ -594,7 +578,7 @@  void *virtqueue_pop(VirtQueue *vq, size_t sz)
             error_report("Looped descriptor");
             exit(1);
         }
-    } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
+    } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max);
 
     /* Now copy what we have collected and mapped */
     elem = virtqueue_alloc_element(sz, out_num, in_num);