diff mbox

[1/7] virtio: allow byte swapping for vring and config access

Message ID 87ob97nz7x.fsf@rustcorp.com.au
State New
Headers show

Commit Message

Rusty Russell Aug. 9, 2013, 6:40 a.m. UTC
Anthony Liguori <anthony@codemonkey.ws> writes:
> I suspect this is a premature optimization.  With a weak function called
> directly in the accessors below, I suspect you would see no measurable
> performance overhead compared to this approach.
>
> It's all very predictable so the CPU should do a decent job optimizing
> the if () away.

Perhaps.  I was leery of introducing performance regressions, but the
actual I/O tends to dominate anyway.

So I tested this, by adding the patch (below) and benchmarking
qemu-system-i386 on my laptop before and after.

Setup: Intel(R) Core(TM) i5 CPU       M 560  @ 2.67GHz
(Performance cpu governer enabled)
Guest: virtio user net, virtio block on raw file, 1 CPU, 512MB RAM.
(Qemu run under eatmydata to eliminate syncs)

First test: ping -f -c 10000 -q 10.0.2.0 (100 times)
(Ping chosen since packets stay in qemu's user net code)

BEFORE:
        MIN: 824ms
        MAX: 914ms
        AVG: 876.95ms
        STDDEV: 16ms

AFTER:
        MIN: 872ms
        MAX: 933ms
        AVG: 904.35ms
        STDDEV: 15ms

Second test: dd if=/dev/vda iflag=direct count=10000 of=/dev/null (100 times)

BEFORE:
        MIN: 0.927994sec
        MAX: 1.051640sec
        AVG: 0.99733sec
        STDDEV: 0.028sec

AFTER:
        MIN: 0.941706sec
        MAX: 1.034810sec
        AVG: 0.988692sec
        STDDEV: 0.021sec

So, we can notice performance on ping, but anything which does actual IO
is a wash.

Cheers,
Rusty.

Comments

Anthony Liguori Aug. 9, 2013, 2:10 p.m. UTC | #1
Rusty Russell <rusty@rustcorp.com.au> writes:

> Anthony Liguori <anthony@codemonkey.ws> writes:
>> I suspect this is a premature optimization.  With a weak function called
>> directly in the accessors below, I suspect you would see no measurable
>> performance overhead compared to this approach.
>>
>> It's all very predictable so the CPU should do a decent job optimizing
>> the if () away.
>
> Perhaps.  I was leery of introducing performance regressions, but the
> actual I/O tends to dominate anyway.
>
> So I tested this, by adding the patch (below) and benchmarking
> qemu-system-i386 on my laptop before and after.
>
> Setup: Intel(R) Core(TM) i5 CPU       M 560  @ 2.67GHz
> (Performance cpu governer enabled)
> Guest: virtio user net, virtio block on raw file, 1 CPU, 512MB RAM.
> (Qemu run under eatmydata to eliminate syncs)

FYI, cache=unsafe is equivalent to using eatmydata.

> First test: ping -f -c 10000 -q 10.0.2.0 (100 times)
> (Ping chosen since packets stay in qemu's user net code)
>
> BEFORE:
>         MIN: 824ms
>         MAX: 914ms
>         AVG: 876.95ms
>         STDDEV: 16ms
>
> AFTER:
>         MIN: 872ms
>         MAX: 933ms
>         AVG: 904.35ms
>         STDDEV: 15ms

I can reproduce this although I also see a larger standard deviation.

BEFORE:
	MIN: 496
	MAX: 1055
        AVG: 873.22
        STDEV: 136.88

AFTER:
        MIN: 494
        MAX: 1456
        AVG: 947.77
        STDEV: 150.89

In my datasets, the stdev is higher in the after case implying that
there is more variation.  Indeed, the MIN is pretty much the same.

GCC is inlining the functions, I'm still surprised that it's measurable
at all.

At any rate, I think the advantage of not increasing the amount of
target specific code outweighs the performance difference here.  As you
said, if there is real I/O, the differences isn't noticable.

Regards,

Anthony Liguori
Rusty Russell Aug. 11, 2013, 11:46 p.m. UTC | #2
Anthony Liguori <anthony@codemonkey.ws> writes:
> Rusty Russell <rusty@rustcorp.com.au> writes:
>> (Qemu run under eatmydata to eliminate syncs)
>
> FYI, cache=unsafe is equivalent to using eatmydata.

Ah, thanks!

> I can reproduce this although I also see a larger standard deviation.
>
> BEFORE:
> 	MIN: 496
> 	MAX: 1055
>         AVG: 873.22
>         STDEV: 136.88
>
> AFTER:
>         MIN: 494
>         MAX: 1456
>         AVG: 947.77
>         STDEV: 150.89

BTW, how did you generate these stats?  Consider this my plug for my
little stats filter:
        https://github.com/rustyrussell/stats

>
> In my datasets, the stdev is higher in the after case implying that
> there is more variation.  Indeed, the MIN is pretty much the same.
>
> GCC is inlining the functions, I'm still surprised that it's measurable
> at all.

GCC won't inline across compilation units without -flto though, so the
stub call won't be inlined, right?

Cheers,
Rusty.
diff mbox

Patch

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 2887f17..df8733b 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -85,20 +85,6 @@  struct VirtQueue
     EventNotifier host_notifier;
 };
 
-#ifdef TARGET_VIRTIO_SWAPENDIAN
-bool virtio_byteswap;
-
-/* Ask target code if we should swap endian for all vring and config access. */
-static void mark_endian(void)
-{
-    virtio_byteswap = virtio_swap_endian();
-}
-#else
-static void mark_endian(void)
-{
-}
-#endif
-
 /* virt queue functions */
 static void virtqueue_init(VirtQueue *vq)
 {
@@ -540,9 +526,6 @@  void virtio_set_status(VirtIODevice *vdev, uint8_t val)
     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
     trace_virtio_set_status(vdev, val);
 
-    /* If guest virtio endian is uncertain, set it now. */
-    mark_endian();
-
     if (k->set_status) {
         k->set_status(vdev, val);
     }
diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
index b1d531e..ea4166a 100644
--- a/include/hw/virtio/virtio-access.h
+++ b/include/hw/virtio/virtio-access.h
@@ -13,18 +13,9 @@ 
 #ifndef _QEMU_VIRTIO_ACCESS_H
 #define _QEMU_VIRTIO_ACCESS_H
 
-#ifdef TARGET_VIRTIO_SWAPENDIAN
-/* Architectures which need biendian define this function. */
-extern bool virtio_swap_endian(void);
-
-extern bool virtio_byteswap;
-#else
-#define virtio_byteswap false
-#endif
-
 static inline uint16_t virtio_lduw_phys(hwaddr pa)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
         return bswap16(lduw_phys(pa));
     }
     return lduw_phys(pa);
@@ -33,7 +24,7 @@  static inline uint16_t virtio_lduw_phys(hwaddr pa)
 
 static inline uint32_t virtio_ldl_phys(hwaddr pa)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
         return bswap32(ldl_phys(pa));
     }
     return ldl_phys(pa);
@@ -41,7 +32,7 @@  static inline uint32_t virtio_ldl_phys(hwaddr pa)
 
 static inline uint64_t virtio_ldq_phys(hwaddr pa)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
         return bswap64(ldq_phys(pa));
     }
     return ldq_phys(pa);
@@ -49,7 +40,7 @@  static inline uint64_t virtio_ldq_phys(hwaddr pa)
 
 static inline void virtio_stw_phys(hwaddr pa, uint16_t value)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
         stw_phys(pa, bswap16(value));
     } else {
         stw_phys(pa, value);
@@ -58,7 +49,7 @@  static inline void virtio_stw_phys(hwaddr pa, uint16_t value)
 
 static inline void virtio_stl_phys(hwaddr pa, uint32_t value)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
         stl_phys(pa, bswap32(value));
     } else {
         stl_phys(pa, value);
@@ -67,7 +58,7 @@  static inline void virtio_stl_phys(hwaddr pa, uint32_t value)
 
 static inline void virtio_stw_p(void *ptr, uint16_t v)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
 	stw_p(ptr, bswap16(v));
     } else {
 	stw_p(ptr, v);
@@ -76,7 +67,7 @@  static inline void virtio_stw_p(void *ptr, uint16_t v)
 
 static inline void virtio_stl_p(void *ptr, uint32_t v)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
 	stl_p(ptr, bswap32(v));
     } else {
 	stl_p(ptr, v);
@@ -85,7 +76,7 @@  static inline void virtio_stl_p(void *ptr, uint32_t v)
 
 static inline void virtio_stq_p(void *ptr, uint64_t v)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
 	stq_p(ptr, bswap64(v));
     } else {
 	stq_p(ptr, v);
@@ -94,7 +85,7 @@  static inline void virtio_stq_p(void *ptr, uint64_t v)
 
 static inline int virtio_lduw_p(const void *ptr)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
 	return bswap16(lduw_p(ptr));
     } else {
 	return lduw_p(ptr);
@@ -103,7 +94,7 @@  static inline int virtio_lduw_p(const void *ptr)
 
 static inline int virtio_ldl_p(const void *ptr)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
 	return bswap32(ldl_p(ptr));
     } else {
 	return ldl_p(ptr);
@@ -112,7 +103,7 @@  static inline int virtio_ldl_p(const void *ptr)
 
 static inline uint64_t virtio_ldq_p(const void *ptr)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
 	return bswap64(ldq_p(ptr));
     } else {
 	return ldq_p(ptr);
@@ -121,7 +112,7 @@  static inline uint64_t virtio_ldq_p(const void *ptr)
 
 static inline uint32_t virtio_tswap32(uint32_t s)
 {
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
 	return bswap32(tswap32(s));
     } else {
 	return tswap32(s);
@@ -131,7 +122,7 @@  static inline uint32_t virtio_tswap32(uint32_t s)
 static inline void virtio_tswap32s(uint32_t *s)
 {
     tswap32s(s);
-    if (virtio_byteswap) {
+    if (cpu_get_byteswap()) {
 	*s = bswap32(*s);
     }
 }
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index a5bb515..cc5068f 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -357,4 +357,11 @@  void cpu_reset_interrupt(CPUState *cpu, int mask);
  */
 void cpu_resume(CPUState *cpu);
 
+/**
+ * cpu_get_byteswap:
+ *
+ * Is (any) CPU running in byteswapped mode.  Normally false.
+ */
+bool cpu_get_byteswap(void);
+
 #endif
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index 9b701b4..d4af94a 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -25,3 +25,4 @@  stub-obj-y += vm-stop.o
 stub-obj-y += vmstate.o
 stub-obj-$(CONFIG_WIN32) += fd-register.o
 stub-obj-y += cpus.o
+stub-obj-y += cpu_byteswap.o
diff --git a/target-ppc/misc_helper.c b/target-ppc/misc_helper.c
index b031586..18d399d 100644
--- a/target-ppc/misc_helper.c
+++ b/target-ppc/misc_helper.c
@@ -118,9 +118,7 @@  void ppc_store_msr(CPUPPCState *env, target_ulong value)
     hreg_store_msr(env, value, 0);
 }
 
-/* Our virtio accesses are LE if the first CPU is LE when they touch
- * it.  We assume endian doesn't change after that! */ 
-bool virtio_swap_endian(void)
+bool cpu_get_byteswap(void)
 {
     return first_cpu->hflags & (1 << MSR_LE);
 }