Patchwork [1/7] virtio: allow byte swapping for vring and config access

login
register
mail settings
Submitter Rusty Russell
Date Aug. 8, 2013, 5:15 a.m.
Message ID <1375938949-22622-2-git-send-email-rusty@rustcorp.com.au>
Download mbox | patch
Permalink /patch/265653/
State New
Headers show

Comments

Rusty Russell - Aug. 8, 2013, 5:15 a.m.
Virtio is currently defined to work as "guest endian", but this is a
problem if the guest can change endian.  As most targets can't change
endian, we make it a per-target option to avoid pessimising.

This is based on a simpler patch by Anthony Liguouri, which only handled
the vring accesses.  We also need some drivers to access these helpers,
eg. for data which contains headers.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 hw/virtio/virtio.c                |  46 +++++++++----
 include/hw/virtio/virtio-access.h | 138 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 170 insertions(+), 14 deletions(-)
 create mode 100644 include/hw/virtio/virtio-access.h
Anthony Liguori - Aug. 8, 2013, 1:31 p.m.
Rusty Russell <rusty@rustcorp.com.au> writes:

> Virtio is currently defined to work as "guest endian", but this is a
> problem if the guest can change endian.  As most targets can't change
> endian, we make it a per-target option to avoid pessimising.
>
> This is based on a simpler patch by Anthony Liguouri, which only handled
> the vring accesses.  We also need some drivers to access these helpers,
> eg. for data which contains headers.
>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> ---
>  hw/virtio/virtio.c                |  46 +++++++++----
>  include/hw/virtio/virtio-access.h | 138 ++++++++++++++++++++++++++++++++++++++
>  2 files changed, 170 insertions(+), 14 deletions(-)
>  create mode 100644 include/hw/virtio/virtio-access.h
>
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 8176c14..2887f17 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -18,6 +18,7 @@
>  #include "hw/virtio/virtio.h"
>  #include "qemu/atomic.h"
>  #include "hw/virtio/virtio-bus.h"
> +#include "hw/virtio/virtio-access.h"
>  
>  /* The alignment to use between consumer and producer parts of vring.
>   * x86 pagesize again. */
> @@ -84,6 +85,20 @@ struct VirtQueue
>      EventNotifier host_notifier;
>  };
>  
> +#ifdef TARGET_VIRTIO_SWAPENDIAN
> +bool virtio_byteswap;
> +
> +/* Ask target code if we should swap endian for all vring and config access. */
> +static void mark_endian(void)
> +{
> +    virtio_byteswap = virtio_swap_endian();
> +}
> +#else
> +static void mark_endian(void)
> +{
> +}
> +#endif
> +

It would be very good to avoid a target specific define here.  We would
like to move to only building a single copy of the virtio code.

We have a mechanism to do weak functions via stubs/.  I think it would
be better to do cpu_get_byteswap() as a stub function and then overload
it in the ppc64 code.

>  /* virt queue functions */
>  static void virtqueue_init(VirtQueue *vq)
>  {
> @@ -100,49 +115,49 @@ static inline uint64_t vring_desc_addr(hwaddr desc_pa, int i)
>  {
>      hwaddr pa;
>      pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
> -    return ldq_phys(pa);
> +    return virtio_ldq_phys(pa);
>  }
>  
>  static inline uint32_t vring_desc_len(hwaddr desc_pa, int i)
>  {
>      hwaddr pa;
>      pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
> -    return ldl_phys(pa);
> +    return virtio_ldl_phys(pa);
>  }
>  
>  static inline uint16_t vring_desc_flags(hwaddr desc_pa, int i)
>  {
>      hwaddr pa;
>      pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
> -    return lduw_phys(pa);
> +    return virtio_lduw_phys(pa);
>  }
>  
>  static inline uint16_t vring_desc_next(hwaddr desc_pa, int i)
>  {
>      hwaddr pa;
>      pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
> -    return lduw_phys(pa);
> +    return virtio_lduw_phys(pa);
>  }
>  
>  static inline uint16_t vring_avail_flags(VirtQueue *vq)
>  {
>      hwaddr pa;
>      pa = vq->vring.avail + offsetof(VRingAvail, flags);
> -    return lduw_phys(pa);
> +    return virtio_lduw_phys(pa);
>  }
>  
>  static inline uint16_t vring_avail_idx(VirtQueue *vq)
>  {
>      hwaddr pa;
>      pa = vq->vring.avail + offsetof(VRingAvail, idx);
> -    return lduw_phys(pa);
> +    return virtio_lduw_phys(pa);
>  }
>  
>  static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
>  {
>      hwaddr pa;
>      pa = vq->vring.avail + offsetof(VRingAvail, ring[i]);
> -    return lduw_phys(pa);
> +    return virtio_lduw_phys(pa);
>  }
>  
>  static inline uint16_t vring_used_event(VirtQueue *vq)
> @@ -154,42 +169,42 @@ static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
>  {
>      hwaddr pa;
>      pa = vq->vring.used + offsetof(VRingUsed, ring[i].id);
> -    stl_phys(pa, val);
> +    virtio_stl_phys(pa, val);
>  }
>  
>  static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val)
>  {
>      hwaddr pa;
>      pa = vq->vring.used + offsetof(VRingUsed, ring[i].len);
> -    stl_phys(pa, val);
> +    virtio_stl_phys(pa, val);
>  }
>  
>  static uint16_t vring_used_idx(VirtQueue *vq)
>  {
>      hwaddr pa;
>      pa = vq->vring.used + offsetof(VRingUsed, idx);
> -    return lduw_phys(pa);
> +    return virtio_lduw_phys(pa);
>  }
>  
>  static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val)
>  {
>      hwaddr pa;
>      pa = vq->vring.used + offsetof(VRingUsed, idx);
> -    stw_phys(pa, val);
> +    virtio_stw_phys(pa, val);
>  }
>  
>  static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
>  {
>      hwaddr pa;
>      pa = vq->vring.used + offsetof(VRingUsed, flags);
> -    stw_phys(pa, lduw_phys(pa) | mask);
> +    virtio_stw_phys(pa, virtio_lduw_phys(pa) | mask);
>  }
>  
>  static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask)
>  {
>      hwaddr pa;
>      pa = vq->vring.used + offsetof(VRingUsed, flags);
> -    stw_phys(pa, lduw_phys(pa) & ~mask);
> +    virtio_stw_phys(pa, virtio_lduw_phys(pa) & ~mask);
>  }
>  
>  static inline void vring_avail_event(VirtQueue *vq, uint16_t val)
> @@ -199,7 +214,7 @@ static inline void vring_avail_event(VirtQueue *vq, uint16_t val)
>          return;
>      }
>      pa = vq->vring.used + offsetof(VRingUsed, ring[vq->vring.num]);
> -    stw_phys(pa, val);
> +    virtio_stw_phys(pa, val);
>  }
>  
>  void virtio_queue_set_notification(VirtQueue *vq, int enable)
> @@ -525,6 +540,9 @@ void virtio_set_status(VirtIODevice *vdev, uint8_t val)
>      VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
>      trace_virtio_set_status(vdev, val);
>  
> +    /* If guest virtio endian is uncertain, set it now. */
> +    mark_endian();
> +
>      if (k->set_status) {
>          k->set_status(vdev, val);
>      }
> diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
> new file mode 100644
> index 0000000..b1d531e
> --- /dev/null
> +++ b/include/hw/virtio/virtio-access.h
> @@ -0,0 +1,138 @@
> +/*
> + * Virtio Accessor Support: In case your target can change endian.
> + *
> + * Copyright IBM, Corp. 2013
> + *
> + * Authors:
> + *  Rusty Russell   <rusty@au.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +#ifndef _QEMU_VIRTIO_ACCESS_H
> +#define _QEMU_VIRTIO_ACCESS_H
> +
> +#ifdef TARGET_VIRTIO_SWAPENDIAN
> +/* Architectures which need biendian define this function. */
> +extern bool virtio_swap_endian(void);
> +
> +extern bool virtio_byteswap;
> +#else
> +#define virtio_byteswap false
> +#endif

I suspect this is a premature optimization.  With a weak function called
directly in the accessors below, I suspect you would see no measurable
performance overhead compared to this approach.

It's all very predictable so the CPU should do a decent job optimizing
the if () away.

Regards,

Anthony Liguori
Andreas Färber - Aug. 8, 2013, 2:28 p.m.
Am 08.08.2013 15:31, schrieb Anthony Liguori:
> Rusty Russell <rusty@rustcorp.com.au> writes:
> 
>> Virtio is currently defined to work as "guest endian", but this is a
>> problem if the guest can change endian.  As most targets can't change
>> endian, we make it a per-target option to avoid pessimising.
>>
>> This is based on a simpler patch by Anthony Liguouri, which only handled
>> the vring accesses.  We also need some drivers to access these helpers,
>> eg. for data which contains headers.
>>
>> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
>> ---
>>  hw/virtio/virtio.c                |  46 +++++++++----
>>  include/hw/virtio/virtio-access.h | 138 ++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 170 insertions(+), 14 deletions(-)
>>  create mode 100644 include/hw/virtio/virtio-access.h
>>
>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>> index 8176c14..2887f17 100644
>> --- a/hw/virtio/virtio.c
>> +++ b/hw/virtio/virtio.c
>> @@ -18,6 +18,7 @@
>>  #include "hw/virtio/virtio.h"
>>  #include "qemu/atomic.h"
>>  #include "hw/virtio/virtio-bus.h"
>> +#include "hw/virtio/virtio-access.h"
>>  
>>  /* The alignment to use between consumer and producer parts of vring.
>>   * x86 pagesize again. */
>> @@ -84,6 +85,20 @@ struct VirtQueue
>>      EventNotifier host_notifier;
>>  };
>>  
>> +#ifdef TARGET_VIRTIO_SWAPENDIAN
>> +bool virtio_byteswap;
>> +
>> +/* Ask target code if we should swap endian for all vring and config access. */
>> +static void mark_endian(void)
>> +{
>> +    virtio_byteswap = virtio_swap_endian();
>> +}
>> +#else
>> +static void mark_endian(void)
>> +{
>> +}
>> +#endif
>> +
> 
> It would be very good to avoid a target specific define here.  We would
> like to move to only building a single copy of the virtio code.

+1

> We have a mechanism to do weak functions via stubs/.  I think it would
> be better to do cpu_get_byteswap() as a stub function and then overload
> it in the ppc64 code.

If this as your name indicates is a per-CPU function then it should go
into CPUClass. Interesting question is, what is virtio supposed to do if
we have two ppc CPUs, one is Big Endian, the other is Little Endian.
We'd need to check current_cpu then, which for Xen is always NULL.

Andreas
Anthony Liguori - Aug. 8, 2013, 3:40 p.m.
Andreas Färber <afaerber@suse.de> writes:

> Am 08.08.2013 15:31, schrieb Anthony Liguori:
>> Rusty Russell <rusty@rustcorp.com.au> writes:
>> 
>>> Virtio is currently defined to work as "guest endian", but this is a
>>> problem if the guest can change endian.  As most targets can't change
>>> endian, we make it a per-target option to avoid pessimising.
>>>
>>> This is based on a simpler patch by Anthony Liguouri, which only handled
>>> the vring accesses.  We also need some drivers to access these helpers,
>>> eg. for data which contains headers.
>>>
>>> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
>>> ---
>>>  hw/virtio/virtio.c                |  46 +++++++++----
>>>  include/hw/virtio/virtio-access.h | 138 ++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 170 insertions(+), 14 deletions(-)
>>>  create mode 100644 include/hw/virtio/virtio-access.h
>>>
>>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>>> index 8176c14..2887f17 100644
>>> --- a/hw/virtio/virtio.c
>>> +++ b/hw/virtio/virtio.c
>>> @@ -18,6 +18,7 @@
>>>  #include "hw/virtio/virtio.h"
>>>  #include "qemu/atomic.h"
>>>  #include "hw/virtio/virtio-bus.h"
>>> +#include "hw/virtio/virtio-access.h"
>>>  
>>>  /* The alignment to use between consumer and producer parts of vring.
>>>   * x86 pagesize again. */
>>> @@ -84,6 +85,20 @@ struct VirtQueue
>>>      EventNotifier host_notifier;
>>>  };
>>>  
>>> +#ifdef TARGET_VIRTIO_SWAPENDIAN
>>> +bool virtio_byteswap;
>>> +
>>> +/* Ask target code if we should swap endian for all vring and config access. */
>>> +static void mark_endian(void)
>>> +{
>>> +    virtio_byteswap = virtio_swap_endian();
>>> +}
>>> +#else
>>> +static void mark_endian(void)
>>> +{
>>> +}
>>> +#endif
>>> +
>> 
>> It would be very good to avoid a target specific define here.  We would
>> like to move to only building a single copy of the virtio code.
>
> +1
>
>> We have a mechanism to do weak functions via stubs/.  I think it would
>> be better to do cpu_get_byteswap() as a stub function and then overload
>> it in the ppc64 code.
>
> If this as your name indicates is a per-CPU function then it should go
> into CPUClass. Interesting question is, what is virtio supposed to do if
> we have two ppc CPUs, one is Big Endian, the other is Little Endian.

PPC64 is big endian.  AFAIK, there is no such thing as a little endian
PPC64 processor.

This is just a processor mode where loads/stores are byte lane swapped.
Hence the name 'cpu_get_byteswap'.  It's just asking whether the
load/stores are being swapped or not.

At least for PPC64, it's not possible to enable/disable byte lane
swapping for individual CPUs.  It's done through a system-wide hcall.

FWIW, I think most bi-endian architectures are this way too so I think
this is equally applicable to ARM.  Peter, is that right?

Regards,

Anthony Liguori

> We'd need to check current_cpu then, which for Xen is always NULL.
>
> Andreas
>
> -- 
> SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
Daniel P. Berrange - Aug. 8, 2013, 3:45 p.m.
On Thu, Aug 08, 2013 at 10:40:28AM -0500, Anthony Liguori wrote:
> Andreas Färber <afaerber@suse.de> writes:
> >> We have a mechanism to do weak functions via stubs/.  I think it would
> >> be better to do cpu_get_byteswap() as a stub function and then overload
> >> it in the ppc64 code.
> >
> > If this as your name indicates is a per-CPU function then it should go
> > into CPUClass. Interesting question is, what is virtio supposed to do if
> > we have two ppc CPUs, one is Big Endian, the other is Little Endian.
> 
> PPC64 is big endian.  AFAIK, there is no such thing as a little endian
> PPC64 processor.

Unless I'm misunderstanding, this thread seems to suggest otherwise:

  "[Qemu-devel] [PATCH 0/5] 64bit PowerPC little endian support"

  https://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg00813.html


Daniel
Peter Maydell - Aug. 8, 2013, 3:48 p.m.
On 8 August 2013 16:40, Anthony Liguori <anthony@codemonkey.ws> wrote:
> PPC64 is big endian.  AFAIK, there is no such thing as a little endian
> PPC64 processor.

What's your definition of "little endian processor" here if
it isn't "one which is doing byte swaps of data"? I would
describe a PPC64 with the relevant mode bit set as "little
endian".

> This is just a processor mode where loads/stores are byte lane swapped.
> Hence the name 'cpu_get_byteswap'.  It's just asking whether the
> load/stores are being swapped or not.
>
> At least for PPC64, it's not possible to enable/disable byte lane
> swapping for individual CPUs.  It's done through a system-wide hcall.
>
> FWIW, I think most bi-endian architectures are this way too so I think
> this is equally applicable to ARM.  Peter, is that right?

ARM's bi-endian story is complicated and depends on the CPU.
Older CPUs didn't do byte-lane swapping of data; instead
when the CPU was configured in "big endian" mode they would
do address munging (XOR the address with 3 when doing a byte
access; XOR with 1 for halfword access). New CPUs do byte-lane
swapping (but only for data, not for instruction fetch).
CPUs in the transition period can do either.

In all cases, this is a per-cpu-core thing: you can have
one core configured to be bigendian and the other little
endian. You could in theory have the OS support both big
and little endian userspace processes. We ideally would
want to support "QEMU is a little endian process but the
VM's vcpu is currently bigendian".

Fuller writeup here:
http://translatedcode.wordpress.com/2012/04/02/this-end-up/

-- PMM
Anthony Liguori - Aug. 8, 2013, 4:07 p.m.
"Daniel P. Berrange" <berrange@redhat.com> writes:

> On Thu, Aug 08, 2013 at 10:40:28AM -0500, Anthony Liguori wrote:
>> Andreas Färber <afaerber@suse.de> writes:
>> >> We have a mechanism to do weak functions via stubs/.  I think it would
>> >> be better to do cpu_get_byteswap() as a stub function and then overload
>> >> it in the ppc64 code.
>> >
>> > If this as your name indicates is a per-CPU function then it should go
>> > into CPUClass. Interesting question is, what is virtio supposed to do if
>> > we have two ppc CPUs, one is Big Endian, the other is Little Endian.
>> 
>> PPC64 is big endian.  AFAIK, there is no such thing as a little endian
>> PPC64 processor.
>
> Unless I'm misunderstanding, this thread seems to suggest otherwise:
>
>   "[Qemu-devel] [PATCH 0/5] 64bit PowerPC little endian support"
>
>   https://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg00813.html

Yeah, it's confusing.  It feels like little endian to most software but
the distinction in hardware (and therefore QEMU) is important.

It's the same processor.  It still starts executing big endian
instructions.  A magic register value is tweaked and loads/stores are
swapped.  CPU data structures are still read as big endian though.  It's
really just load/stores that are affected.

The distinction is important in QEMU.  ppc64 is still
TARGET_WORDS_BIGENDIAN.  We still want most stl_phys to treat integers
as big endian.  There's just this extra concept that CPU loads/stores
are sometimes byte swapped.  That affects virtio but not a lot else.

Regards,

Anthony Liguori

>
>
> Daniel
> -- 
> |: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
> |: http://libvirt.org              -o-             http://virt-manager.org :|
> |: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
> |: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|
Anthony Liguori - Aug. 8, 2013, 4:11 p.m.
Peter Maydell <peter.maydell@linaro.org> writes:

> On 8 August 2013 16:40, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> PPC64 is big endian.  AFAIK, there is no such thing as a little endian
>> PPC64 processor.
>
> What's your definition of "little endian processor" here if
> it isn't "one which is doing byte swaps of data"? I would
> describe a PPC64 with the relevant mode bit set as "little
> endian".

Let's focus this to QEMU.  PPC64 is still TARGET_WORDS_BIGENDIAN.  It
would be totally wrong to make this change to either a function call or
to TARGET_WORDS_LITTLEENDIAN.

>> This is just a processor mode where loads/stores are byte lane swapped.
>> Hence the name 'cpu_get_byteswap'.  It's just asking whether the
>> load/stores are being swapped or not.
>>
>> At least for PPC64, it's not possible to enable/disable byte lane
>> swapping for individual CPUs.  It's done through a system-wide hcall.
>>
>> FWIW, I think most bi-endian architectures are this way too so I think
>> this is equally applicable to ARM.  Peter, is that right?
>
> ARM's bi-endian story is complicated and depends on the CPU.
> Older CPUs didn't do byte-lane swapping of data; instead
> when the CPU was configured in "big endian" mode they would
> do address munging (XOR the address with 3 when doing a byte
> access; XOR with 1 for halfword access). New CPUs do byte-lane
> swapping (but only for data, not for instruction fetch).
> CPUs in the transition period can do either.
>
> In all cases, this is a per-cpu-core thing: you can have
> one core configured to be bigendian and the other little
> endian. You could in theory have the OS support both big
> and little endian userspace processes. We ideally would
> want to support "QEMU is a little endian process but the
> VM's vcpu is currently bigendian".

Eek.  Yeah, I guess we need to tie this to a CPUState then.

>
> Fuller writeup here:
> http://translatedcode.wordpress.com/2012/04/02/this-end-up/

Excellent, thanks!

Regards,

Anthony Liguori

>
> -- PMM
Peter Maydell - Aug. 8, 2013, 4:14 p.m.
On 8 August 2013 17:07, Anthony Liguori <anthony@codemonkey.ws> wrote:
> It's the same processor.  It still starts executing big endian
> instructions.  A magic register value is tweaked and loads/stores are
> swapped.

I dunno about PPC, but for ARM generally the boot-up state is
controlled by config signals which a SoC or board can hardwire,
so you can have a SoC which is configured to start in big-endian
mode.

> CPU data structures are still read as big endian though.

Do you have an example of what you mean by "CPU data structure"?

> The distinction is important in QEMU.  ppc64 is still
> TARGET_WORDS_BIGENDIAN.

Ideally TARGET_WORDS_BIGENDIAN would go away -- it is forcing
at compile time a setting which is actually a runtime one,
and a lot of the weirdness here flows from that.

> We still want most stl_phys to treat integers
> as big endian.

Any stl_phys() should [in an ideal design] be tied to a
"bus master" which has its own idea of which endianness
it is. That is, an stl_phys() for a DMA controller model
ought to use the endianness programmed for the DMA controller,
not whatever the CPU happens to be using.

-- PMM
Andreas Färber - Aug. 8, 2013, 4:24 p.m.
Am 08.08.2013 17:40, schrieb Anthony Liguori:
> Andreas Färber <afaerber@suse.de> writes:
>> Am 08.08.2013 15:31, schrieb Anthony Liguori:
>>> We have a mechanism to do weak functions via stubs/.  I think it would
>>> be better to do cpu_get_byteswap() as a stub function and then overload
>>> it in the ppc64 code.
>>
>> If this as your name indicates is a per-CPU function then it should go
>> into CPUClass. Interesting question is, what is virtio supposed to do if
>> we have two ppc CPUs, one is Big Endian, the other is Little Endian.
> 
> PPC64 is big endian.  AFAIK, there is no such thing as a little endian
> PPC64 processor.
> 
> This is just a processor mode where loads/stores are byte lane swapped.
> Hence the name 'cpu_get_byteswap'.  It's just asking whether the
> load/stores are being swapped or not.

Exactly, just read it as "is in ... Endian mode". On the CPUs I am more
familiar with (e.g., 970), this used to be controlled via an MSR bit,
which as CPUPPCState::msr exists per CPUState. I did not check on real
hardware, but from the QEMU code this would allow for the mixed-endian
scenario described above.

> At least for PPC64, it's not possible to enable/disable byte lane
> swapping for individual CPUs.  It's done through a system-wide hcall.

What is offending me is only the following: If we name it
cpu_get_byteswap() as proposed by you, then its first argument should be
a CPUState *cpu. Its value would be read from the derived type's state,
such as the MSR bit in the code path that you wanted duplicated. The
function implementing that register-reading would be a hook in CPUClass,
with a default implementation in qom/cpu.c rather than a fallback in
stubs/. To access CPUClass, CPUState cannot be NULL, as brought up by
Stefano for cpu_do_unassigned_access(); not following that pattern
prevents mixing CPU architectures, which my large refactorings have
partially been about. Cf. my guest-memory-dump refactoring.

If it is just some random global value, then please don't call it
cpu_*(). Since sPAPR is not a target of its own, I don't see how/where
you want to implement that hcall query as per-target function either,
that might rather call for a QEMUMachine hook?

I don't care or argue about byte lanes here, I am just trying to keep
API design consistent and not regressing on the way to heterogeneous
emulation.

Regards,
Andreas

> FWIW, I think most bi-endian architectures are this way too so I think
> this is equally applicable to ARM.  Peter, is that right?
> 
> Regards,
> 
> Anthony Liguori
> 
>> We'd need to check current_cpu then, which for Xen is always NULL.
>>
>> Andreas
Anthony Liguori - Aug. 8, 2013, 4:25 p.m.
Peter Maydell <peter.maydell@linaro.org> writes:

> On 8 August 2013 17:07, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> It's the same processor.  It still starts executing big endian
>> instructions.  A magic register value is tweaked and loads/stores are
>> swapped.
>
> I dunno about PPC, but for ARM generally the boot-up state is
> controlled by config signals which a SoC or board can hardwire,
> so you can have a SoC which is configured to start in big-endian
> mode.
>
>> CPU data structures are still read as big endian though.
>
> Do you have an example of what you mean by "CPU data structure"?

MMU tlb hash table.  If you grep ldl target-ppc/* you'll see that there
are only a few cases where bswap occurs.

>> The distinction is important in QEMU.  ppc64 is still
>> TARGET_WORDS_BIGENDIAN.
>
> Ideally TARGET_WORDS_BIGENDIAN would go away -- it is forcing
> at compile time a setting which is actually a runtime one,
> and a lot of the weirdness here flows from that.
>
>> We still want most stl_phys to treat integers
>> as big endian.
>
> Any stl_phys() should [in an ideal design] be tied to a
> "bus master" which has its own idea of which endianness
> it is. That is, an stl_phys() for a DMA controller model
> ought to use the endianness programmed for the DMA controller,
> not whatever the CPU happens to be using.

We have the DMA API that attempts to do this but maybe we need to
generalize it a bit more...

I think it's pretty true that we need a context and that the context
for, say instruction fetch, is distinct from the context for load/store
instructions.

Regards,

Anthony Liguori

>
> -- PMM
Peter Maydell - Aug. 8, 2013, 4:30 p.m.
On 8 August 2013 17:25, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Peter Maydell <peter.maydell@linaro.org> writes:
>> On 8 August 2013 17:07, Anthony Liguori <anthony@codemonkey.ws> wrote:
>>> CPU data structures are still read as big endian though.
>>
>> Do you have an example of what you mean by "CPU data structure"?
>
> MMU tlb hash table.  If you grep ldl target-ppc/* you'll see that there
> are only a few cases where bswap occurs.

Oh, right, that sort of in-memory data structure. If I
understand correctly, the equivalent of that for ARM would
be the MMU translation tables; on ARM there's a system
control register bit for which endianness those are.

>> Any stl_phys() should [in an ideal design] be tied to a
>> "bus master" which has its own idea of which endianness
>> it is. That is, an stl_phys() for a DMA controller model
>> ought to use the endianness programmed for the DMA controller,
>> not whatever the CPU happens to be using.
>
> We have the DMA API that attempts to do this but maybe we need to
> generalize it a bit more...
>
> I think it's pretty true that we need a context and that the context
> for, say instruction fetch, is distinct from the context for load/store
> instructions.

A context might also give us a place to put other interesting
information which in hardware gets passed around as transaction
attributes on the bus, such as "is this a userspace or privileged
instruction".

-- PMM
Rusty Russell - Aug. 9, 2013, 12:08 a.m.
Andreas Färber <afaerber@suse.de> writes:
> Am 08.08.2013 15:31, schrieb Anthony Liguori:
>> Rusty Russell <rusty@rustcorp.com.au> writes:
>> 
>>> Virtio is currently defined to work as "guest endian", but this is a
>>> problem if the guest can change endian.  As most targets can't change
>>> endian, we make it a per-target option to avoid pessimising.
>>>
>>> This is based on a simpler patch by Anthony Liguouri, which only handled
>>> the vring accesses.  We also need some drivers to access these helpers,
>>> eg. for data which contains headers.
>>>
>>> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
>>> ---
>>>  hw/virtio/virtio.c                |  46 +++++++++----
>>>  include/hw/virtio/virtio-access.h | 138 ++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 170 insertions(+), 14 deletions(-)
>>>  create mode 100644 include/hw/virtio/virtio-access.h
>>>
>>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>>> index 8176c14..2887f17 100644
>>> --- a/hw/virtio/virtio.c
>>> +++ b/hw/virtio/virtio.c
>>> @@ -18,6 +18,7 @@
>>>  #include "hw/virtio/virtio.h"
>>>  #include "qemu/atomic.h"
>>>  #include "hw/virtio/virtio-bus.h"
>>> +#include "hw/virtio/virtio-access.h"
>>>  
>>>  /* The alignment to use between consumer and producer parts of vring.
>>>   * x86 pagesize again. */
>>> @@ -84,6 +85,20 @@ struct VirtQueue
>>>      EventNotifier host_notifier;
>>>  };
>>>  
>>> +#ifdef TARGET_VIRTIO_SWAPENDIAN
>>> +bool virtio_byteswap;
>>> +
>>> +/* Ask target code if we should swap endian for all vring and config access. */
>>> +static void mark_endian(void)
>>> +{
>>> +    virtio_byteswap = virtio_swap_endian();
>>> +}
>>> +#else
>>> +static void mark_endian(void)
>>> +{
>>> +}
>>> +#endif
>>> +
>> 
>> It would be very good to avoid a target specific define here.  We would
>> like to move to only building a single copy of the virtio code.
>
> +1
>
>> We have a mechanism to do weak functions via stubs/.  I think it would
>> be better to do cpu_get_byteswap() as a stub function and then overload
>> it in the ppc64 code.
>
> If this as your name indicates is a per-CPU function then it should go
> into CPUClass. Interesting question is, what is virtio supposed to do if
> we have two ppc CPUs, one is Big Endian, the other is Little Endian.
> We'd need to check current_cpu then, which for Xen is always NULL.

This is why the check is performed on a random CPU when they first
acknowledge the device.  It's a hacky assumption, but that's why there's
a proposal to nail virtio to LE for the 1.0 OASIS standard.

You can't actually change endian of a virtio device in flight: it
doesn't make sense since there's readable state there already.

Cheers,
Rusty.
Rusty Russell - Aug. 9, 2013, 2:58 a.m.
Anthony Liguori <anthony@codemonkey.ws> writes:
> "Daniel P. Berrange" <berrange@redhat.com> writes:
>
>> On Thu, Aug 08, 2013 at 10:40:28AM -0500, Anthony Liguori wrote:
>>> Andreas Färber <afaerber@suse.de> writes:
>>> >> We have a mechanism to do weak functions via stubs/.  I think it would
>>> >> be better to do cpu_get_byteswap() as a stub function and then overload
>>> >> it in the ppc64 code.
>>> >
>>> > If this as your name indicates is a per-CPU function then it should go
>>> > into CPUClass. Interesting question is, what is virtio supposed to do if
>>> > we have two ppc CPUs, one is Big Endian, the other is Little Endian.
>>> 
>>> PPC64 is big endian.  AFAIK, there is no such thing as a little endian
>>> PPC64 processor.
>>
>> Unless I'm misunderstanding, this thread seems to suggest otherwise:
>>
>>   "[Qemu-devel] [PATCH 0/5] 64bit PowerPC little endian support"
>>
>>   https://lists.nongnu.org/archive/html/qemu-devel/2013-08/msg00813.html
>
> Yeah, it's confusing.  It feels like little endian to most software but
> the distinction in hardware (and therefore QEMU) is important.
>
> It's the same processor.  It still starts executing big endian
> instructions.  A magic register value is tweaked and loads/stores are
> swapped.  CPU data structures are still read as big endian though.  It's
> really just load/stores that are affected.
>
> The distinction is important in QEMU.  ppc64 is still
> TARGET_WORDS_BIGENDIAN.  We still want most stl_phys to treat integers
> as big endian.  There's just this extra concept that CPU loads/stores
> are sometimes byte swapped.  That affects virtio but not a lot else.

You've redefined endian here; please don't do that.  Endian is the order
in memory which a CPU does loads and stores.  From any reasonable
definition, PPC is bi-endian.

It's actually a weird thing for the qemu core to know at all: almost
everything which cares is in target-specific code.  The exceptions are
gdb stubs and virtio, both of which are "native endian" (and that weird
code in exec.c: what is notdirty_mem_write?).

Your argument that we shouldn't fix stl_* might be justifiable (ie. just
hack virtio and gdb as one-offs), but it's neither clear nor "least
surprise".

Chers,
Rusty.
Rusty Russell - Aug. 9, 2013, 7:35 a.m.
Andreas Färber <afaerber@suse.de> writes:
> Am 08.08.2013 17:40, schrieb Anthony Liguori:
>> Andreas Färber <afaerber@suse.de> writes:
>>> Am 08.08.2013 15:31, schrieb Anthony Liguori:
>>>> We have a mechanism to do weak functions via stubs/.  I think it would
>>>> be better to do cpu_get_byteswap() as a stub function and then overload
>>>> it in the ppc64 code.
>>>
>>> If this as your name indicates is a per-CPU function then it should go
>>> into CPUClass. Interesting question is, what is virtio supposed to do if
>>> we have two ppc CPUs, one is Big Endian, the other is Little Endian.
>> 
>> PPC64 is big endian.  AFAIK, there is no such thing as a little endian
>> PPC64 processor.
>> 
>> This is just a processor mode where loads/stores are byte lane swapped.
>> Hence the name 'cpu_get_byteswap'.  It's just asking whether the
>> load/stores are being swapped or not.
>
> Exactly, just read it as "is in ... Endian mode". On the CPUs I am more
> familiar with (e.g., 970), this used to be controlled via an MSR bit,
> which as CPUPPCState::msr exists per CPUState. I did not check on real
> hardware, but from the QEMU code this would allow for the mixed-endian
> scenario described above.
>
>> At least for PPC64, it's not possible to enable/disable byte lane
>> swapping for individual CPUs.  It's done through a system-wide hcall.
>
> What is offending me is only the following: If we name it
> cpu_get_byteswap() as proposed by you, then its first argument should be
> a CPUState *cpu. Its value would be read from the derived type's state,
> such as the MSR bit in the code path that you wanted duplicated. The
> function implementing that register-reading would be a hook in CPUClass,
> with a default implementation in qom/cpu.c rather than a fallback in
> stubs/. To access CPUClass, CPUState cannot be NULL, as brought up by
> Stefano for cpu_do_unassigned_access(); not following that pattern
> prevents mixing CPU architectures, which my large refactorings have
> partially been about. Cf. my guest-memory-dump refactoring.
>
> If it is just some random global value, then please don't call it
> cpu_*(). Since sPAPR is not a target of its own, I don't see how/where
> you want to implement that hcall query as per-target function either,
> that might rather call for a QEMUMachine hook?
>
> I don't care or argue about byte lanes here, I am just trying to keep
> API design consistent and not regressing on the way to heterogeneous
> emulation.

That's a lot of replumbing and indirect function calls for a fairly
obscure case.  We certainly don't have a nice CPUState lying around in
virtio at the moment, for example.

I can try to plumb this in if there's consensus, but I suspect it's
making the job 10x harder.

(The next logical step would be for st* and ld* to take the cpu to query
its endianness, Anthony's weird ideas notwithstanding).

Cheers,
Rusty.
Peter Maydell - Aug. 9, 2013, 7:42 a.m.
On 9 August 2013 08:35, Rusty Russell <rusty@rustcorp.com.au> wrote:
> That's a lot of replumbing and indirect function calls for a fairly
> obscure case.  We certainly don't have a nice CPUState lying around in
> virtio at the moment, for example.

Actually if you're in an IO routine you do: it will be in the
global variable 'current_cpu'. Most IO functions don't need this,
but it's what we use for things like "this device behaviour varies
depending on which CPU accesses it".

-- PMM
Benjamin Herrenschmidt - Aug. 9, 2013, 7:49 a.m.
On Fri, 2013-08-09 at 17:05 +0930, Rusty Russell wrote:

> > Exactly, just read it as "is in ... Endian mode". On the CPUs I am more
> > familiar with (e.g., 970), this used to be controlled via an MSR bit,

970 didn't have an LE mode :-)

> > which as CPUPPCState::msr exists per CPUState. I did not check on real
> > hardware, but from the QEMU code this would allow for the mixed-endian
> > scenario described above.

This whole exercise should have nothing to do with the current endian
mode of the CPU. If for example you are running lx86 (the x86 emulator
IBM provides) which exploits MSR:LE on POWER7 to run x86 binaries in
userspace, you don't want virtio to suddenly change endian !

The information we care about is the endianness of the operating system.

The most logical way to infer it is a different bit, which used to be
MSR:ILE and is now in LPCR for guests and controlled via a hypercall on
pseries, which indicates what is the endianness of interrupt vectors.

IE. It indicates how the cpu should set MSR:LE when taking an interrupt,
regardless of what the current MSR:LE value is at any given point in
time.

So what should be done in fact is whenever *that* bit is changed
(currently via hcall, maybe via MSR:ILE if we emulate that on older
models or LPCR when we emulate that), then the qemu cpu model can "call
out" to change the "OS endianness" which we can propagate to virtio.

Anything trying to do stuff based on the "current" endianness in the MSR
sounds like a cesspit to me.

> (The next logical step would be for st* and ld* to take the cpu to query
> its endianness, Anthony's weird ideas notwithstanding).

Why would we even consider something like that ?

Ben.
Peter Maydell - Aug. 9, 2013, 8:05 a.m.
On 9 August 2013 03:58, Rusty Russell <rusty@rustcorp.com.au> wrote:
> Anthony Liguori <anthony@codemonkey.ws> writes:
>> The distinction is important in QEMU.  ppc64 is still
>> TARGET_WORDS_BIGENDIAN.  We still want most stl_phys to treat integers
>> as big endian.  There's just this extra concept that CPU loads/stores
>> are sometimes byte swapped.  That affects virtio but not a lot else.
>
> You've redefined endian here; please don't do that.  Endian is the order
> in memory which a CPU does loads and stores.

Agreed (subject to the complicating factor that it's possible for
a CPU to have the order to be different for data, instruction and
TLB walk loads, so it's not a single setting for a CPU).

> From any reasonable definition, PPC is bi-endian.
>
> It's actually a weird thing for the qemu core to know at all:

It's a TCG performance optimisation and/or simplification hack,
basically -- by hardcoding the endianness we think the core is at
compile time we can either always-byteswap or never-byteswap in the
fast paths.

> almost
> everything which cares is in target-specific code.  The exceptions are
> gdb stubs and virtio, both of which are "native endian" (and that weird
> code in exec.c: what is notdirty_mem_write?).

The code for actually doing TCG CPU loads and stores cares too.

notdirty_mem_write is (I think) part of how we handle self-modifying
code: if you write to a page in memory that we have translated some
code from, then we need to intercept that write so that we can throw
away the translated code. notdirty_mem_write() is the function that
does that interception, throws away the code, figures out if we still
need to intercept next time around, and actually does the access
the guest asked for. (It might also be used for spotting when the
guest touches memory during migration and thus that page needs to be
retransmitted -- I haven't checked.)

-- PMM
Anthony Liguori - Aug. 9, 2013, 2:16 p.m.
Rusty Russell <rusty@rustcorp.com.au> writes:

> Anthony Liguori <anthony@codemonkey.ws> writes:
>> "Daniel P. Berrange" <berrange@redhat.com> writes:
>>
>> The distinction is important in QEMU.  ppc64 is still
>> TARGET_WORDS_BIGENDIAN.  We still want most stl_phys to treat integers
>> as big endian.  There's just this extra concept that CPU loads/stores
>> are sometimes byte swapped.  That affects virtio but not a lot else.
>
> You've redefined endian here; please don't do that.  Endian is the order
> in memory which a CPU does loads and stores.  From any reasonable
> definition, PPC is bi-endian.
>
> It's actually a weird thing for the qemu core to know at all: almost
> everything which cares is in target-specific code.  The exceptions are
> gdb stubs and virtio, both of which are "native endian" (and that weird
> code in exec.c: what is notdirty_mem_write?).
>
> Your argument that we shouldn't fix stl_* might be justifiable (ie. just
> hack virtio and gdb as one-offs), but it's neither clear nor "least
> surprise".

That's not what I'm suggesting.

I'm suggesting that we should introduce multiple variants of {ld,st}*
for different types of memory access.

These are bad names, but I'm thinking along the lines of:

/* Read a word as the load/store instructions would */
cpu_ldst_ldw()

/* Read a word as the instruction fetch unit would */
cpu_fetch_ldw()

/* Read a word as the hardware MMU would */
cpu_mmu_ldw()

Peter was suggesting that instead of having separate functions, we
should use a context:

ldw(cpu->ldst, ..)
ldw(cpu->fetch, ..)
...

I think I prefer functions though over a context.  But this is really
about TCG, not virtio.  As Ben pointed out, virtio endianness needs to
be independent of CPUs.  We process the ring outside of a specific CPU
context and it's possible that if we pick an arbitrary one, it will be
in the wrong context (if running BE userspace).

The only real problem I have with your original patch is putting virtio
knowledge in the target code.  I think your adjusted version is fine.

Regards,

Anthony Liguori

>
> Chers,
> Rusty.
Andreas Färber - Aug. 9, 2013, 3:15 p.m.
Am 09.08.2013 09:35, schrieb Rusty Russell:
> Andreas Färber <afaerber@suse.de> writes:
>> [...] If we name it
>> cpu_get_byteswap() as proposed by you, then its first argument should be
>> a CPUState *cpu. Its value would be read from the derived type's state,
>> such as the MSR bit in the code path that you wanted duplicated. The
>> function implementing that register-reading would be a hook in CPUClass,
>> with a default implementation in qom/cpu.c rather than a fallback in
>> stubs/. To access CPUClass, CPUState cannot be NULL, as brought up by
>> Stefano for cpu_do_unassigned_access(); not following that pattern
>> prevents mixing CPU architectures, which my large refactorings have
>> partially been about. Cf. my guest-memory-dump refactoring.
>>
>> If it is just some random global value, then please don't call it
>> cpu_*(). Since sPAPR is not a target of its own, I don't see how/where
>> you want to implement that hcall query as per-target function either,
>> that might rather call for a QEMUMachine hook?
>>
>> I don't care or argue about byte lanes here, I am just trying to keep
>> API design consistent and not regressing on the way to heterogeneous
>> emulation.
> 
> That's a lot of replumbing and indirect function calls for a fairly
> obscure case.

It's how QOM methods generally work. And yes, little endian ppc64 is in
fact a pretty obscure case. But IBM was just recently reported to adopt
the IP licensing model like ARM, so chances are we will see the same
mixed-core scenarios as with ARM + MicroBlaze/SuperH these days.

http://news.techeye.net/chips/ibms-launches-intel-server-challenge

Generally the problem is that we can't have multiple same-named global
functions when combining multiple targets, so we need a way to dispatch
- either from the individual CPU or from the machine. I would assume in
practice mixed cores will have the same endianness.

Or by making endianness a user-tweakable property of the virtio devices
rather than trying to auto-detect it.

>  We certainly don't have a nice CPUState lying around in
> virtio at the moment, for example.

Compare
http://git.qemu.org/?p=qemu.git;a=commit;h=c658b94f6e8c206c59d02aa6fbac285b86b53d2c

cpu_single_env has since been renamed to the mentioned current_cpu and
been changed to CPUState type.

> I can try to plumb this in if there's consensus, but I suspect it's
> making the job 10x harder.

I doubt it's that complicated, estimated less than ten minutes for me,
and not doing it is making the other job significantly harder.
cpu_get_dump_info() is already a hard nut to crack.

Regards,
Andreas
Rusty Russell - Aug. 12, 2013, 12:28 a.m.
Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:
> This whole exercise should have nothing to do with the current endian
> mode of the CPU. If for example you are running lx86 (the x86 emulator
> IBM provides) which exploits MSR:LE on POWER7 to run x86 binaries in
> userspace, you don't want virtio to suddenly change endian !
>
> The information we care about is the endianness of the operating system.

Which is why my original patches nabbed the endianness when the target
updated the virtio device status.

You're making an assumption about the nature of the guest, that they
don't pass the virtio device directly through to userspace.

I don't care, though.  The point is to make something which works, until
the Real Fix (LE virtio).

> The most logical way to infer it is a different bit, which used to be
> MSR:ILE and is now in LPCR for guests and controlled via a hypercall on
> pseries, which indicates what is the endianness of interrupt vectors.
>
> IE. It indicates how the cpu should set MSR:LE when taking an interrupt,
> regardless of what the current MSR:LE value is at any given point in
> time.
>
> So what should be done in fact is whenever *that* bit is changed
> (currently via hcall, maybe via MSR:ILE if we emulate that on older
> models or LPCR when we emulate that), then the qemu cpu model can "call
> out" to change the "OS endianness" which we can propagate to virtio.
>
> Anything trying to do stuff based on the "current" endianness in the MSR
> sounds like a cesspit to me.

OK.  What should Anton's gdb stub do then?

Cheers,
Rusty.
Benjamin Herrenschmidt - Aug. 12, 2013, 12:49 a.m.
On Mon, 2013-08-12 at 09:58 +0930, Rusty Russell wrote:
> Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:
> > This whole exercise should have nothing to do with the current endian
> > mode of the CPU. If for example you are running lx86 (the x86 emulator
> > IBM provides) which exploits MSR:LE on POWER7 to run x86 binaries in
> > userspace, you don't want virtio to suddenly change endian !
> >
> > The information we care about is the endianness of the operating system.
> 
> Which is why my original patches nabbed the endianness when the target
> updated the virtio device status.
> 
> You're making an assumption about the nature of the guest, that they
> don't pass the virtio device directly through to userspace.

Two points here:

 - Userspace is VERY likely to have the same endianness as the operating
system.

 - The case where we might support "foreign endian" userspace *and* pass
virtio directly to it *and* give a shit about virtio v1.0 doesn't exist
anywhere but your imagination right now :-)

> I don't care, though.  The point is to make something which works, until
> the Real Fix (LE virtio).

Exactly.

> > The most logical way to infer it is a different bit, which used to be
> > MSR:ILE and is now in LPCR for guests and controlled via a hypercall on
> > pseries, which indicates what is the endianness of interrupt vectors.
> >
> > IE. It indicates how the cpu should set MSR:LE when taking an interrupt,
> > regardless of what the current MSR:LE value is at any given point in
> > time.
> >
> > So what should be done in fact is whenever *that* bit is changed
> > (currently via hcall, maybe via MSR:ILE if we emulate that on older
> > models or LPCR when we emulate that), then the qemu cpu model can "call
> > out" to change the "OS endianness" which we can propagate to virtio.
> >
> > Anything trying to do stuff based on the "current" endianness in the MSR
> > sounds like a cesspit to me.
> 
> OK.  What should Anton's gdb stub do then?

Something else. It's a different problem and needs a different solution.

For one, I think, we should first fix the root problem with gdb (tagging
endianness in the protocol etc...) and once that's done, look at what
band-aid can be applied for old stuff if we care at all (it's not like
LE ppc64 is going to not require a new gdb anyway).

Cheers,
Ben.
Rusty Russell - Aug. 12, 2013, 7:49 a.m.
Peter Maydell <peter.maydell@linaro.org> writes:
> On 9 August 2013 08:35, Rusty Russell <rusty@rustcorp.com.au> wrote:
>> That's a lot of replumbing and indirect function calls for a fairly
>> obscure case.  We certainly don't have a nice CPUState lying around in
>> virtio at the moment, for example.
>
> Actually if you're in an IO routine you do: it will be in the
> global variable 'current_cpu'. Most IO functions don't need this,
> but it's what we use for things like "this device behaviour varies
> depending on which CPU accesses it".

Hmm, that was NULL for me when called from virtio.  I have stuck with
first_cpu in the ppc64 case.

Patches coming,
Rusty.

Patch

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 8176c14..2887f17 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -18,6 +18,7 @@ 
 #include "hw/virtio/virtio.h"
 #include "qemu/atomic.h"
 #include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
 
 /* The alignment to use between consumer and producer parts of vring.
  * x86 pagesize again. */
@@ -84,6 +85,20 @@  struct VirtQueue
     EventNotifier host_notifier;
 };
 
+#ifdef TARGET_VIRTIO_SWAPENDIAN
+bool virtio_byteswap;
+
+/* Ask target code if we should swap endian for all vring and config access. */
+static void mark_endian(void)
+{
+    virtio_byteswap = virtio_swap_endian();
+}
+#else
+static void mark_endian(void)
+{
+}
+#endif
+
 /* virt queue functions */
 static void virtqueue_init(VirtQueue *vq)
 {
@@ -100,49 +115,49 @@  static inline uint64_t vring_desc_addr(hwaddr desc_pa, int i)
 {
     hwaddr pa;
     pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
-    return ldq_phys(pa);
+    return virtio_ldq_phys(pa);
 }
 
 static inline uint32_t vring_desc_len(hwaddr desc_pa, int i)
 {
     hwaddr pa;
     pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
-    return ldl_phys(pa);
+    return virtio_ldl_phys(pa);
 }
 
 static inline uint16_t vring_desc_flags(hwaddr desc_pa, int i)
 {
     hwaddr pa;
     pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
-    return lduw_phys(pa);
+    return virtio_lduw_phys(pa);
 }
 
 static inline uint16_t vring_desc_next(hwaddr desc_pa, int i)
 {
     hwaddr pa;
     pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
-    return lduw_phys(pa);
+    return virtio_lduw_phys(pa);
 }
 
 static inline uint16_t vring_avail_flags(VirtQueue *vq)
 {
     hwaddr pa;
     pa = vq->vring.avail + offsetof(VRingAvail, flags);
-    return lduw_phys(pa);
+    return virtio_lduw_phys(pa);
 }
 
 static inline uint16_t vring_avail_idx(VirtQueue *vq)
 {
     hwaddr pa;
     pa = vq->vring.avail + offsetof(VRingAvail, idx);
-    return lduw_phys(pa);
+    return virtio_lduw_phys(pa);
 }
 
 static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
 {
     hwaddr pa;
     pa = vq->vring.avail + offsetof(VRingAvail, ring[i]);
-    return lduw_phys(pa);
+    return virtio_lduw_phys(pa);
 }
 
 static inline uint16_t vring_used_event(VirtQueue *vq)
@@ -154,42 +169,42 @@  static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
 {
     hwaddr pa;
     pa = vq->vring.used + offsetof(VRingUsed, ring[i].id);
-    stl_phys(pa, val);
+    virtio_stl_phys(pa, val);
 }
 
 static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val)
 {
     hwaddr pa;
     pa = vq->vring.used + offsetof(VRingUsed, ring[i].len);
-    stl_phys(pa, val);
+    virtio_stl_phys(pa, val);
 }
 
 static uint16_t vring_used_idx(VirtQueue *vq)
 {
     hwaddr pa;
     pa = vq->vring.used + offsetof(VRingUsed, idx);
-    return lduw_phys(pa);
+    return virtio_lduw_phys(pa);
 }
 
 static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val)
 {
     hwaddr pa;
     pa = vq->vring.used + offsetof(VRingUsed, idx);
-    stw_phys(pa, val);
+    virtio_stw_phys(pa, val);
 }
 
 static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
 {
     hwaddr pa;
     pa = vq->vring.used + offsetof(VRingUsed, flags);
-    stw_phys(pa, lduw_phys(pa) | mask);
+    virtio_stw_phys(pa, virtio_lduw_phys(pa) | mask);
 }
 
 static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask)
 {
     hwaddr pa;
     pa = vq->vring.used + offsetof(VRingUsed, flags);
-    stw_phys(pa, lduw_phys(pa) & ~mask);
+    virtio_stw_phys(pa, virtio_lduw_phys(pa) & ~mask);
 }
 
 static inline void vring_avail_event(VirtQueue *vq, uint16_t val)
@@ -199,7 +214,7 @@  static inline void vring_avail_event(VirtQueue *vq, uint16_t val)
         return;
     }
     pa = vq->vring.used + offsetof(VRingUsed, ring[vq->vring.num]);
-    stw_phys(pa, val);
+    virtio_stw_phys(pa, val);
 }
 
 void virtio_queue_set_notification(VirtQueue *vq, int enable)
@@ -525,6 +540,9 @@  void virtio_set_status(VirtIODevice *vdev, uint8_t val)
     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
     trace_virtio_set_status(vdev, val);
 
+    /* If guest virtio endian is uncertain, set it now. */
+    mark_endian();
+
     if (k->set_status) {
         k->set_status(vdev, val);
     }
diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
new file mode 100644
index 0000000..b1d531e
--- /dev/null
+++ b/include/hw/virtio/virtio-access.h
@@ -0,0 +1,138 @@ 
+/*
+ * Virtio Accessor Support: In case your target can change endian.
+ *
+ * Copyright IBM, Corp. 2013
+ *
+ * Authors:
+ *  Rusty Russell   <rusty@au.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#ifndef _QEMU_VIRTIO_ACCESS_H
+#define _QEMU_VIRTIO_ACCESS_H
+
+#ifdef TARGET_VIRTIO_SWAPENDIAN
+/* Architectures which need biendian define this function. */
+extern bool virtio_swap_endian(void);
+
+extern bool virtio_byteswap;
+#else
+#define virtio_byteswap false
+#endif
+
+static inline uint16_t virtio_lduw_phys(hwaddr pa)
+{
+    if (virtio_byteswap) {
+        return bswap16(lduw_phys(pa));
+    }
+    return lduw_phys(pa);
+    
+}
+
+static inline uint32_t virtio_ldl_phys(hwaddr pa)
+{
+    if (virtio_byteswap) {
+        return bswap32(ldl_phys(pa));
+    }
+    return ldl_phys(pa);
+}
+
+static inline uint64_t virtio_ldq_phys(hwaddr pa)
+{
+    if (virtio_byteswap) {
+        return bswap64(ldq_phys(pa));
+    }
+    return ldq_phys(pa);
+}
+
+static inline void virtio_stw_phys(hwaddr pa, uint16_t value)
+{
+    if (virtio_byteswap) {
+        stw_phys(pa, bswap16(value));
+    } else {
+        stw_phys(pa, value);
+    }
+}
+
+static inline void virtio_stl_phys(hwaddr pa, uint32_t value)
+{
+    if (virtio_byteswap) {
+        stl_phys(pa, bswap32(value));
+    } else {
+        stl_phys(pa, value);
+    }
+}
+
+static inline void virtio_stw_p(void *ptr, uint16_t v)
+{
+    if (virtio_byteswap) {
+	stw_p(ptr, bswap16(v));
+    } else {
+	stw_p(ptr, v);
+    }
+}
+
+static inline void virtio_stl_p(void *ptr, uint32_t v)
+{
+    if (virtio_byteswap) {
+	stl_p(ptr, bswap32(v));
+    } else {
+	stl_p(ptr, v);
+    }
+}
+
+static inline void virtio_stq_p(void *ptr, uint64_t v)
+{
+    if (virtio_byteswap) {
+	stq_p(ptr, bswap64(v));
+    } else {
+	stq_p(ptr, v);
+    }
+}
+
+static inline int virtio_lduw_p(const void *ptr)
+{
+    if (virtio_byteswap) {
+	return bswap16(lduw_p(ptr));
+    } else {
+	return lduw_p(ptr);
+    }
+}
+
+static inline int virtio_ldl_p(const void *ptr)
+{
+    if (virtio_byteswap) {
+	return bswap32(ldl_p(ptr));
+    } else {
+	return ldl_p(ptr);
+    }
+}
+
+static inline uint64_t virtio_ldq_p(const void *ptr)
+{
+    if (virtio_byteswap) {
+	return bswap64(ldq_p(ptr));
+    } else {
+	return ldq_p(ptr);
+    }
+}
+
+static inline uint32_t virtio_tswap32(uint32_t s)
+{
+    if (virtio_byteswap) {
+	return bswap32(tswap32(s));
+    } else {
+	return tswap32(s);
+    }
+}
+
+static inline void virtio_tswap32s(uint32_t *s)
+{
+    tswap32s(s);
+    if (virtio_byteswap) {
+	*s = bswap32(*s);
+    }
+}
+#endif /* _QEMU_VIRTIO_ACCESS_H */