Patchwork [1/9] exec: add endian specific phys ld/st functions

login
register
mail settings
Submitter Alexander Graf
Date July 5, 2011, 4:28 p.m.
Message ID <1309883290-17595-2-git-send-email-agraf@suse.de>
Download mbox | patch
Permalink /patch/103348/
State New
Headers show

Comments

Alexander Graf - July 5, 2011, 4:28 p.m.
Device code some times needs to access physical memory and does that
through the ld./st._phys functions. However, these are the exact same
functions that the CPU uses to access memory, which means they will
be endianness swapped depending on the target CPU.

However, devices don't know about the CPU's endianness, but instead
access memory directly using their own interface to the memory bus,
so they need some way to read data with their native endianness.

This patch adds _le and _be functions to ld./st._phys.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 cpu-common.h |   12 +++++++
 exec.c       |  102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+), 0 deletions(-)
Blue Swirl - July 5, 2011, 9:48 p.m.
On Tue, Jul 5, 2011 at 7:28 PM, Alexander Graf <agraf@suse.de> wrote:
> Device code some times needs to access physical memory and does that
> through the ld./st._phys functions. However, these are the exact same
> functions that the CPU uses to access memory, which means they will
> be endianness swapped depending on the target CPU.
>
> However, devices don't know about the CPU's endianness, but instead
> access memory directly using their own interface to the memory bus,
> so they need some way to read data with their native endianness.
>
> This patch adds _le and _be functions to ld./st._phys.
>
> Signed-off-by: Alexander Graf <agraf@suse.de>
> ---
>  cpu-common.h |   12 +++++++
>  exec.c       |  102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 114 insertions(+), 0 deletions(-)
>
> diff --git a/cpu-common.h b/cpu-common.h
> index b027e43..c6a2b5f 100644
> --- a/cpu-common.h
> +++ b/cpu-common.h
> @@ -135,14 +135,26 @@ void qemu_flush_coalesced_mmio_buffer(void);
>
>  uint32_t ldub_phys(target_phys_addr_t addr);
>  uint32_t lduw_phys(target_phys_addr_t addr);
> +uint32_t lduw_le_phys(target_phys_addr_t addr);
> +uint32_t lduw_be_phys(target_phys_addr_t addr);
>  uint32_t ldl_phys(target_phys_addr_t addr);
> +uint32_t ldl_le_phys(target_phys_addr_t addr);
> +uint32_t ldl_be_phys(target_phys_addr_t addr);
>  uint64_t ldq_phys(target_phys_addr_t addr);
> +uint64_t ldq_le_phys(target_phys_addr_t addr);
> +uint64_t ldq_be_phys(target_phys_addr_t addr);
>  void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val);
>  void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val);
>  void stb_phys(target_phys_addr_t addr, uint32_t val);
>  void stw_phys(target_phys_addr_t addr, uint32_t val);
> +void stw_le_phys(target_phys_addr_t addr, uint32_t val);
> +void stw_be_phys(target_phys_addr_t addr, uint32_t val);
>  void stl_phys(target_phys_addr_t addr, uint32_t val);
> +void stl_le_phys(target_phys_addr_t addr, uint32_t val);
> +void stl_be_phys(target_phys_addr_t addr, uint32_t val);
>  void stq_phys(target_phys_addr_t addr, uint64_t val);
> +void stq_le_phys(target_phys_addr_t addr, uint64_t val);
> +void stq_be_phys(target_phys_addr_t addr, uint64_t val);
>
>  void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>                                    const uint8_t *buf, int len);
> diff --git a/exec.c b/exec.c
> index 4c45299..5f2f87e 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -4158,6 +4158,24 @@ uint32_t ldl_phys(target_phys_addr_t addr)
>     return val;
>  }
>
> +uint32_t ldl_le_phys(target_phys_addr_t addr)
> +{
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    return bswap32(ldl_phys(addr));

This would introduce a second bswap in some cases. Please make instead
two versions of ldl_phys which use ldl_le/be_p instead of ldl_p. Then
ldl_phys could be #defined to the suitable function.

BTW, these functions would need a serious cleanup, there's a lot code
duplication and comments about XXX: optimize.

> +#else
> +    return ldl_phys(addr);
> +#endif
> +}
> +
> +uint32_t ldl_be_phys(target_phys_addr_t addr)
> +{
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    return ldl_phys(addr);
> +#else
> +    return bswap32(ldl_phys(addr));
> +#endif
> +}
> +
>  /* warning: addr must be aligned */
>  uint64_t ldq_phys(target_phys_addr_t addr)
>  {
> @@ -4196,6 +4214,24 @@ uint64_t ldq_phys(target_phys_addr_t addr)
>     return val;
>  }
>
> +uint64_t ldq_le_phys(target_phys_addr_t addr)
> +{
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    return bswap64(ldq_phys(addr));
> +#else
> +    return ldq_phys(addr);
> +#endif
> +}
> +
> +uint64_t ldq_be_phys(target_phys_addr_t addr)
> +{
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    return ldq_phys(addr);
> +#else
> +    return bswap64(ldq_phys(addr));
> +#endif
> +}
> +
>  /* XXX: optimize */
>  uint32_t ldub_phys(target_phys_addr_t addr)
>  {
> @@ -4236,6 +4272,24 @@ uint32_t lduw_phys(target_phys_addr_t addr)
>     return val;
>  }
>
> +uint32_t lduw_le_phys(target_phys_addr_t addr)
> +{
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    return bswap16(lduw_phys(addr));
> +#else
> +    return lduw_phys(addr);
> +#endif
> +}
> +
> +uint32_t lduw_be_phys(target_phys_addr_t addr)
> +{
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    return lduw_phys(addr);
> +#else
> +    return bswap16(lduw_phys(addr));
> +#endif
> +}
> +
>  /* warning: addr must be aligned. The ram page is not masked as dirty
>    and the code inside is not invalidated. It is useful if the dirty
>    bits are used to track modified PTEs */
> @@ -4343,6 +4397,24 @@ void stl_phys(target_phys_addr_t addr, uint32_t val)
>     }
>  }
>
> +void stl_le_phys(target_phys_addr_t addr, uint32_t val)
> +{
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    return stl_phys(addr, bswap32(val));
> +#else
> +    return stl_phys(addr, val);
> +#endif
> +}
> +
> +void stl_be_phys(target_phys_addr_t addr, uint32_t val)
> +{
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    return stl_phys(addr, val);
> +#else
> +    return stl_phys(addr, bswap32(val));
> +#endif
> +}
> +
>  /* XXX: optimize */
>  void stb_phys(target_phys_addr_t addr, uint32_t val)
>  {
> @@ -4386,6 +4458,24 @@ void stw_phys(target_phys_addr_t addr, uint32_t val)
>     }
>  }
>
> +void stw_le_phys(target_phys_addr_t addr, uint32_t val)
> +{
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    return stw_phys(addr, bswap16(val));
> +#else
> +    return stw_phys(addr, val);
> +#endif
> +}
> +
> +void stw_be_phys(target_phys_addr_t addr, uint32_t val)
> +{
> +#if defined(TARGET_WORDS_BIGENDIAN)
> +    return stw_phys(addr, val);
> +#else
> +    return stw_phys(addr, bswap16(val));
> +#endif
> +}
> +
>  /* XXX: optimize */
>  void stq_phys(target_phys_addr_t addr, uint64_t val)
>  {
> @@ -4393,6 +4483,18 @@ void stq_phys(target_phys_addr_t addr, uint64_t val)
>     cpu_physical_memory_write(addr, &val, 8);
>  }
>
> +void stq_le_phys(target_phys_addr_t addr, uint64_t val)
> +{
> +    val = cpu_to_le64(val);
> +    cpu_physical_memory_write(addr, &val, 8);
> +}
> +
> +void stq_be_phys(target_phys_addr_t addr, uint64_t val)
> +{
> +    val = cpu_to_be64(val);
> +    cpu_physical_memory_write(addr, &val, 8);
> +}
> +
>  /* virtual memory access for debug (includes writing to ROM) */
>  int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
>                         uint8_t *buf, int len, int is_write)
> --
> 1.7.3.4
>
>
>
Alexander Graf - July 5, 2011, 9:55 p.m.
On 05.07.2011, at 23:48, Blue Swirl wrote:

> On Tue, Jul 5, 2011 at 7:28 PM, Alexander Graf <agraf@suse.de> wrote:
>> Device code some times needs to access physical memory and does that
>> through the ld./st._phys functions. However, these are the exact same
>> functions that the CPU uses to access memory, which means they will
>> be endianness swapped depending on the target CPU.
>> 
>> However, devices don't know about the CPU's endianness, but instead
>> access memory directly using their own interface to the memory bus,
>> so they need some way to read data with their native endianness.
>> 
>> This patch adds _le and _be functions to ld./st._phys.
>> 
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>> ---
>>  cpu-common.h |   12 +++++++
>>  exec.c       |  102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 114 insertions(+), 0 deletions(-)
>> 
>> diff --git a/cpu-common.h b/cpu-common.h
>> index b027e43..c6a2b5f 100644
>> --- a/cpu-common.h
>> +++ b/cpu-common.h
>> @@ -135,14 +135,26 @@ void qemu_flush_coalesced_mmio_buffer(void);
>> 
>>  uint32_t ldub_phys(target_phys_addr_t addr);
>>  uint32_t lduw_phys(target_phys_addr_t addr);
>> +uint32_t lduw_le_phys(target_phys_addr_t addr);
>> +uint32_t lduw_be_phys(target_phys_addr_t addr);
>>  uint32_t ldl_phys(target_phys_addr_t addr);
>> +uint32_t ldl_le_phys(target_phys_addr_t addr);
>> +uint32_t ldl_be_phys(target_phys_addr_t addr);
>>  uint64_t ldq_phys(target_phys_addr_t addr);
>> +uint64_t ldq_le_phys(target_phys_addr_t addr);
>> +uint64_t ldq_be_phys(target_phys_addr_t addr);
>>  void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val);
>>  void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val);
>>  void stb_phys(target_phys_addr_t addr, uint32_t val);
>>  void stw_phys(target_phys_addr_t addr, uint32_t val);
>> +void stw_le_phys(target_phys_addr_t addr, uint32_t val);
>> +void stw_be_phys(target_phys_addr_t addr, uint32_t val);
>>  void stl_phys(target_phys_addr_t addr, uint32_t val);
>> +void stl_le_phys(target_phys_addr_t addr, uint32_t val);
>> +void stl_be_phys(target_phys_addr_t addr, uint32_t val);
>>  void stq_phys(target_phys_addr_t addr, uint64_t val);
>> +void stq_le_phys(target_phys_addr_t addr, uint64_t val);
>> +void stq_be_phys(target_phys_addr_t addr, uint64_t val);
>> 
>>  void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>>                                    const uint8_t *buf, int len);
>> diff --git a/exec.c b/exec.c
>> index 4c45299..5f2f87e 100644
>> --- a/exec.c
>> +++ b/exec.c
>> @@ -4158,6 +4158,24 @@ uint32_t ldl_phys(target_phys_addr_t addr)
>>     return val;
>>  }
>> 
>> +uint32_t ldl_le_phys(target_phys_addr_t addr)
>> +{
>> +#if defined(TARGET_WORDS_BIGENDIAN)
>> +    return bswap32(ldl_phys(addr));
> 
> This would introduce a second bswap in some cases. Please make instead
> two versions of ldl_phys which use ldl_le/be_p instead of ldl_p. Then
> ldl_phys could be #defined to the suitable function.

Yeah, I was thinking to do that one at first and then realized how MMIO gets tricky, since we also need to potentially swap it again, as by now it returns values in target CPU endianness. But if you prefer, I can dig into that.

> 
> BTW, these functions would need a serious cleanup, there's a lot code
> duplication and comments about XXX: optimize.

Yes :). One thing at a time :).


Alex
Blue Swirl - July 5, 2011, 10:05 p.m.
On Wed, Jul 6, 2011 at 12:55 AM, Alexander Graf <agraf@suse.de> wrote:
>
> On 05.07.2011, at 23:48, Blue Swirl wrote:
>
>> On Tue, Jul 5, 2011 at 7:28 PM, Alexander Graf <agraf@suse.de> wrote:
>>> Device code some times needs to access physical memory and does that
>>> through the ld./st._phys functions. However, these are the exact same
>>> functions that the CPU uses to access memory, which means they will
>>> be endianness swapped depending on the target CPU.
>>>
>>> However, devices don't know about the CPU's endianness, but instead
>>> access memory directly using their own interface to the memory bus,
>>> so they need some way to read data with their native endianness.
>>>
>>> This patch adds _le and _be functions to ld./st._phys.
>>>
>>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>> ---
>>>  cpu-common.h |   12 +++++++
>>>  exec.c       |  102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 114 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/cpu-common.h b/cpu-common.h
>>> index b027e43..c6a2b5f 100644
>>> --- a/cpu-common.h
>>> +++ b/cpu-common.h
>>> @@ -135,14 +135,26 @@ void qemu_flush_coalesced_mmio_buffer(void);
>>>
>>>  uint32_t ldub_phys(target_phys_addr_t addr);
>>>  uint32_t lduw_phys(target_phys_addr_t addr);
>>> +uint32_t lduw_le_phys(target_phys_addr_t addr);
>>> +uint32_t lduw_be_phys(target_phys_addr_t addr);
>>>  uint32_t ldl_phys(target_phys_addr_t addr);
>>> +uint32_t ldl_le_phys(target_phys_addr_t addr);
>>> +uint32_t ldl_be_phys(target_phys_addr_t addr);
>>>  uint64_t ldq_phys(target_phys_addr_t addr);
>>> +uint64_t ldq_le_phys(target_phys_addr_t addr);
>>> +uint64_t ldq_be_phys(target_phys_addr_t addr);
>>>  void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val);
>>>  void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val);
>>>  void stb_phys(target_phys_addr_t addr, uint32_t val);
>>>  void stw_phys(target_phys_addr_t addr, uint32_t val);
>>> +void stw_le_phys(target_phys_addr_t addr, uint32_t val);
>>> +void stw_be_phys(target_phys_addr_t addr, uint32_t val);
>>>  void stl_phys(target_phys_addr_t addr, uint32_t val);
>>> +void stl_le_phys(target_phys_addr_t addr, uint32_t val);
>>> +void stl_be_phys(target_phys_addr_t addr, uint32_t val);
>>>  void stq_phys(target_phys_addr_t addr, uint64_t val);
>>> +void stq_le_phys(target_phys_addr_t addr, uint64_t val);
>>> +void stq_be_phys(target_phys_addr_t addr, uint64_t val);
>>>
>>>  void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>>>                                    const uint8_t *buf, int len);
>>> diff --git a/exec.c b/exec.c
>>> index 4c45299..5f2f87e 100644
>>> --- a/exec.c
>>> +++ b/exec.c
>>> @@ -4158,6 +4158,24 @@ uint32_t ldl_phys(target_phys_addr_t addr)
>>>     return val;
>>>  }
>>>
>>> +uint32_t ldl_le_phys(target_phys_addr_t addr)
>>> +{
>>> +#if defined(TARGET_WORDS_BIGENDIAN)
>>> +    return bswap32(ldl_phys(addr));
>>
>> This would introduce a second bswap in some cases. Please make instead
>> two versions of ldl_phys which use ldl_le/be_p instead of ldl_p. Then
>> ldl_phys could be #defined to the suitable function.
>
> Yeah, I was thinking to do that one at first and then realized how MMIO gets tricky, since we also need to potentially swap it again, as by now it returns values in target CPU endianness. But if you prefer, I can dig into that.

Maybe the swapendian thing could be adjusted so that it's possible to
access the device in either LE or BE way directly? For example, there
could be two io_mem_read/write tables, the current one for CPU and
another one for device MMIO with known endianness. Or even three
tables: CPU, BE, LE.

>>
>> BTW, these functions would need a serious cleanup, there's a lot code
>> duplication and comments about XXX: optimize.
>
> Yes :). One thing at a time :).
>
>
> Alex
>
>
Alexander Graf - July 5, 2011, 10:13 p.m.
On 06.07.2011, at 00:05, Blue Swirl wrote:

> On Wed, Jul 6, 2011 at 12:55 AM, Alexander Graf <agraf@suse.de> wrote:
>> 
>> On 05.07.2011, at 23:48, Blue Swirl wrote:
>> 
>>> On Tue, Jul 5, 2011 at 7:28 PM, Alexander Graf <agraf@suse.de> wrote:
>>>> Device code some times needs to access physical memory and does that
>>>> through the ld./st._phys functions. However, these are the exact same
>>>> functions that the CPU uses to access memory, which means they will
>>>> be endianness swapped depending on the target CPU.
>>>> 
>>>> However, devices don't know about the CPU's endianness, but instead
>>>> access memory directly using their own interface to the memory bus,
>>>> so they need some way to read data with their native endianness.
>>>> 
>>>> This patch adds _le and _be functions to ld./st._phys.
>>>> 
>>>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>>> ---
>>>>  cpu-common.h |   12 +++++++
>>>>  exec.c       |  102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  2 files changed, 114 insertions(+), 0 deletions(-)
>>>> 
>>>> diff --git a/cpu-common.h b/cpu-common.h
>>>> index b027e43..c6a2b5f 100644
>>>> --- a/cpu-common.h
>>>> +++ b/cpu-common.h
>>>> @@ -135,14 +135,26 @@ void qemu_flush_coalesced_mmio_buffer(void);
>>>> 
>>>>  uint32_t ldub_phys(target_phys_addr_t addr);
>>>>  uint32_t lduw_phys(target_phys_addr_t addr);
>>>> +uint32_t lduw_le_phys(target_phys_addr_t addr);
>>>> +uint32_t lduw_be_phys(target_phys_addr_t addr);
>>>>  uint32_t ldl_phys(target_phys_addr_t addr);
>>>> +uint32_t ldl_le_phys(target_phys_addr_t addr);
>>>> +uint32_t ldl_be_phys(target_phys_addr_t addr);
>>>>  uint64_t ldq_phys(target_phys_addr_t addr);
>>>> +uint64_t ldq_le_phys(target_phys_addr_t addr);
>>>> +uint64_t ldq_be_phys(target_phys_addr_t addr);
>>>>  void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val);
>>>>  void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val);
>>>>  void stb_phys(target_phys_addr_t addr, uint32_t val);
>>>>  void stw_phys(target_phys_addr_t addr, uint32_t val);
>>>> +void stw_le_phys(target_phys_addr_t addr, uint32_t val);
>>>> +void stw_be_phys(target_phys_addr_t addr, uint32_t val);
>>>>  void stl_phys(target_phys_addr_t addr, uint32_t val);
>>>> +void stl_le_phys(target_phys_addr_t addr, uint32_t val);
>>>> +void stl_be_phys(target_phys_addr_t addr, uint32_t val);
>>>>  void stq_phys(target_phys_addr_t addr, uint64_t val);
>>>> +void stq_le_phys(target_phys_addr_t addr, uint64_t val);
>>>> +void stq_be_phys(target_phys_addr_t addr, uint64_t val);
>>>> 
>>>>  void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>>>>                                    const uint8_t *buf, int len);
>>>> diff --git a/exec.c b/exec.c
>>>> index 4c45299..5f2f87e 100644
>>>> --- a/exec.c
>>>> +++ b/exec.c
>>>> @@ -4158,6 +4158,24 @@ uint32_t ldl_phys(target_phys_addr_t addr)
>>>>     return val;
>>>>  }
>>>> 
>>>> +uint32_t ldl_le_phys(target_phys_addr_t addr)
>>>> +{
>>>> +#if defined(TARGET_WORDS_BIGENDIAN)
>>>> +    return bswap32(ldl_phys(addr));
>>> 
>>> This would introduce a second bswap in some cases. Please make instead
>>> two versions of ldl_phys which use ldl_le/be_p instead of ldl_p. Then
>>> ldl_phys could be #defined to the suitable function.
>> 
>> Yeah, I was thinking to do that one at first and then realized how MMIO gets tricky, since we also need to potentially swap it again, as by now it returns values in target CPU endianness. But if you prefer, I can dig into that.
> 
> Maybe the swapendian thing could be adjusted so that it's possible to
> access the device in either LE or BE way directly? For example, there
> could be two io_mem_read/write tables, the current one for CPU and
> another one for device MMIO with known endianness. Or even three
> tables: CPU, BE, LE.

If you take a look at the patches following this one, you can easily see how few devices actually use these functions. I don't think the additional memory usage would count up for a couple of byte swaps here really.

We could however try to be clever and directly check if the device mmio is a swapendian function and just bypass it. I'm just not sure it's worth the additional overhead for the non-swapped case (which should be the normal one).


Alex
Blue Swirl - July 5, 2011, 10:22 p.m.
On Wed, Jul 6, 2011 at 1:13 AM, Alexander Graf <agraf@suse.de> wrote:
>
> On 06.07.2011, at 00:05, Blue Swirl wrote:
>
>> On Wed, Jul 6, 2011 at 12:55 AM, Alexander Graf <agraf@suse.de> wrote:
>>>
>>> On 05.07.2011, at 23:48, Blue Swirl wrote:
>>>
>>>> On Tue, Jul 5, 2011 at 7:28 PM, Alexander Graf <agraf@suse.de> wrote:
>>>>> Device code some times needs to access physical memory and does that
>>>>> through the ld./st._phys functions. However, these are the exact same
>>>>> functions that the CPU uses to access memory, which means they will
>>>>> be endianness swapped depending on the target CPU.
>>>>>
>>>>> However, devices don't know about the CPU's endianness, but instead
>>>>> access memory directly using their own interface to the memory bus,
>>>>> so they need some way to read data with their native endianness.
>>>>>
>>>>> This patch adds _le and _be functions to ld./st._phys.
>>>>>
>>>>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>>>> ---
>>>>>  cpu-common.h |   12 +++++++
>>>>>  exec.c       |  102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  2 files changed, 114 insertions(+), 0 deletions(-)
>>>>>
>>>>> diff --git a/cpu-common.h b/cpu-common.h
>>>>> index b027e43..c6a2b5f 100644
>>>>> --- a/cpu-common.h
>>>>> +++ b/cpu-common.h
>>>>> @@ -135,14 +135,26 @@ void qemu_flush_coalesced_mmio_buffer(void);
>>>>>
>>>>>  uint32_t ldub_phys(target_phys_addr_t addr);
>>>>>  uint32_t lduw_phys(target_phys_addr_t addr);
>>>>> +uint32_t lduw_le_phys(target_phys_addr_t addr);
>>>>> +uint32_t lduw_be_phys(target_phys_addr_t addr);
>>>>>  uint32_t ldl_phys(target_phys_addr_t addr);
>>>>> +uint32_t ldl_le_phys(target_phys_addr_t addr);
>>>>> +uint32_t ldl_be_phys(target_phys_addr_t addr);
>>>>>  uint64_t ldq_phys(target_phys_addr_t addr);
>>>>> +uint64_t ldq_le_phys(target_phys_addr_t addr);
>>>>> +uint64_t ldq_be_phys(target_phys_addr_t addr);
>>>>>  void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val);
>>>>>  void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val);
>>>>>  void stb_phys(target_phys_addr_t addr, uint32_t val);
>>>>>  void stw_phys(target_phys_addr_t addr, uint32_t val);
>>>>> +void stw_le_phys(target_phys_addr_t addr, uint32_t val);
>>>>> +void stw_be_phys(target_phys_addr_t addr, uint32_t val);
>>>>>  void stl_phys(target_phys_addr_t addr, uint32_t val);
>>>>> +void stl_le_phys(target_phys_addr_t addr, uint32_t val);
>>>>> +void stl_be_phys(target_phys_addr_t addr, uint32_t val);
>>>>>  void stq_phys(target_phys_addr_t addr, uint64_t val);
>>>>> +void stq_le_phys(target_phys_addr_t addr, uint64_t val);
>>>>> +void stq_be_phys(target_phys_addr_t addr, uint64_t val);
>>>>>
>>>>>  void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>>>>>                                    const uint8_t *buf, int len);
>>>>> diff --git a/exec.c b/exec.c
>>>>> index 4c45299..5f2f87e 100644
>>>>> --- a/exec.c
>>>>> +++ b/exec.c
>>>>> @@ -4158,6 +4158,24 @@ uint32_t ldl_phys(target_phys_addr_t addr)
>>>>>     return val;
>>>>>  }
>>>>>
>>>>> +uint32_t ldl_le_phys(target_phys_addr_t addr)
>>>>> +{
>>>>> +#if defined(TARGET_WORDS_BIGENDIAN)
>>>>> +    return bswap32(ldl_phys(addr));
>>>>
>>>> This would introduce a second bswap in some cases. Please make instead
>>>> two versions of ldl_phys which use ldl_le/be_p instead of ldl_p. Then
>>>> ldl_phys could be #defined to the suitable function.
>>>
>>> Yeah, I was thinking to do that one at first and then realized how MMIO gets tricky, since we also need to potentially swap it again, as by now it returns values in target CPU endianness. But if you prefer, I can dig into that.
>>
>> Maybe the swapendian thing could be adjusted so that it's possible to
>> access the device in either LE or BE way directly? For example, there
>> could be two io_mem_read/write tables, the current one for CPU and
>> another one for device MMIO with known endianness. Or even three
>> tables: CPU, BE, LE.
>
> If you take a look at the patches following this one, you can easily see how few devices actually use these functions. I don't think the additional memory usage would count up for a couple of byte swaps here really.
>
> We could however try to be clever and directly check if the device mmio is a swapendian function and just bypass it. I'm just not sure it's worth the additional overhead for the non-swapped case (which should be the normal one).

Neither seems to be very attractive option. It may be enough to
optimize just RAM accesses and forget about extra bswap for MMIO for
now.

Patch

diff --git a/cpu-common.h b/cpu-common.h
index b027e43..c6a2b5f 100644
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -135,14 +135,26 @@  void qemu_flush_coalesced_mmio_buffer(void);
 
 uint32_t ldub_phys(target_phys_addr_t addr);
 uint32_t lduw_phys(target_phys_addr_t addr);
+uint32_t lduw_le_phys(target_phys_addr_t addr);
+uint32_t lduw_be_phys(target_phys_addr_t addr);
 uint32_t ldl_phys(target_phys_addr_t addr);
+uint32_t ldl_le_phys(target_phys_addr_t addr);
+uint32_t ldl_be_phys(target_phys_addr_t addr);
 uint64_t ldq_phys(target_phys_addr_t addr);
+uint64_t ldq_le_phys(target_phys_addr_t addr);
+uint64_t ldq_be_phys(target_phys_addr_t addr);
 void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val);
 void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val);
 void stb_phys(target_phys_addr_t addr, uint32_t val);
 void stw_phys(target_phys_addr_t addr, uint32_t val);
+void stw_le_phys(target_phys_addr_t addr, uint32_t val);
+void stw_be_phys(target_phys_addr_t addr, uint32_t val);
 void stl_phys(target_phys_addr_t addr, uint32_t val);
+void stl_le_phys(target_phys_addr_t addr, uint32_t val);
+void stl_be_phys(target_phys_addr_t addr, uint32_t val);
 void stq_phys(target_phys_addr_t addr, uint64_t val);
+void stq_le_phys(target_phys_addr_t addr, uint64_t val);
+void stq_be_phys(target_phys_addr_t addr, uint64_t val);
 
 void cpu_physical_memory_write_rom(target_phys_addr_t addr,
                                    const uint8_t *buf, int len);
diff --git a/exec.c b/exec.c
index 4c45299..5f2f87e 100644
--- a/exec.c
+++ b/exec.c
@@ -4158,6 +4158,24 @@  uint32_t ldl_phys(target_phys_addr_t addr)
     return val;
 }
 
+uint32_t ldl_le_phys(target_phys_addr_t addr)
+{
+#if defined(TARGET_WORDS_BIGENDIAN)
+    return bswap32(ldl_phys(addr));
+#else
+    return ldl_phys(addr);
+#endif
+}
+
+uint32_t ldl_be_phys(target_phys_addr_t addr)
+{
+#if defined(TARGET_WORDS_BIGENDIAN)
+    return ldl_phys(addr);
+#else
+    return bswap32(ldl_phys(addr));
+#endif
+}
+
 /* warning: addr must be aligned */
 uint64_t ldq_phys(target_phys_addr_t addr)
 {
@@ -4196,6 +4214,24 @@  uint64_t ldq_phys(target_phys_addr_t addr)
     return val;
 }
 
+uint64_t ldq_le_phys(target_phys_addr_t addr)
+{
+#if defined(TARGET_WORDS_BIGENDIAN)
+    return bswap64(ldq_phys(addr));
+#else
+    return ldq_phys(addr);
+#endif
+}
+
+uint64_t ldq_be_phys(target_phys_addr_t addr)
+{
+#if defined(TARGET_WORDS_BIGENDIAN)
+    return ldq_phys(addr);
+#else
+    return bswap64(ldq_phys(addr));
+#endif
+}
+
 /* XXX: optimize */
 uint32_t ldub_phys(target_phys_addr_t addr)
 {
@@ -4236,6 +4272,24 @@  uint32_t lduw_phys(target_phys_addr_t addr)
     return val;
 }
 
+uint32_t lduw_le_phys(target_phys_addr_t addr)
+{
+#if defined(TARGET_WORDS_BIGENDIAN)
+    return bswap16(lduw_phys(addr));
+#else
+    return lduw_phys(addr);
+#endif
+}
+
+uint32_t lduw_be_phys(target_phys_addr_t addr)
+{
+#if defined(TARGET_WORDS_BIGENDIAN)
+    return lduw_phys(addr);
+#else
+    return bswap16(lduw_phys(addr));
+#endif
+}
+
 /* warning: addr must be aligned. The ram page is not masked as dirty
    and the code inside is not invalidated. It is useful if the dirty
    bits are used to track modified PTEs */
@@ -4343,6 +4397,24 @@  void stl_phys(target_phys_addr_t addr, uint32_t val)
     }
 }
 
+void stl_le_phys(target_phys_addr_t addr, uint32_t val)
+{
+#if defined(TARGET_WORDS_BIGENDIAN)
+    return stl_phys(addr, bswap32(val));
+#else
+    return stl_phys(addr, val);
+#endif
+}
+
+void stl_be_phys(target_phys_addr_t addr, uint32_t val)
+{
+#if defined(TARGET_WORDS_BIGENDIAN)
+    return stl_phys(addr, val);
+#else
+    return stl_phys(addr, bswap32(val));
+#endif
+}
+
 /* XXX: optimize */
 void stb_phys(target_phys_addr_t addr, uint32_t val)
 {
@@ -4386,6 +4458,24 @@  void stw_phys(target_phys_addr_t addr, uint32_t val)
     }
 }
 
+void stw_le_phys(target_phys_addr_t addr, uint32_t val)
+{
+#if defined(TARGET_WORDS_BIGENDIAN)
+    return stw_phys(addr, bswap16(val));
+#else
+    return stw_phys(addr, val);
+#endif
+}
+
+void stw_be_phys(target_phys_addr_t addr, uint32_t val)
+{
+#if defined(TARGET_WORDS_BIGENDIAN)
+    return stw_phys(addr, val);
+#else
+    return stw_phys(addr, bswap16(val));
+#endif
+}
+
 /* XXX: optimize */
 void stq_phys(target_phys_addr_t addr, uint64_t val)
 {
@@ -4393,6 +4483,18 @@  void stq_phys(target_phys_addr_t addr, uint64_t val)
     cpu_physical_memory_write(addr, &val, 8);
 }
 
+void stq_le_phys(target_phys_addr_t addr, uint64_t val)
+{
+    val = cpu_to_le64(val);
+    cpu_physical_memory_write(addr, &val, 8);
+}
+
+void stq_be_phys(target_phys_addr_t addr, uint64_t val)
+{
+    val = cpu_to_be64(val);
+    cpu_physical_memory_write(addr, &val, 8);
+}
+
 /* virtual memory access for debug (includes writing to ROM) */
 int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
                         uint8_t *buf, int len, int is_write)