Patchwork [v3,11/14] ioport: Switch dispatching to memory core layer

login
register
mail settings
Submitter Jan Kiszka
Date June 22, 2013, 6:07 a.m.
Message ID <a42b019eee5a56e024379d2f98d55aacdd371b41.1371881222.git.jan.kiszka@web.de>
Download mbox | patch
Permalink /patch/253349/
State New
Headers show

Comments

Jan Kiszka - June 22, 2013, 6:07 a.m.
From: Jan Kiszka <jan.kiszka@siemens.com>

The current ioport dispatcher is a complex beast, mostly due to the
need to deal with old portio interface users. But we can overcome it
without converting all portio users by embedding the required base
address of a MemoryRegionPortio access into that data structure. That
removes the need to have the additional MemoryRegionIORange structure
in the loop on every access.

To handle old portio memory ops, we simply install dispatching handlers
for portio memory regions when registering them with the memory core.
This removes the need for the old_portio field.

We can drop the additional aliasing of ioport regions and also the
special address space listener. cpu_in and cpu_out now simply call
address_space_read/write. And we can concentrate portio handling in a
single source file.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 exec.c                         |   27 --------
 include/exec/ioport.h          |    1 -
 include/exec/memory-internal.h |    2 -
 include/exec/memory.h          |    5 +-
 ioport.c                       |  137 +++++++++++++++++++++++++++++-----------
 memory.c                       |   88 -------------------------
 6 files changed, 101 insertions(+), 159 deletions(-)
Hervé Poussineau - June 23, 2013, 8:50 p.m.
Jan Kiszka a écrit :
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> The current ioport dispatcher is a complex beast, mostly due to the
> need to deal with old portio interface users. But we can overcome it
> without converting all portio users by embedding the required base
> address of a MemoryRegionPortio access into that data structure. That
> removes the need to have the additional MemoryRegionIORange structure
> in the loop on every access.
> 
> To handle old portio memory ops, we simply install dispatching handlers
> for portio memory regions when registering them with the memory core.
> This removes the need for the old_portio field.
> 
> We can drop the additional aliasing of ioport regions and also the
> special address space listener. cpu_in and cpu_out now simply call
> address_space_read/write. And we can concentrate portio handling in a
> single source file.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---

...

> +
> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
> +                         unsigned size)
> +{
> +    MemoryRegionPortioList *mrpio = opaque;
> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size, true);
> +
> +    if (mrp) {
> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
> +    } else if (size == 2) {
> +        mrp = find_portio(mrpio, addr, 1, true);
> +        assert(mrp);
> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data >> 8);
> +    }
> +}
> +
> +static const MemoryRegionOps portio_ops = {
> +    .read = portio_read,
> +    .write = portio_write,
> +    .valid.unaligned = true,
> +    .impl.unaligned = true,
> +};
> +

You need to mark these operations as DEVICE_LITTLE_ENDIAN.
In portio_write above, you clearly assume that data is in LE format.

This fixes PPC PReP emulation, which would otherwise be broken with this 
patchset.

Hervé
Jan Kiszka - June 24, 2013, 6:07 a.m.
On 2013-06-23 22:50, Hervé Poussineau wrote:
> Jan Kiszka a écrit :
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> The current ioport dispatcher is a complex beast, mostly due to the
>> need to deal with old portio interface users. But we can overcome it
>> without converting all portio users by embedding the required base
>> address of a MemoryRegionPortio access into that data structure. That
>> removes the need to have the additional MemoryRegionIORange structure
>> in the loop on every access.
>>
>> To handle old portio memory ops, we simply install dispatching handlers
>> for portio memory regions when registering them with the memory core.
>> This removes the need for the old_portio field.
>>
>> We can drop the additional aliasing of ioport regions and also the
>> special address space listener. cpu_in and cpu_out now simply call
>> address_space_read/write. And we can concentrate portio handling in a
>> single source file.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
> 
> ...
> 
>> +
>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>> +                         unsigned size)
>> +{
>> +    MemoryRegionPortioList *mrpio = opaque;
>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>> true);
>> +
>> +    if (mrp) {
>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>> +    } else if (size == 2) {
>> +        mrp = find_portio(mrpio, addr, 1, true);
>> +        assert(mrp);
>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>> >> 8);
>> +    }
>> +}
>> +
>> +static const MemoryRegionOps portio_ops = {
>> +    .read = portio_read,
>> +    .write = portio_write,
>> +    .valid.unaligned = true,
>> +    .impl.unaligned = true,
>> +};
>> +
> 
> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
> In portio_write above, you clearly assume that data is in LE format.

Anything behind PIO is little endian, of course. Will add this.

> 
> This fixes PPC PReP emulation, which would otherwise be broken with this
> patchset.

Thanks,
Jan
Alexander Graf - July 11, 2013, 12:29 p.m.
On 24.06.2013, at 08:07, Jan Kiszka wrote:

> On 2013-06-23 22:50, Hervé Poussineau wrote:
>> Jan Kiszka a écrit :
>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>> 
>>> The current ioport dispatcher is a complex beast, mostly due to the
>>> need to deal with old portio interface users. But we can overcome it
>>> without converting all portio users by embedding the required base
>>> address of a MemoryRegionPortio access into that data structure. That
>>> removes the need to have the additional MemoryRegionIORange structure
>>> in the loop on every access.
>>> 
>>> To handle old portio memory ops, we simply install dispatching handlers
>>> for portio memory regions when registering them with the memory core.
>>> This removes the need for the old_portio field.
>>> 
>>> We can drop the additional aliasing of ioport regions and also the
>>> special address space listener. cpu_in and cpu_out now simply call
>>> address_space_read/write. And we can concentrate portio handling in a
>>> single source file.
>>> 
>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>> ---
>> 
>> ...
>> 
>>> +
>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>> +                         unsigned size)
>>> +{
>>> +    MemoryRegionPortioList *mrpio = opaque;
>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>> true);
>>> +
>>> +    if (mrp) {
>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>> +    } else if (size == 2) {
>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>> +        assert(mrp);
>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>> 8);
>>> +    }
>>> +}
>>> +
>>> +static const MemoryRegionOps portio_ops = {
>>> +    .read = portio_read,
>>> +    .write = portio_write,
>>> +    .valid.unaligned = true,
>>> +    .impl.unaligned = true,
>>> +};
>>> +
>> 
>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>> In portio_write above, you clearly assume that data is in LE format.
> 
> Anything behind PIO is little endian, of course. Will add this.

This patch breaks VGA on PPC as it is in master today.


Alex

> 
>> 
>> This fixes PPC PReP emulation, which would otherwise be broken with this
>> patchset.
> 
> Thanks,
> Jan
> 
>
Alexander Graf - July 11, 2013, 12:34 p.m.
On 11.07.2013, at 14:29, Alexander Graf wrote:

> 
> On 24.06.2013, at 08:07, Jan Kiszka wrote:
> 
>> On 2013-06-23 22:50, Hervé Poussineau wrote:
>>> Jan Kiszka a écrit :
>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>> 
>>>> The current ioport dispatcher is a complex beast, mostly due to the
>>>> need to deal with old portio interface users. But we can overcome it
>>>> without converting all portio users by embedding the required base
>>>> address of a MemoryRegionPortio access into that data structure. That
>>>> removes the need to have the additional MemoryRegionIORange structure
>>>> in the loop on every access.
>>>> 
>>>> To handle old portio memory ops, we simply install dispatching handlers
>>>> for portio memory regions when registering them with the memory core.
>>>> This removes the need for the old_portio field.
>>>> 
>>>> We can drop the additional aliasing of ioport regions and also the
>>>> special address space listener. cpu_in and cpu_out now simply call
>>>> address_space_read/write. And we can concentrate portio handling in a
>>>> single source file.
>>>> 
>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>> ---
>>> 
>>> ...
>>> 
>>>> +
>>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>>> +                         unsigned size)
>>>> +{
>>>> +    MemoryRegionPortioList *mrpio = opaque;
>>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>>> true);
>>>> +
>>>> +    if (mrp) {
>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>>> +    } else if (size == 2) {
>>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>>> +        assert(mrp);
>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>>> 8);
>>>> +    }
>>>> +}
>>>> +
>>>> +static const MemoryRegionOps portio_ops = {
>>>> +    .read = portio_read,
>>>> +    .write = portio_write,
>>>> +    .valid.unaligned = true,
>>>> +    .impl.unaligned = true,
>>>> +};
>>>> +
>>> 
>>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>>> In portio_write above, you clearly assume that data is in LE format.
>> 
>> Anything behind PIO is little endian, of course. Will add this.
> 
> This patch breaks VGA on PPC as it is in master today.

If I don't mark portio as little endian it works as expected. There's probably someone swapping things twice.


Alex
Andreas Färber - July 11, 2013, 12:46 p.m.
Am 11.07.2013 14:34, schrieb Alexander Graf:
> 
> On 11.07.2013, at 14:29, Alexander Graf wrote:
> 
>>
>> On 24.06.2013, at 08:07, Jan Kiszka wrote:
>>
>>> On 2013-06-23 22:50, Hervé Poussineau wrote:
>>>> Jan Kiszka a écrit :
>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>
>>>>> The current ioport dispatcher is a complex beast, mostly due to the
>>>>> need to deal with old portio interface users. But we can overcome it
>>>>> without converting all portio users by embedding the required base
>>>>> address of a MemoryRegionPortio access into that data structure. That
>>>>> removes the need to have the additional MemoryRegionIORange structure
>>>>> in the loop on every access.
>>>>>
>>>>> To handle old portio memory ops, we simply install dispatching handlers
>>>>> for portio memory regions when registering them with the memory core.
>>>>> This removes the need for the old_portio field.
>>>>>
>>>>> We can drop the additional aliasing of ioport regions and also the
>>>>> special address space listener. cpu_in and cpu_out now simply call
>>>>> address_space_read/write. And we can concentrate portio handling in a
>>>>> single source file.
>>>>>
>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>> ---
>>>>
>>>> ...
>>>>
>>>>> +
>>>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>>>> +                         unsigned size)
>>>>> +{
>>>>> +    MemoryRegionPortioList *mrpio = opaque;
>>>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>>>> true);
>>>>> +
>>>>> +    if (mrp) {
>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>>>> +    } else if (size == 2) {
>>>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>>>> +        assert(mrp);
>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>>>> 8);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>> +static const MemoryRegionOps portio_ops = {
>>>>> +    .read = portio_read,
>>>>> +    .write = portio_write,
>>>>> +    .valid.unaligned = true,
>>>>> +    .impl.unaligned = true,
>>>>> +};
>>>>> +
>>>>
>>>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>>>> In portio_write above, you clearly assume that data is in LE format.
>>>
>>> Anything behind PIO is little endian, of course. Will add this.
>>
>> This patch breaks VGA on PPC as it is in master today.
> 
> If I don't mark portio as little endian it works as expected. There's probably someone swapping things twice.

sPAPR has its MemoryRegion marked Little Endian:

http://git.qemu.org/?p=qemu.git;a=blobdiff;f=hw/spapr_pci.c;h=a08ed11166595bdc493065beb64d4ce5b7b0dded;hp=c2c3079d21d5be2647faf85a8c608ac995d2ca62;hb=a3cfa18eb075c7ef78358ca1956fe7b01caa1724;hpb=286d52ebfc0d0d53c2a878e454292fea14bad41b

Possibly we can now apply Hervé's patches on top to remove that hack again?

Andreas
Alexander Graf - July 11, 2013, 12:48 p.m.
On 11.07.2013, at 14:46, Andreas Färber wrote:

> Am 11.07.2013 14:34, schrieb Alexander Graf:
>> 
>> On 11.07.2013, at 14:29, Alexander Graf wrote:
>> 
>>> 
>>> On 24.06.2013, at 08:07, Jan Kiszka wrote:
>>> 
>>>> On 2013-06-23 22:50, Hervé Poussineau wrote:
>>>>> Jan Kiszka a écrit :
>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>> 
>>>>>> The current ioport dispatcher is a complex beast, mostly due to the
>>>>>> need to deal with old portio interface users. But we can overcome it
>>>>>> without converting all portio users by embedding the required base
>>>>>> address of a MemoryRegionPortio access into that data structure. That
>>>>>> removes the need to have the additional MemoryRegionIORange structure
>>>>>> in the loop on every access.
>>>>>> 
>>>>>> To handle old portio memory ops, we simply install dispatching handlers
>>>>>> for portio memory regions when registering them with the memory core.
>>>>>> This removes the need for the old_portio field.
>>>>>> 
>>>>>> We can drop the additional aliasing of ioport regions and also the
>>>>>> special address space listener. cpu_in and cpu_out now simply call
>>>>>> address_space_read/write. And we can concentrate portio handling in a
>>>>>> single source file.
>>>>>> 
>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>> ---
>>>>> 
>>>>> ...
>>>>> 
>>>>>> +
>>>>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>>>>> +                         unsigned size)
>>>>>> +{
>>>>>> +    MemoryRegionPortioList *mrpio = opaque;
>>>>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>>>>> true);
>>>>>> +
>>>>>> +    if (mrp) {
>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>>>>> +    } else if (size == 2) {
>>>>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>>>>> +        assert(mrp);
>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>>>>> 8);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +static const MemoryRegionOps portio_ops = {
>>>>>> +    .read = portio_read,
>>>>>> +    .write = portio_write,
>>>>>> +    .valid.unaligned = true,
>>>>>> +    .impl.unaligned = true,
>>>>>> +};
>>>>>> +
>>>>> 
>>>>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>>>>> In portio_write above, you clearly assume that data is in LE format.
>>>> 
>>>> Anything behind PIO is little endian, of course. Will add this.
>>> 
>>> This patch breaks VGA on PPC as it is in master today.
>> 
>> If I don't mark portio as little endian it works as expected. There's probably someone swapping things twice.
> 
> sPAPR has its MemoryRegion marked Little Endian:

sPAPR works, it's only the Mac machines that break.


Apex
Alexander Graf - July 11, 2013, 1:28 p.m.
On 11.07.2013, at 14:48, Alexander Graf wrote:

> 
> On 11.07.2013, at 14:46, Andreas Färber wrote:
> 
>> Am 11.07.2013 14:34, schrieb Alexander Graf:
>>> 
>>> On 11.07.2013, at 14:29, Alexander Graf wrote:
>>> 
>>>> 
>>>> On 24.06.2013, at 08:07, Jan Kiszka wrote:
>>>> 
>>>>> On 2013-06-23 22:50, Hervé Poussineau wrote:
>>>>>> Jan Kiszka a écrit :
>>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>> 
>>>>>>> The current ioport dispatcher is a complex beast, mostly due to the
>>>>>>> need to deal with old portio interface users. But we can overcome it
>>>>>>> without converting all portio users by embedding the required base
>>>>>>> address of a MemoryRegionPortio access into that data structure. That
>>>>>>> removes the need to have the additional MemoryRegionIORange structure
>>>>>>> in the loop on every access.
>>>>>>> 
>>>>>>> To handle old portio memory ops, we simply install dispatching handlers
>>>>>>> for portio memory regions when registering them with the memory core.
>>>>>>> This removes the need for the old_portio field.
>>>>>>> 
>>>>>>> We can drop the additional aliasing of ioport regions and also the
>>>>>>> special address space listener. cpu_in and cpu_out now simply call
>>>>>>> address_space_read/write. And we can concentrate portio handling in a
>>>>>>> single source file.
>>>>>>> 
>>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>> ---
>>>>>> 
>>>>>> ...
>>>>>> 
>>>>>>> +
>>>>>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>>>>>> +                         unsigned size)
>>>>>>> +{
>>>>>>> +    MemoryRegionPortioList *mrpio = opaque;
>>>>>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>>>>>> true);
>>>>>>> +
>>>>>>> +    if (mrp) {
>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>>>>>> +    } else if (size == 2) {
>>>>>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>>>>>> +        assert(mrp);
>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>>>>>> 8);
>>>>>>> +    }
>>>>>>> +}
>>>>>>> +
>>>>>>> +static const MemoryRegionOps portio_ops = {
>>>>>>> +    .read = portio_read,
>>>>>>> +    .write = portio_write,
>>>>>>> +    .valid.unaligned = true,
>>>>>>> +    .impl.unaligned = true,
>>>>>>> +};
>>>>>>> +
>>>>>> 
>>>>>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>>>>>> In portio_write above, you clearly assume that data is in LE format.
>>>>> 
>>>>> Anything behind PIO is little endian, of course. Will add this.
>>>> 
>>>> This patch breaks VGA on PPC as it is in master today.
>>> 
>>> If I don't mark portio as little endian it works as expected. There's probably someone swapping things twice.
>> 
>> sPAPR has its MemoryRegion marked Little Endian:
> 
> sPAPR works, it's only the Mac machines that break.

So IIUC before cpu_inw and inl were doing portio accesses in host native endianness. Now with this change they do little endian accesses. All sensible callers of cpu_inX assume that data is passed in native endianness though and already do the little endian conversion themselves.

Semantically having your PCI host bridge do the endianness conversion is the correct way of handling it, as that's where the conversion happens. If it makes life easier to do it in the isa bridging code, that's fine for me too though. But then we'll have to get rid of all endianness swaps that already happen in PCI bridges.


Alex
Alexander Graf - July 11, 2013, 1:35 p.m.
On 11.07.2013, at 15:28, Alexander Graf wrote:

> 
> On 11.07.2013, at 14:48, Alexander Graf wrote:
> 
>> 
>> On 11.07.2013, at 14:46, Andreas Färber wrote:
>> 
>>> Am 11.07.2013 14:34, schrieb Alexander Graf:
>>>> 
>>>> On 11.07.2013, at 14:29, Alexander Graf wrote:
>>>> 
>>>>> 
>>>>> On 24.06.2013, at 08:07, Jan Kiszka wrote:
>>>>> 
>>>>>> On 2013-06-23 22:50, Hervé Poussineau wrote:
>>>>>>> Jan Kiszka a écrit :
>>>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>>> 
>>>>>>>> The current ioport dispatcher is a complex beast, mostly due to the
>>>>>>>> need to deal with old portio interface users. But we can overcome it
>>>>>>>> without converting all portio users by embedding the required base
>>>>>>>> address of a MemoryRegionPortio access into that data structure. That
>>>>>>>> removes the need to have the additional MemoryRegionIORange structure
>>>>>>>> in the loop on every access.
>>>>>>>> 
>>>>>>>> To handle old portio memory ops, we simply install dispatching handlers
>>>>>>>> for portio memory regions when registering them with the memory core.
>>>>>>>> This removes the need for the old_portio field.
>>>>>>>> 
>>>>>>>> We can drop the additional aliasing of ioport regions and also the
>>>>>>>> special address space listener. cpu_in and cpu_out now simply call
>>>>>>>> address_space_read/write. And we can concentrate portio handling in a
>>>>>>>> single source file.
>>>>>>>> 
>>>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>>> ---
>>>>>>> 
>>>>>>> ...
>>>>>>> 
>>>>>>>> +
>>>>>>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>>>>>>> +                         unsigned size)
>>>>>>>> +{
>>>>>>>> +    MemoryRegionPortioList *mrpio = opaque;
>>>>>>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>>>>>>> true);
>>>>>>>> +
>>>>>>>> +    if (mrp) {
>>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>>>>>>> +    } else if (size == 2) {
>>>>>>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>>>>>>> +        assert(mrp);
>>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>>>>>>> 8);
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static const MemoryRegionOps portio_ops = {
>>>>>>>> +    .read = portio_read,
>>>>>>>> +    .write = portio_write,
>>>>>>>> +    .valid.unaligned = true,
>>>>>>>> +    .impl.unaligned = true,
>>>>>>>> +};
>>>>>>>> +
>>>>>>> 
>>>>>>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>>>>>>> In portio_write above, you clearly assume that data is in LE format.
>>>>>> 
>>>>>> Anything behind PIO is little endian, of course. Will add this.
>>>>> 
>>>>> This patch breaks VGA on PPC as it is in master today.
>>>> 
>>>> If I don't mark portio as little endian it works as expected. There's probably someone swapping things twice.
>>> 
>>> sPAPR has its MemoryRegion marked Little Endian:
>> 
>> sPAPR works, it's only the Mac machines that break.
> 
> So IIUC before cpu_inw and inl were doing portio accesses in host native endianness. Now with this change they do little endian accesses. All sensible callers of cpu_inX assume that data is passed in native endianness though and already do the little endian conversion themselves.
> 
> Semantically having your PCI host bridge do the endianness conversion is the correct way of handling it, as that's where the conversion happens. If it makes life easier to do it in the isa bridging code, that's fine for me too though. But then we'll have to get rid of all endianness swaps that already happen in PCI bridges.

Looking at a few more cpu_inX users I just stumbled over this MIPS code:

static uint64_t rtc_read(void *opaque, hwaddr addr, unsigned size)
{
    return cpu_inw(0x71);
}

static void rtc_write(void *opaque, hwaddr addr,
                      uint64_t val, unsigned size)
{
    cpu_outw(0x71, val & 0xff);
}

static const MemoryRegionOps rtc_ops = {
    .read = rtc_read,
    .write = rtc_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
};


That one returned the ioport data in native endianness before (big endian) and should also be broken now. In this case there's no PCI bus to swizzle or not swizzle things around though.

Alex
Benjamin Herrenschmidt - July 11, 2013, 10:30 p.m.
On Thu, 2013-07-11 at 15:28 +0200, Alexander Graf wrote:
> So IIUC before cpu_inw and inl were doing portio accesses in host
> native endianness. Now with this change they do little endian
> accesses. All sensible callers of cpu_inX assume that data is passed
> in native endianness though and already do the little endian
> conversion themselves.
> 
> Semantically having your PCI host bridge do the endianness conversion
> is the correct way of handling it, as that's where the conversion
> happens. If it makes life easier to do it in the isa bridging code,
> that's fine for me too though. But then we'll have to get rid of all
> endianness swaps that already happen in PCI bridges.

Or stop being utterly insane and remove all that endianness crap in
bridges etc... :-) The whole thing is completely ass backward to begin
with.

Cheers,
Ben.
Benjamin Herrenschmidt - July 11, 2013, 10:32 p.m.
On Thu, 2013-07-11 at 15:28 +0200, Alexander Graf wrote:
> 
> Semantically having your PCI host bridge do the endianness conversion
> is the correct way of handling it, as that's where the conversion
> happens. If it makes life easier to do it in the isa bridging code,
> that's fine for me too though. But then we'll have to get rid of all
> endianness swaps that already happen in PCI bridges.

BTW. Why would IOs to a PCI VGA device go through an "ISA bridge" of any
kind ? PCI IO space is PCI IO space... whether there is an ISA bridge or
not is a separate thing and PCI VGA cards certainly don't sit behind
one.

Ben.
Alexander Graf - July 12, 2013, 3:18 a.m.
Am 12.07.2013 um 00:32 schrieb Benjamin Herrenschmidt <benh@kernel.crashing.org>:

> On Thu, 2013-07-11 at 15:28 +0200, Alexander Graf wrote:
>> 
>> Semantically having your PCI host bridge do the endianness conversion
>> is the correct way of handling it, as that's where the conversion
>> happens. If it makes life easier to do it in the isa bridging code,
>> that's fine for me too though. But then we'll have to get rid of all
>> endianness swaps that already happen in PCI bridges.
> 
> BTW. Why would IOs to a PCI VGA device go through an "ISA bridge" of any
> kind ? PCI IO space is PCI IO space... whether there is an ISA bridge or
> not is a separate thing and PCI VGA cards certainly don't sit behind
> one.

We model a single system wide io space today and access to that one happens through you pci host controller. I just messed up the terminology here.

Alex

> 
> Ben.
> 
>
Benjamin Herrenschmidt - July 12, 2013, 11:35 a.m.
On Fri, 2013-07-12 at 05:18 +0200, Alexander Graf wrote:
> We model a single system wide io space today and access to that one
> happens through you pci host controller. I just messed up the
> terminology here.

Single system wide IO space is broken. We have separate IO space per
PHB. That was working afaik.

In any case, I completely object to all that business with conversion in
bridges.

That's fundamentally WRONG.

The whole business of endianness in qemu is a mess. In the end what
matters and the only thing that does is:

 * The endianness of a given memory access by the guest (which may or
may not be the endianness of the guest -> MSR:LE, byteswap load/store
instsructions, etc..)

vs.

 * The endianness of the target device register (and I say register ...
a framebuffer does NOT have endianness per-se and thus accesses to BAR
mapping a "memory" range (framebuffer, ROM, ...) should go such that the
*byte order* of individual bytes is preserved, which typically means
untranslated).

Unless they are completely broken (and those exist, don't get me wrong,
though mostly they are a thing of a past long gone), bridges and busses
have no effect on endianness.

So I'm not sure what you guys are up to, but from what I read, it's
wrong, and the fact at this stage is that your broke IO space (and thus
virtio and VGA) on powerpc (including pseries).

Cheers,
Ben.
Anthony Liguori - July 12, 2013, 12:56 p.m.
Alexander Graf <agraf@suse.de> writes:

> On 11.07.2013, at 14:29, Alexander Graf wrote:
>
>> This patch breaks VGA on PPC as it is in master today.
>
> If I don't mark portio as little endian it works as expected. There's
> probably someone swapping things twice.

This is the correct fix.  Can you please send a patch?

I/O dispatch functions take a native endian argument and when
cpu_{in,out][bwl] is called it's passed a native endian argument.  There
is no need to ever byte swap.

The ISA bus is going to have data pins numbered D0..D15.  There is no
concept of endianness.  D0 is the LSB.  How the CPU stores bytes in
memory is orthogonal to what is happening on the bus.

Regards,

Anthony Liguori

>
>
> Alex
Alexander Graf - July 12, 2013, 2:30 p.m.
Am 12.07.2013 um 14:56 schrieb Anthony Liguori <anthony@codemonkey.ws>:

> Alexander Graf <agraf@suse.de> writes:
> 
>> On 11.07.2013, at 14:29, Alexander Graf wrote:
>> 
>>> This patch breaks VGA on PPC as it is in master today.
>> 
>> If I don't mark portio as little endian it works as expected. There's
>> probably someone swapping things twice.
> 
> This is the correct fix.  Can you please send a patch?

I only have my phone with me right now. The problem is that we need to

  1) revert the ioport access ops back to native
  2) fix the le assumption for the w-on-b emulation case

Alex

> 
> I/O dispatch functions take a native endian argument and when
> cpu_{in,out][bwl] is called it's passed a native endian argument.  There
> is no need to ever byte swap.
> 
> The ISA bus is going to have data pins numbered D0..D15.  There is no
> concept of endianness.  D0 is the LSB.  How the CPU stores bytes in
> memory is orthogonal to what is happening on the bus.
> 
> Regards,
> 
> Anthony Liguori
> 
>> 
>> 
>> Alex
Hervé Poussineau - July 12, 2013, 5:04 p.m.
Benjamin Herrenschmidt a écrit :
> On Fri, 2013-07-12 at 05:18 +0200, Alexander Graf wrote:
>> We model a single system wide io space today and access to that one
>> happens through you pci host controller. I just messed up the
>> terminology here.
> 
> Single system wide IO space is broken. We have separate IO space per
> PHB. That was working afaik.
> 
> In any case, I completely object to all that business with conversion in
> bridges.
> 
> That's fundamentally WRONG.

Indeed.

> 
> The whole business of endianness in qemu is a mess. In the end what
> matters and the only thing that does is:
> 
>  * The endianness of a given memory access by the guest (which may or
> may not be the endianness of the guest -> MSR:LE, byteswap load/store
> instsructions, etc..)
> 

On PowerPC PReP, you have the MSR:LE bit in the CPU, and the system 
endianness (port 0x92 on PReP system I/O board). Of course, both can be 
changed independently, but they are usually changed within a few 
instructions.

When only one of them is changed, this means some "interesting" things 
can happen. For example, in [1]:
"When PowerPC little-endian is in effect but before system's 
little-endian is in effect, the address munge becomes effective. If we 
need to access system endian port (byte) address, say 0x80000092, the 
program needs to issue 0x80000095 instead, to compensate (unmunge) for 
the effect of PowerPC address modification."

Those will be required for QEMU to be able to run BeOS, AIX, or Windows 
NT or PowerPC PReP.

In the light of this, I think there should only be one endianness for 
all memory accesses (which could be changed at runtime), and all 
bridges/devices should ask for "same endianness as parent" or "reverse 
endianness as parent", but not for big, little, or native endianness.

Regards,

Hervé

[1] ftp://ftp.software.ibm.com/rs6000/technology/spec/endian.ps)
Anthony Liguori - July 12, 2013, 5:49 p.m.
Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:

> On Fri, 2013-07-12 at 05:18 +0200, Alexander Graf wrote:
>> We model a single system wide io space today and access to that one
>> happens through you pci host controller. I just messed up the
>> terminology here.
>
> Single system wide IO space is broken. We have separate IO space per
> PHB. That was working afaik.

Hrm, probably not.  We don't propagate I/O spaces very well today.

> In any case, I completely object to all that business with conversion in
> bridges.
>
> That's fundamentally WRONG.
>
> The whole business of endianness in qemu is a mess. In the end what
> matters and the only thing that does is:

It's not as bad as you think I suspect.

>  * The endianness of a given memory access by the guest (which may or
> may not be the endianness of the guest -> MSR:LE, byteswap load/store
> instsructions, etc..)

Correct.

> vs.
>
>  * The endianness of the target device register (and I say register ...
> a framebuffer does NOT have endianness per-se and thus accesses to BAR
> mapping a "memory" range (framebuffer, ROM, ...) should go such that the
> *byte order* of individual bytes is preserved, which typically means
> untranslated).

Yes.  To put it another way, an MMIO write is a store and depending on
the VCPU, that will result in a write with a certain byte order.  That
byte order should be preserved.

However, what we don't model today, and why we have the silly
endianness in MemoryRegionOps, is the fact that I/O may pass through
multiple layers and those layers may change byte ordering.

We jump through great hoops to have a flat dispatch table.  I've never
liked it but that's what we do.  That means that in cases where a host
bridge may do byte swapping, we cannot easily support that.

That's really what endianness is for but it's abused.

> Unless they are completely broken (and those exist, don't get me wrong,
> though mostly they are a thing of a past long gone), bridges and busses
> have no effect on endianness.

That's simply not true.  There are programmable PCI host bridges that
support byte swapping.  Some allow this to be done on a per-device basis
too.

> So I'm not sure what you guys are up to, but from what I read, it's
> wrong, and the fact at this stage is that your broke IO space (and thus
> virtio and VGA) on powerpc (including pseries).

I'm not sure what this patch was trying to do but it was certainly
wrong.

Regards,

Anthony Liguori

>
> Cheers,
> Ben.
Peter Maydell - July 12, 2013, 6:26 p.m.
On 12 July 2013 18:49, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:
>> On Fri, 2013-07-12 at 05:18 +0200, Alexander Graf wrote:
>>> We model a single system wide io space today and access to that one
>>> happens through you pci host controller. I just messed up the
>>> terminology here.
>>
>> Single system wide IO space is broken. We have separate IO space per
>> PHB. That was working afaik.
>
> Hrm, probably not.  We don't propagate I/O spaces very well today.
>
>> In any case, I completely object to all that business with conversion in
>> bridges.
>>
>> That's fundamentally WRONG.

It's not wrong when the hardware actually does a byteswap at
some level in the memory hierarchy. You can see this for instance
on ARMv7M systems, where byteswapping for bigendian happens at
an intermediate level that not all accesses go through:

 [CPU] ---->  [byteswap here] --> [memory and ext. devices]
         |
         -->  [internal memory mapped devices]

so some things see always little endian regardless.

>> The whole business of endianness in qemu is a mess. In the end what
>> matters and the only thing that does is:
>
> It's not as bad as you think I suspect.
>
>>  * The endianness of a given memory access by the guest (which may or
>> may not be the endianness of the guest -> MSR:LE, byteswap load/store
>> instsructions, etc..)
>
> Correct.
>
>> vs.
>>
>>  * The endianness of the target device register (and I say register ...
>> a framebuffer does NOT have endianness per-se and thus accesses to BAR
>> mapping a "memory" range (framebuffer, ROM, ...) should go such that the
>> *byte order* of individual bytes is preserved, which typically means
>> untranslated).
>
> Yes.  To put it another way, an MMIO write is a store and depending on
> the VCPU, that will result in a write with a certain byte order.  That
> byte order should be preserved.
>
> However, what we don't model today, and why we have the silly
> endianness in MemoryRegionOps, is the fact that I/O may pass through
> multiple layers and those layers may change byte ordering.
>
> We jump through great hoops to have a flat dispatch table.  I've never
> liked it but that's what we do.  That means that in cases where a host
> bridge may do byte swapping, we cannot easily support that.

We could support that if we cared to -- you just have to have a
container MemoryRegion type which is a byte-swapping container
(or just have a flag on existing containers, I suppose).
Then as you flatten the regions into the flat table you keep
track of how many levels of byteswapping each region goes through,
and you end up with a single 'byteswap or not?' flag for each
section of your flat dispatch table.

(Our other serious endianness problem is that we don't really
do very well at supporting a TCG CPU arbitrarily flipping
endianness -- TARGET_WORDS_BIGENDIAN is a compile time setting
and ideally it should not be.)

thanks
-- PMM
Anthony Liguori - July 12, 2013, 7:06 p.m.
Hervé Poussineau <hpoussin@reactos.org> writes:

> When only one of them is changed, this means some "interesting" things 
> can happen. For example, in [1]:
> "When PowerPC little-endian is in effect but before system's 
> little-endian is in effect, the address munge becomes effective. If we 
> need to access system endian port (byte) address, say 0x80000092, the 
> program needs to issue 0x80000095 instead, to compensate (unmunge) for 
> the effect of PowerPC address modification."
>
> Those will be required for QEMU to be able to run BeOS, AIX, or Windows 
> NT or PowerPC PReP.
>
> In the light of this, I think there should only be one endianness for 
> all memory accesses (which could be changed at runtime),

We already do this, it's "host native endian".

> and all 
> bridges/devices should ask for "same endianness as parent" or "reverse 
> endianness as parent", but not for big, little, or native endianness.

I/O doesn't propagate so there's no way to say something like this.
That's the fundamental problem IMHO.

Regards,

Anthony Liguori

>
> Regards,
>
> Hervé
>
> [1] ftp://ftp.software.ibm.com/rs6000/technology/spec/endian.ps)
Anthony Liguori - July 12, 2013, 7:36 p.m.
Hervé Poussineau <hpoussin@reactos.org> writes:

> Jan Kiszka a écrit :
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>> 
>> The current ioport dispatcher is a complex beast, mostly due to the
>> need to deal with old portio interface users. But we can overcome it
>> without converting all portio users by embedding the required base
>> address of a MemoryRegionPortio access into that data structure. That
>> removes the need to have the additional MemoryRegionIORange structure
>> in the loop on every access.
>> 
>> To handle old portio memory ops, we simply install dispatching handlers
>> for portio memory regions when registering them with the memory core.
>> This removes the need for the old_portio field.
>> 
>> We can drop the additional aliasing of ioport regions and also the
>> special address space listener. cpu_in and cpu_out now simply call
>> address_space_read/write. And we can concentrate portio handling in a
>> single source file.
>> 
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>
> ...
>
>> +
>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>> +                         unsigned size)
>> +{
>> +    MemoryRegionPortioList *mrpio = opaque;
>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size, true);
>> +
>> +    if (mrp) {
>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>> +    } else if (size == 2) {
>> +        mrp = find_portio(mrpio, addr, 1, true);
>> +        assert(mrp);
>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data >> 8);
>> +    }
>> +}
>> +
>> +static const MemoryRegionOps portio_ops = {
>> +    .read = portio_read,
>> +    .write = portio_write,
>> +    .valid.unaligned = true,
>> +    .impl.unaligned = true,
>> +};
>> +
>
> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
> In portio_write above, you clearly assume that data is in LE format.
>
> This fixes PPC PReP emulation, which would otherwise be broken with this 
> patchset.

Did you actually test that as this is how the code behaved previously?

Regards,

Anthony Liguori

>
> Hervé
Benjamin Herrenschmidt - July 12, 2013, 10:39 p.m.
On Fri, 2013-07-12 at 19:04 +0200, Hervé Poussineau wrote:
> "When PowerPC little-endian is in effect but before system's 
> little-endian is in effect, the address munge becomes effective. If we 
> need to access system endian port (byte) address, say 0x80000092, the 
> program needs to issue 0x80000095 instead, to compensate (unmunge) for 
> the effect of PowerPC address modification."
> 
> Those will be required for QEMU to be able to run BeOS, AIX, or Windows 
> NT or PowerPC PReP.

Funnily enough we've dropped support for Linux on PReP a long time
ago :-)

> In the light of this, I think there should only be one endianness for 
> all memory accesses (which could be changed at runtime), and all 
> bridges/devices should ask for "same endianness as parent" or "reverse 
> endianness as parent", but not for big, little, or native endianness.

Right. I would in fact avoid the term "endianness" that tends to confuse
people and basically stick to this:

 - For a given access, know (attribute) which byte is first in ascending
address order

 - For a given target *register* (and again, this ONLY applies to registers
and in fact, some devices might change their register "endianness" and also
have several banks with a different one), which byte is first in ascending
address order.

This is at that point that a conversion might need to occur, and only at that
point if there is a mismatch.

Ben.
Benjamin Herrenschmidt - July 12, 2013, 10:44 p.m.
On Fri, 2013-07-12 at 12:49 -0500, Anthony Liguori wrote:
> 
> > Unless they are completely broken (and those exist, don't get me wrong,
> > though mostly they are a thing of a past long gone), bridges and busses
> > have no effect on endianness.
> 
> That's simply not true.  There are programmable PCI host bridges that
> support byte swapping.  Some allow this to be done on a per-device basis
> too.

Those bridges are utterly broken and the result of very sick HW design
mind. I've encountered that sort of thing before and the only sane way
to use them is to disable that shit.

If a bridge swaps, it breaks byte order of non-register accesses (FIFO
transfers, DMAs, ...) and hell breaks lose.

Unless you confuse with the byte lane swapping that must occur when
bridging a processor to a bus when the processor changes its bus byte
order.

IE. Some processors (generally older, people have grown clues since
then) that can support dual endian operations had the bad habit of
reversing the location of the low-address and high-address bytes on
their bus (or rather didn't properly reverse LSB/MSB and thus requires
bridges to do it). Such processors do require the bridge to swap the
byte lanes when changing endianness. However I've been told that even
ARM doesn't do that any more.

In any case, even if you want to model a piece of crap like that, you
shouldn't do so by calling the bus "big endian" or "little endian" but
something like a byte lane swap attribute, which more precisely
describes what the bridge is doing. This is not endianness.

> > So I'm not sure what you guys are up to, but from what I read, it's
> > wrong, and the fact at this stage is that your broke IO space (and thus
> > virtio and VGA) on powerpc (including pseries).
> 
> I'm not sure what this patch was trying to do but it was certainly
> wrong.

Right :-)

Ben.
Benjamin Herrenschmidt - July 12, 2013, 10:50 p.m.
On Fri, 2013-07-12 at 19:26 +0100, Peter Maydell wrote:

> It's not wrong when the hardware actually does a byteswap at
> some level in the memory hierarchy. You can see this for instance
> on ARMv7M systems, where byteswapping for bigendian happens at
> an intermediate level that not all accesses go through:
> 
>  [CPU] ---->  [byteswap here] --> [memory and ext. devices]
>          |
>          -->  [internal memory mapped devices]
> 
> so some things see always little endian regardless.

Ugh ? That's so completely fucked up, if that's indeed what the HW is
doing this is a piece of trash and the designers are in urgent need of
being turned into fertilizer.

Unless again you are talking about "lane swapping" which allows to
preserve the byte address invariance when the CPU decides to flip its
bus around, but I would have thought that modern CPUs do not do that
sort of shit anymore.

In any case, it cannot be represented with an "endian" attribute at the
bridge level, that doesn't mean anything. Again, the only endian
attribute that exists are the byte order of the original access (which
byte has the lowest address, regardless of significance of those bytes
in the target, ie, purely from a qemu standpoint, in the variable that
carries the access around inside qemu, which byte has the lowest
address), and the same on the target device (at which point a concept of
significance does apply, but it's a guest driver business to get it
right, qemu just need to make sure byte 0 goes to byte 0).

If a bridge flips things around in a way that breaks the model, then add
some property describing the flipping properties but don't call it "big
endian" or "little endian" at the bridge level, that has no meaning,
confuses things and introduces breakage like we have seen.

> >> The whole business of endianness in qemu is a mess. In the end what
> >> matters and the only thing that does is:
> >
> > It's not as bad as you think I suspect.
> >
> >>  * The endianness of a given memory access by the guest (which may or
> >> may not be the endianness of the guest -> MSR:LE, byteswap load/store
> >> instsructions, etc..)
> >
> > Correct.
> >
> >> vs.
> >>
> >>  * The endianness of the target device register (and I say register ...
> >> a framebuffer does NOT have endianness per-se and thus accesses to BAR
> >> mapping a "memory" range (framebuffer, ROM, ...) should go such that the
> >> *byte order* of individual bytes is preserved, which typically means
> >> untranslated).
> >
> > Yes.  To put it another way, an MMIO write is a store and depending on
> > the VCPU, that will result in a write with a certain byte order.  That
> > byte order should be preserved.
> >
> > However, what we don't model today, and why we have the silly
> > endianness in MemoryRegionOps, is the fact that I/O may pass through
> > multiple layers and those layers may change byte ordering.
> >
> > We jump through great hoops to have a flat dispatch table.  I've never
> > liked it but that's what we do.  That means that in cases where a host
> > bridge may do byte swapping, we cannot easily support that.
> 
> We could support that if we cared to -- you just have to have a
> container MemoryRegion type which is a byte-swapping container
> (or just have a flag on existing containers, I suppose).
> Then as you flatten the regions into the flat table you keep
> track of how many levels of byteswapping each region goes through,
> and you end up with a single 'byteswap or not?' flag for each
> section of your flat dispatch table.
> 
> (Our other serious endianness problem is that we don't really
> do very well at supporting a TCG CPU arbitrarily flipping
> endianness -- TARGET_WORDS_BIGENDIAN is a compile time setting
> and ideally it should not be.)

Our experience is that it actually works fine for almost everything
except virtio :-) ie mostly TARGET_WORDS_BIGENDIAN is irrelevant (and
should be).

Cheers,
Ben.
Benjamin Herrenschmidt - July 12, 2013, 10:59 p.m.
On Fri, 2013-07-12 at 14:06 -0500, Anthony Liguori wrote:
> > In the light of this, I think there should only be one endianness
> for 
> > all memory accesses (which could be changed at runtime),
> 
> We already do this, it's "host native endian".

The simple fact that you used the word "endian" makes it broken here :-)

If the guest has done a byte reverse access for example, what you have
is not "host native endian" anymore.

The simple fact that you use "endian" to describe what you carry around
is a source of wrongness and errors. It's misleading and people always
get that wrong (I did a bloody talk on the matter at two conferences now
and it seems people really don't get what endianness really is about).

> > and all 
> > bridges/devices should ask for "same endianness as parent" or
> "reverse 
> > endianness as parent", but not for big, little, or native
> endianness.
> 
> I/O doesn't propagate so there's no way to say something like this.
> That's the fundamental problem IMHO.

There is no such thing as "endianness of parent" again. This is all
wrong. Wrong terminology, wrong concepts.

Do not try to use the "endian" attribute on a bus or bridge, it will
only confuse things further.

Stick to what is the byte order of your transport (much more precise
term, ie. which byte is the lowest address within the host variable that
carries the access).

Ben.
Peter Maydell - July 12, 2013, 11:10 p.m.
On 12 July 2013 23:50, Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> On Fri, 2013-07-12 at 19:26 +0100, Peter Maydell wrote:
>> It's not wrong when the hardware actually does a byteswap at
>> some level in the memory hierarchy. You can see this for instance
>> on ARMv7M systems, where byteswapping for bigendian happens at
>> an intermediate level that not all accesses go through:
>>
>>  [CPU] ---->  [byteswap here] --> [memory and ext. devices]
>>          |
>>          -->  [internal memory mapped devices]
>>
>> so some things see always little endian regardless.
>
> Ugh ? That's so completely fucked up, if that's indeed what the HW is
> doing this is a piece of trash and the designers are in urgent need of
> being turned into fertilizer.
>
> Unless again you are talking about "lane swapping" which allows to
> preserve the byte address invariance when the CPU decides to flip its
> bus around, but I would have thought that modern CPUs do not do that
> sort of shit anymore.

The block marked "byteswap here" does "byte invariant bigendian",
so byte accesses are unchanged, 16 bit accesses have the two words
of data flipped, and 32 bit accesses have the four bytes flipped;
this happens as the data passes through; addresses are unchanged.
It only happens if the CPU is configured by the guest to operate
in big-endian mode, obviously.
(Contrast 'word invariant bigendian', which is what ARM used to do,
where the addresses are changed but the data is not. That would be
pretty painful to implement in the memory region API though it is
of course trivial in hardware since it is just XORing of the low
address bits according to the access size...)

> Again, the only endian
> attribute that exists are the byte order of the original access (which
> byte has the lowest address, regardless of significance of those bytes
> in the target, ie, purely from a qemu standpoint, in the variable that
> carries the access around inside qemu, which byte has the lowest
> address)

What does this even mean? At the point where a memory access leaves
the CPU (emulation or real hardware) it has (a) an address and
(b) a width -- a 16 bit access is neither big nor little endian,
it's just a request for 16 bits of data (on real hardware it's
typically a bus transaction on a bunch of data lines with some
control lines indicating transaction width). Now the CPU emulation
may internally be intending to put that data into its emulated
register one way round or the other, but that's an internal detail
of the CPU emulation. (Similarly for stores.)

>, and the same on the target device (at which point a concept of
> significance does apply, but it's a guest driver business to get it
> right, qemu just need to make sure byte 0 goes to byte 0).

Similarly, at the target device end there is no concept
of a "big endian access" -- we make a request for 16
bits of data at a particular address (via the MemoryRegion
API) and the device returns 16 bits of data. It's entirely
possible to design hardware so that byte access to address
X, halfword access to address X and word access to address
X all return entirely different data (though it would be
a bit perverse.) (As an implementation convenience we may
choose to provide helper infrastructure so you don't have
to actually implement all of byte/halfword/word access by hand.)

> If a bridge flips things around in a way that breaks the
> model,

That breaks what model?

> then add some property describing the flipping
> properties but don't call it "big
> endian" or "little endian" at the bridge level, that has no meaning,
> confuses things and introduces breakage like we have seen.

I'm happy to call the property "byteswap", yes, because
that's what it does. If you did two of these in a row you'd
get a no-op.

>> (Our other serious endianness problem is that we don't really
>> do very well at supporting a TCG CPU arbitrarily flipping
>> endianness -- TARGET_WORDS_BIGENDIAN is a compile time setting
>> and ideally it should not be.)
>
> Our experience is that it actually works fine for almost everything
> except virtio :-) ie mostly TARGET_WORDS_BIGENDIAN is irrelevant (and
> should be).

I agree that TARGET_WORDS_BIGENDIAN *should* go away, but
it exists currently. Do you actually implement a CPU which
does dynamic endianness flipping? Is it at all efficient
in the config which is the opposite of whatever
TARGET_WORDS_BIGENDIAN says?

thanks
-- PMM
Benjamin Herrenschmidt - July 12, 2013, 11:49 p.m.
On Sat, 2013-07-13 at 00:10 +0100, Peter Maydell wrote:

> The block marked "byteswap here" does "byte invariant bigendian",
> so byte accesses are unchanged, 16 bit accesses have the two words
> of data flipped, and 32 bit accesses have the four bytes flipped;
> this happens as the data passes through; addresses are unchanged.
> It only happens if the CPU is configured by the guest to operate
> in big-endian mode, obviously.
> (Contrast 'word invariant bigendian', which is what ARM used to do,
> where the addresses are changed but the data is not. That would be
> pretty painful to implement in the memory region API though it is
> of course trivial in hardware since it is just XORing of the low
> address bits according to the access size...)

Which means that ARM switches the byte order of its bus when
switching endian which is very very wrong... oh well.

PowerPCs operate in both endians and nowadays do not require any
adjustement at the bus level. It's simple, the definition that matter
and the only definition that matter for data is which byte is at what
address and that shouldn't change. For addresses, they should be in a
fixed significance order, not change order based on the "endianness" of
the processor.

Or to say things differently, byte order of data should be fixed, and
the "endianness" of the bus (which is purely in that case the byte order
of addresses) as well. ARM seem to have chosen to go from fixing the
byte order of data & changing the bus endianness, to changing the byte
order of data to preserve the bus endianness, or something along those
lines. Both approaches are WRONG.

Anyway, whatever, what is done is done, point is, what you have is a
byte lane swapper which is similar to what older ppc did which indeed
is needed if your bus flips around. I would still not model that using
the term "endianness" of either the bus or bridge. It's again purely a
statement of what byte lane coming out corresponds to what *address*,
regardless of byte significance.

> > Again, the only endian
> > attribute that exists are the byte order of the original access (which
> > byte has the lowest address, regardless of significance of those bytes
> > in the target, ie, purely from a qemu standpoint, in the variable that
> > carries the access around inside qemu, which byte has the lowest
> > address)
> 
> What does this even mean? At the point where a memory access leaves
> the CPU (emulation or real hardware) it has (a) an address and
> (b) a width -- a 16 bit access is neither big nor little endian,

Not from the point of view of the bus indeed. The value inside might or
might not have an endianness but that's purely somebody else business.

> it's just a request for 16 bits of data (on real hardware it's
> typically a bus transaction on a bunch of data lines with some
> control lines indicating transaction width). Now the CPU emulation
> may internally be intending to put that data into its emulated
> register one way round or the other, but that's an internal detail
> of the CPU emulation. (Similarly for stores.)

My statement above meant (sorry if it wasn't clear) that what matters is
that when qemu carries that request around (thus carries those 16-bits
in a u16 variable inside qemu itself, ie, the argument of some
load/store callback), the only attribute of interest is which of the
bytes in that u16 variable is the first in ascending address order.

IE. This is a property of the processor bus, it is *defined* which of
those 2 bytes that form that 16 bit wide "access" is supposed to go at
what address as part of the bus definition (though ARM seems to flip it
around).

Obviously, it should be done according to the host endianness, and thus
one would expect that qemu just "storing" that in memory results in the
two bytes being laid out in the right order.

My point here is that if any conversion at that level is needed, it's
purely somewhere in TCG to ensure that what's in the variable carried
out of TCG is represented according to the intended byte order of the
original access.

That's why the word "endianness" is so confusing. At this point, if
anything, the only endianness that exists is the one of the host CPU :-)

Now when we cross your byte-lanes adjusting bridge in qemu, two things
can happen.

 - The bridge is configured properly and it's a nop
 - The bridge is *not* configured properly, and you basically need to
perform a lane swap on anything crossing that bridge.

I wouldn't call that "endian". I wouldn't model that by saying that
some range of addresses is "LE" or "BE" or "Host Endian" or whatever ...

The best way to represent that in qemu would be to have some kind of
"lane swap" attribute which is set based on the (mis)match of the
CPU and bridge configuration.

> >, and the same on the target device (at which point a concept of
> > significance does apply, but it's a guest driver business to get it
> > right, qemu just need to make sure byte 0 goes to byte 0).
> 
> Similarly, at the target device end there is no concept
> of a "big endian access" -- we make a request for 16
> bits of data at a particular address (via the MemoryRegion
> API) and the device returns 16 bits of data.

>  It's entirely
> possible to design hardware so that byte access to address
> X, halfword access to address X and word access to address
> X all return entirely different data (though it would be
> a bit perverse.) (As an implementation convenience we may
> choose to provide helper infrastructure so you don't have
> to actually implement all of byte/halfword/word access by hand.)

In fact, I think some x86 IO devices did that perverse thing in the
past. It's not actually always possible. On modern busses it tends not
to be actually. Often, busses do *not* have low address bits below their
width but use byte enables instead. Now  of course the device could be
perverse enough to try to use those to "deduce" the address and still
return different values but I doubt anybody does that :-)

However this is academic. The point is that the bus the device is on has
a definition of what byte is expected first in ascending address order,
which should match the way qemu carry the data.

Endianness at the device level thus is purely about the interpretation
of those data at a register level, because qemu is no HW.

> > If a bridge flips things around in a way that breaks the
> > model,
> 
> That breaks what model?

Byte order invariance which is what we care about. If the bridge
preserves that, then it's a nop for qemu.

> > then add some property describing the flipping
> > properties but don't call it "big
> > endian" or "little endian" at the bridge level, that has no meaning,
> > confuses things and introduces breakage like we have seen.
> 
> I'm happy to call the property "byteswap", yes, because
> that's what it does. If you did two of these in a row you'd
> get a no-op.

Right but the bridge you mentioned "byteswap" configuration is based on
the configuration of the processor bus byte order, and if those are in
sync, the bridge should essentially be a nop for all intend and purpose.

If byte order is preserved, then the "value" qemu carries around can go
unmodified, all the way.

> >> (Our other serious endianness problem is that we don't really
> >> do very well at supporting a TCG CPU arbitrarily flipping
> >> endianness -- TARGET_WORDS_BIGENDIAN is a compile time setting
> >> and ideally it should not be.)
> >
> > Our experience is that it actually works fine for almost everything
> > except virtio :-) ie mostly TARGET_WORDS_BIGENDIAN is irrelevant (and
> > should be).
> 
> I agree that TARGET_WORDS_BIGENDIAN *should* go away, but
> it exists currently. Do you actually implement a CPU which
> does dynamic endianness flipping? Is it at all efficient
> in the config which is the opposite of whatever
> TARGET_WORDS_BIGENDIAN says?

Yes. I haven't measured the speed (in fact I haven't looked at the code
either, others have, but I know it works).

Cheers,
Ben.
Paolo Bonzini - July 13, 2013, 2:38 p.m.
Il 11/07/2013 14:46, Andreas Färber ha scritto:
> sPAPR has its MemoryRegion marked Little Endian:
> 
> http://git.qemu.org/?p=qemu.git;a=blobdiff;f=hw/spapr_pci.c;h=a08ed11166595bdc493065beb64d4ce5b7b0dded;hp=c2c3079d21d5be2647faf85a8c608ac995d2ca62;hb=a3cfa18eb075c7ef78358ca1956fe7b01caa1724;hpb=286d52ebfc0d0d53c2a878e454292fea14bad41b
> 
> Possibly we can now apply Hervé's patches on top to remove that hack again?

I can post a pull request with Herve's patch, if we agree that it's the
right thing.

Paolo
Anthony Liguori - July 13, 2013, 3:22 p.m.
On Sat, Jul 13, 2013 at 9:38 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 11/07/2013 14:46, Andreas Färber ha scritto:
>> sPAPR has its MemoryRegion marked Little Endian:
>>
>> http://git.qemu.org/?p=qemu.git;a=blobdiff;f=hw/spapr_pci.c;h=a08ed11166595bdc493065beb64d4ce5b7b0dded;hp=c2c3079d21d5be2647faf85a8c608ac995d2ca62;hb=a3cfa18eb075c7ef78358ca1956fe7b01caa1724;hpb=286d52ebfc0d0d53c2a878e454292fea14bad41b
>>
>> Possibly we can now apply Hervé's patches on top to remove that hack again?
>
> I can post a pull request with Herve's patch, if we agree that it's the
> right thing.

http://permalink.gmane.org/gmane.comp.emulators.qemu/221950

Here's what's happening:

1) MMIO request goes to sPAPR PIO area, the vCPU was in BE mode but by
the time the handler is called, the value is in host byte order.

2) sPAPR (incorrectly) byte swaps by marking the region as little
endian (data is now garbage)

3) The portio layer (incorrectly) byte swaps because it is marked as
little endian (data is now good)

4) Dispatch happens to VGA device which (incorrectly) byte swaps
because it is marked as little endian (data is now bad)

(2), (3), and (4) are all wrong.  By removing either (2) or (3) we can
"fix" the regression but that's just because two wrongs make a right
in this situation.

We should remove *all* of the LE markings from ISA devices, remove the
portio mark, and the sPAPR mark.  That's the right fix.

Regards,

Anthony Liguori

>
> Paolo
>
Hervé Poussineau - July 13, 2013, 6:11 p.m.
Anthony Liguori a écrit :
> On Sat, Jul 13, 2013 at 9:38 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> Il 11/07/2013 14:46, Andreas Färber ha scritto:
>>> sPAPR has its MemoryRegion marked Little Endian:
>>>
>>> http://git.qemu.org/?p=qemu.git;a=blobdiff;f=hw/spapr_pci.c;h=a08ed11166595bdc493065beb64d4ce5b7b0dded;hp=c2c3079d21d5be2647faf85a8c608ac995d2ca62;hb=a3cfa18eb075c7ef78358ca1956fe7b01caa1724;hpb=286d52ebfc0d0d53c2a878e454292fea14bad41b
>>>
>>> Possibly we can now apply Hervé's patches on top to remove that hack again?
>> I can post a pull request with Herve's patch, if we agree that it's the
>> right thing.
> 
> http://permalink.gmane.org/gmane.comp.emulators.qemu/221950
> 
> Here's what's happening:
> 
> 1) MMIO request goes to sPAPR PIO area, the vCPU was in BE mode but by
> the time the handler is called, the value is in host byte order.
> 
> 2) sPAPR (incorrectly) byte swaps by marking the region as little
> endian (data is now garbage)
> 
> 3) The portio layer (incorrectly) byte swaps because it is marked as
> little endian (data is now good)
> 
> 4) Dispatch happens to VGA device which (incorrectly) byte swaps
> because it is marked as little endian (data is now bad)
> 
> (2), (3), and (4) are all wrong.  By removing either (2) or (3) we can
> "fix" the regression but that's just because two wrongs make a right
> in this situation.
> 
> We should remove *all* of the LE markings from ISA devices, remove the
> portio mark, and the sPAPR mark.  That's the right fix.

OK for that if that fixes sPAPR.

Hervé
Paolo Bonzini - July 14, 2013, 6:15 a.m.
Il 13/07/2013 17:22, Anthony Liguori ha scritto:
> 1) MMIO request goes to sPAPR PIO area, the vCPU was in BE mode but by
> the time the handler is called, the value is in host byte order.
> 
> 2) sPAPR (incorrectly) byte swaps by marking the region as little
> endian (data is now garbage)
> 
> 3) The portio layer (incorrectly) byte swaps because it is marked as
> little endian (data is now good)
> 
> 4) Dispatch happens to VGA device which (incorrectly) byte swaps
> because it is marked as little endian (data is now bad)
> 
> (2), (3), and (4) are all wrong.  By removing either (2) or (3) we can
> "fix" the regression but that's just because two wrongs make a right
> in this situation.
> 
> We should remove *all* of the LE markings from ISA devices, remove the
> portio mark, and the sPAPR mark.  That's the right fix.

So the bug here is that we have multiple levels of dispatch.  Byte
swapping in the dispatch level only works as long as every dispatch is
merged, which is not the case.

However, I do suspect that you have broken PREP again, because PREP has
1/3/4 but not 2.  Removing (2) IIUC amounts to re-applying commit
a178274efabcbbc5d44805b51def874e47051325, and I think that's a better fix.

Also, what devices exactly would have a non-native byte order?!?  I'm
confused...

Paolo
Anthony Liguori - July 14, 2013, 1:05 p.m.
Paolo Bonzini <pbonzini@redhat.com> writes:

> Il 13/07/2013 17:22, Anthony Liguori ha scritto:
>> 1) MMIO request goes to sPAPR PIO area, the vCPU was in BE mode but by
>> the time the handler is called, the value is in host byte order.
>> 
>> 2) sPAPR (incorrectly) byte swaps by marking the region as little
>> endian (data is now garbage)
>> 
>> 3) The portio layer (incorrectly) byte swaps because it is marked as
>> little endian (data is now good)
>> 
>> 4) Dispatch happens to VGA device which (incorrectly) byte swaps
>> because it is marked as little endian (data is now bad)
>> 
>> (2), (3), and (4) are all wrong.  By removing either (2) or (3) we can
>> "fix" the regression but that's just because two wrongs make a right
>> in this situation.
>> 
>> We should remove *all* of the LE markings from ISA devices, remove the
>> portio mark, and the sPAPR mark.  That's the right fix.
>
> So the bug here is that we have multiple levels of dispatch.  Byte
> swapping in the dispatch level only works as long as every dispatch is
> merged, which is not the case.

It's not clear to me if "endianness" makes sense as a concept in the
memory API.  If a bus wants to byte swap, it would have to redispatch
anyway.

> However, I do suspect that you have broken PREP again, because PREP has
> 1/3/4 but not 2.

"Broken" is a relative term...  Not all ISA devices do (4)--see the
various audio devices.

Jan's original patch did code motion and added the LE flag to portio.
My patch simply reverted the added logic to the code motion so if it
broke PREP, PREP has been broken for quite some time.

> Removing (2) IIUC amounts to re-applying commit
> a178274efabcbbc5d44805b51def874e47051325, and I think that's a better
> fix.

It's a bigger change, but I think we should just remove
MemoryRegionOps::endianness altogether.

> Also, what devices exactly would have a non-native byte order?!?  I'm
> confused...

MMIO/PIO requests don't have a byte order.  It's literally 64 or 32 data
pins that are numbered D0..D31 whereas D0 is the LSB.  It doesn't matter
how the pins are arranged.

It's possible for busses to "byte lane swap" the data lines such that
D0..D7 is rerouted to D23..D31, etc. in order to deal with poorly
written drivers but as benh so colorfully put, this introduces more
problems than it solves.

Unfortunately, the existance of MemoryRegionOps::endianness makes this
difficult to untangle because we have a careful combination of "two
wrongs making a right".  In all fairness, this problem predates the
memory API.  The memory API just carried forward the brokenness.

Device originated DMA is a different story.  This has to happen with a
specific endianness but that is orthogonal to the memory API.

Regards,

Anthony Liguori

>
> Paolo
Peter Maydell - July 14, 2013, 2:58 p.m.
On 14 July 2013 14:05, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> Also, what devices exactly would have a non-native byte order?!?  I'm
>> confused...
>
> MMIO/PIO requests don't have a byte order.  It's literally 64 or 32 data
> pins that are numbered D0..D31 whereas D0 is the LSB.  It doesn't matter
> how the pins are arranged.

Devices themselves do have a byte order, though, right? Specifically,
if you do a 32 bit read of address 0 on a device and an 8 bit read,
then you can distinguish a BE device from an LE one.
(Most notably, RAM in QEMU is always host-endian...)
Devices which only allow 32 bit reads and abort any others wouldn't
have an endianness though.

(I need to sit down and think about this all and draw diagrams
and look at what we currently do, though. BE guests on LE hosts
with and without KVM look particularly thorny.)

-- PMM
Anthony Liguori - July 14, 2013, 3:18 p.m.
On Sun, Jul 14, 2013 at 9:58 AM, Peter Maydell <peter.maydell@linaro.org> wrote:
> On 14 July 2013 14:05, Anthony Liguori <anthony@codemonkey.ws> wrote:
>>> Also, what devices exactly would have a non-native byte order?!?  I'm
>>> confused...
>>
>> MMIO/PIO requests don't have a byte order.  It's literally 64 or 32 data
>> pins that are numbered D0..D31 whereas D0 is the LSB.  It doesn't matter
>> how the pins are arranged.
>
> Devices themselves do have a byte order, though, right? Specifically,
> if you do a 32 bit read of address 0 on a device and an 8 bit read,

It depends on the bus and device.  Busses don't necessary pass the I/O
size down to the device like that.  If it does, the device may do any
number of things (including ignoring the request entirely.

What's most common AFAIK is that the access is treated as a word
access and then truncated.  IOW, the device sees the 32-bit word read
but somewhere along the way, the top 24 bits are discarded.

The real interesting question is what happens when you do a byte
access at address 1.  I think most devices simply don't allow that.

> then you can distinguish a BE device from an LE one.
> (Most notably, RAM in QEMU is always host-endian...)
> Devices which only allow 32 bit reads and abort any others wouldn't
> have an endianness though.

My guess is that if you do this with a PCI device we have marked as
LE, you'll get a truncated 32-bit read which will make it appear LE.
But that doesn't mean it's LE.

> (I need to sit down and think about this all and draw diagrams
> and look at what we currently do, though. BE guests on LE hosts
> with and without KVM look particularly thorny.)

I took a first pass at cleaning this up and broke PPC so I'm
investigating it further.  There may be another layer of silliness
hidden somewhere too.

Regards,

Anthony Liguori

> -- PMM
Peter Maydell - July 14, 2013, 4:50 p.m.
On 14 July 2013 16:18, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On Sun, Jul 14, 2013 at 9:58 AM, Peter Maydell <peter.maydell@linaro.org> wrote:
>> Devices themselves do have a byte order, though, right? Specifically,
>> if you do a 32 bit read of address 0 on a device and an 8 bit read,
>
> It depends on the bus and device.  Busses don't necessary pass the I/O
> size down to the device like that.  If it does, the device may do any
> number of things (including ignoring the request entirely.

Agreed.

> What's most common AFAIK is that the access is treated as a word
> access and then truncated.  IOW, the device sees the 32-bit word read
> but somewhere along the way, the top 24 bits are discarded.
>
> The real interesting question is what happens when you do a byte
> access at address 1.  I think most devices simply don't allow that.

Mmm, and where it is permitted it's going to be device dependent
(for instance the ARM GIC has most registers be 32-bit access only,
with a few which allow 8-bit access; and the GIC spec states that
those byte accesses use a little endian memory order model).

It looks like most of the ARM PLxxx devices on the APB bus simply
don't wire up address lines 0 and 1, so an access to address 1
will behave like one to address 0. (Of course we don't model this.)

-- PMM
Anthony Liguori - July 15, 2013, 2:01 p.m.
Peter Maydell <peter.maydell@linaro.org> writes:

> On 12 July 2013 23:50, Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
>> Our experience is that it actually works fine for almost everything
>> except virtio :-) ie mostly TARGET_WORDS_BIGENDIAN is irrelevant (and
>> should be).
>
> I agree that TARGET_WORDS_BIGENDIAN *should* go away, but
> it exists currently. Do you actually implement a CPU which
> does dynamic endianness flipping?

TCG already supports bi-endianness for PPC.  Grep for 'le_mode' in
target-ppc/translate.c to see how the TCG instruction stream is affected
by it.

On PPC, le_mode only really affects load/stores and instruction
decoding.  Any shared data structures between the CPU and OS remain big
endian.

So TARGET_WORDS_BIGENDIAN is still accurate even when le_mode=1.  It's
not the semantic equivalent of changing TARGET_WORDS.

Regards,

Anthony Liguori

> Is it at all efficient
> in the config which is the opposite of whatever
> TARGET_WORDS_BIGENDIAN says?
>
> thanks
> -- PMM
Peter Maydell - July 15, 2013, 2:10 p.m.
On 15 July 2013 15:01, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Peter Maydell <peter.maydell@linaro.org> writes:
>> On 12 July 2013 23:50, Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
>> I agree that TARGET_WORDS_BIGENDIAN *should* go away, but
>> it exists currently. Do you actually implement a CPU which
>> does dynamic endianness flipping?
>
> TCG already supports bi-endianness for PPC.  Grep for 'le_mode' in
> target-ppc/translate.c to see how the TCG instruction stream is affected
> by it.

Right, that's what I thought. This means that if you're on a
little-endian host and using a PPC chip in le-mode then the
QEMU infrastructure will byteswap the data, and then those
generated bswap* ops will byteswap it again. It would be more
efficient if we supported dynamically having the emulated
CPU specify whether it wanted a BE or LE memory access, and
then byteswapping the correct number of times based on
device/host endianness/CPU setting. (This would also avoid
ugliness like the current ARM BE8 code which should be doing
"BE read for data, LE read for instructions" and instead has
to do "BE read of everything via TARGET_WORDS_BIGENDIAN
and then swap the instruction words again to get back to LE".)
If we ever want to support multiple different CPUs in one
model we'll need this anyway, because one might be BE and
another LE.

> On PPC, le_mode only really affects load/stores and instruction
> decoding.  Any shared data structures between the CPU and OS remain big
> endian.
>
> So TARGET_WORDS_BIGENDIAN is still accurate even when
> le_mode=1.  It's not the semantic equivalent of
> changing TARGET_WORDS.

I'm not sure what you mean by "accurate" here. It's a compile
time switch, when the hardware we're actually emulating does
runtime (dynamic) switching, and when the current runtime
mode doesn't match the compiled-in setting we end up doing
unnecessary double-swap operations.

thanks
-- PMM
Benjamin Herrenschmidt - July 15, 2013, 2:16 p.m.
On Mon, 2013-07-15 at 09:01 -0500, Anthony Liguori wrote:
> On PPC, le_mode only really affects load/stores and instruction
> decoding.  Any shared data structures between the CPU and OS remain big
> endian.

Translation:

On ppc with the specific machine type "pseries" which emulates a
paravirtualized environment, the data structures between the guest and
the hypervisor remain big endian :-)

But this is essentially irrelevant to the basic discussion about endian
handling (ie. PCI devices remain LE, etc...).

Point is, TCG can deal with dual endian reasonably easily. The main
issues are around things like virtio with its stupid concept of "guest
endian" (wtf does guest endian means on a dual endian processor ?) but I
think we've whacked Rusty's head plenty enough on that one... and it's
not that hard to work around.

> So TARGET_WORDS_BIGENDIAN is still accurate even when le_mode=1.  It's
> not the semantic equivalent of changing TARGET_WORDS.

The point is that it's unnecessary :-) IE. We should aim to get rid of
the concept of target endianness altogether :-)

Cheers,
Ben.
Jan Kiszka - July 16, 2013, 7:18 a.m.
On 2013-07-14 17:18, Anthony Liguori wrote:
> On Sun, Jul 14, 2013 at 9:58 AM, Peter Maydell <peter.maydell@linaro.org> wrote:
>> On 14 July 2013 14:05, Anthony Liguori <anthony@codemonkey.ws> wrote:
>>>> Also, what devices exactly would have a non-native byte order?!?  I'm
>>>> confused...
>>>
>>> MMIO/PIO requests don't have a byte order.  It's literally 64 or 32 data
>>> pins that are numbered D0..D31 whereas D0 is the LSB.  It doesn't matter
>>> how the pins are arranged.
>>
>> Devices themselves do have a byte order, though, right? Specifically,
>> if you do a 32 bit read of address 0 on a device and an 8 bit read,
> 
> It depends on the bus and device.  Busses don't necessary pass the I/O
> size down to the device like that.  If it does, the device may do any
> number of things (including ignoring the request entirely.
> 
> What's most common AFAIK is that the access is treated as a word
> access and then truncated.  IOW, the device sees the 32-bit word read
> but somewhere along the way, the top 24 bits are discarded.
> 
> The real interesting question is what happens when you do a byte
> access at address 1.  I think most devices simply don't allow that.
> 
>> then you can distinguish a BE device from an LE one.
>> (Most notably, RAM in QEMU is always host-endian...)
>> Devices which only allow 32 bit reads and abort any others wouldn't
>> have an endianness though.
> 
> My guess is that if you do this with a PCI device we have marked as
> LE, you'll get a truncated 32-bit read which will make it appear LE.
> But that doesn't mean it's LE.
> 
>> (I need to sit down and think about this all and draw diagrams
>> and look at what we currently do, though. BE guests on LE hosts
>> with and without KVM look particularly thorny.)
> 
> I took a first pass at cleaning this up and broke PPC so I'm
> investigating it further.  There may be another layer of silliness
> hidden somewhere too.

Sorry for sending out invitations and then being late to this party -
vacation. What is the status now? Do we have a short-term plan to avoid
the regression or is this better solved by cleaning up the whole
endianess thing? Is anyone actively on it, or should I take a drink, sit
down and join the discussion?

Jan
Paolo Bonzini - July 16, 2013, 7:33 a.m.
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Il 16/07/2013 09:18, Jan Kiszka ha scritto:
> Sorry for sending out invitations and then being late to this party
> - vacation. What is the status now? Do we have a short-term plan to
> avoid the regression or is this better solved by cleaning up the
> whole endianess thing? Is anyone actively on it, or should I take a
> drink, sit down and join the discussion?

Basically, we need testing.  The current state of the tree is before
Herve's patch, which means PREP is (should be) broken.

Alexey posted a patch that reintroduces the DEVICE_LITTLE_ENDIAN and
removes the cpu_{in,out}{b,w,l} indirection.

http://permalink.gmane.org/gmane.comp.emulators.qemu/222345

We need to test platforms that used a cpu_{in,out}{b,w,l} indirection
(MIPS, PPC, SPARC) with and without Alexey's patch.

The other occurrences of indirections are:

- - hw/isa/i82378.c: This is PREP.  Again, removing the indirection
should be tested by Herve or Andreas on top of Alexey's patch.

- - hw/isa/isa_mmio: bamboo, g3beige and mac99 could be tested by Alex.
 I don't know about MIPS.  If anything is broken, the solution is to
replace isa_mmio_{setup,init} with an alias to get_system_io().  This
stops using isa_mmio altogether, so it can be done only on those
platforms where it's needed.

- - hw/pci-host/apb.c: this is SPARC.  Perhaps Mark Cave-Ayland can test
it and see if it is broken---again with and without Alexey's patch.
There is a small difference.  This file uses DEVICE_NATIVE_ENDIAN and
does the byte swap itself in pci_apb_io{read,write}{b,w,l}.

There is also Alpha.  It doesn't matter because it's little endian,
but anyway rth is removing the indirection.

Paolo
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.19 (GNU/Linux)
Comment: Using GnuPG with Thunderbird - http://www.enigmail.net/

iQIcBAEBAgAGBQJR5PcwAAoJEBvWZb6bTYbyAOMP/1y19WnUTxcA3+kIhQE1HU2R
ildgJ2VV3kRaY4n1HeBuXOXEQzNxwppC0c0Fq/fQpYWhUxB7Y7UTtDVq9S997EUm
SoU02UlD2+xnmgOSSTTrYNY20WBRt5/s3xkMjiQSPKItBGw1og7+72m/ydzjSPAh
DSBG/mPs12zI3WcTzaydirGVX874pV2N76TSyP+T4DMkipGlgGs1JSOP+Ib/fAWw
Hk/mIu+in7qQjv7DZ/0fJuk8cvbgafCGsKppQ5yYu2YKqkmsLhR6xPNeduWK2GMv
eRLOoobSOlQxUL4konlvI74Gw/+bFtpjWcjybDfAKcVBkm8H/zpg8SIgbmlbJzoR
Gx2bQJvj1PN9pJ/TZRo+1bcw4W7JXKN94s8CTWCL8eGDELB7BgNZFXrxnyfPrY5f
8ega3uZVtZ6SmtPMO7g5arQDfqdvplH/oQozK0muxXgfBzWwvF0Y/qDNRHY18yXC
ZdPdEI2FmslOp8ZzhFW+VApfJXDADlC9xOV9rIJ0jqOewBdJlKrEkqJ8oedYGsPM
am3fmRL3N/KJ1y59hkNEpGhF6yHS7eHJBVNYp0YakthKOKhZTcBfBryFs1Yo5zwt
zvozbSEM0wXnL2ajSwWmU5drH59GdtmYdsIyOm52Etobiu002SZ1VeuCEQOORTTL
qtqyWS4kFP/PTTm8KwV4
=RIcU
-----END PGP SIGNATURE-----
Hervé Poussineau - July 16, 2013, 4:59 p.m.
Paolo Bonzini a écrit :
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Il 16/07/2013 09:18, Jan Kiszka ha scritto:
>> Sorry for sending out invitations and then being late to this party
>> - vacation. What is the status now? Do we have a short-term plan to
>> avoid the regression or is this better solved by cleaning up the
>> whole endianess thing? Is anyone actively on it, or should I take a
>> drink, sit down and join the discussion?
> 
> Basically, we need testing.  The current state of the tree is before
> Herve's patch, which means PREP is (should be) broken.
> 
> Alexey posted a patch that reintroduces the DEVICE_LITTLE_ENDIAN and
> removes the cpu_{in,out}{b,w,l} indirection.
> 
> http://permalink.gmane.org/gmane.comp.emulators.qemu/222345
> 
> We need to test platforms that used a cpu_{in,out}{b,w,l} indirection
> (MIPS, PPC, SPARC) with and without Alexey's patch.
> 
> The other occurrences of indirections are:
> 
> - - hw/isa/i82378.c: This is PREP.  Again, removing the indirection
> should be tested by Herve or Andreas on top of Alexey's patch.

For i82378, I have a big patch for it, which rewrites large parts of the 
emulation. Moreover, as i82378 is only used in PReP machine, this 
indirection can be ignored for now.

> - - hw/isa/isa_mmio: bamboo, g3beige and mac99 could be tested by Alex.
>  I don't know about MIPS.  If anything is broken, the solution is to
> replace isa_mmio_{setup,init} with an alias to get_system_io().  This
> stops using isa_mmio altogether, so it can be done only on those
> platforms where it's needed.

For MIPS Jazz, it is currently broken in SCSI emulation when installing 
Windows NT 4.0/MIPS. However, it seems quite a general problem on 
Windows NT 4, as Neozeed also reported the same freeze when running 
NT4/x86 on http://virtuallyfun.superglobalmegacorp.com/?p=3065 .

> - - hw/pci-host/apb.c: this is SPARC.  Perhaps Mark Cave-Ayland can test
> it and see if it is broken---again with and without Alexey's patch.
> There is a small difference.  This file uses DEVICE_NATIVE_ENDIAN and
> does the byte swap itself in pci_apb_io{read,write}{b,w,l}.
> 
> There is also Alpha.  It doesn't matter because it's little endian,
> but anyway rth is removing the indirection.

Hervé
Paolo Bonzini - July 16, 2013, 5:12 p.m.
Il 16/07/2013 18:59, Hervé Poussineau ha scritto:
> Paolo Bonzini a écrit :
>> -----BEGIN PGP SIGNED MESSAGE-----
>> Hash: SHA1
>>
>> Il 16/07/2013 09:18, Jan Kiszka ha scritto:
>>> Sorry for sending out invitations and then being late to this party
>>> - vacation. What is the status now? Do we have a short-term plan to
>>> avoid the regression or is this better solved by cleaning up the
>>> whole endianess thing? Is anyone actively on it, or should I take a
>>> drink, sit down and join the discussion?
>>
>> Basically, we need testing.  The current state of the tree is before
>> Herve's patch, which means PREP is (should be) broken.
>>
>> Alexey posted a patch that reintroduces the DEVICE_LITTLE_ENDIAN and
>> removes the cpu_{in,out}{b,w,l} indirection.
>>
>> http://permalink.gmane.org/gmane.comp.emulators.qemu/222345
>>
>> We need to test platforms that used a cpu_{in,out}{b,w,l} indirection
>> (MIPS, PPC, SPARC) with and without Alexey's patch.
>>
>> The other occurrences of indirections are:
>>
>> - - hw/isa/i82378.c: This is PREP.  Again, removing the indirection
>> should be tested by Herve or Andreas on top of Alexey's patch.
> 
> For i82378, I have a big patch for it, which rewrites large parts of the
> emulation. Moreover, as i82378 is only used in PReP machine, this
> indirection can be ignored for now.

As you prefer.

>> - - hw/isa/isa_mmio: bamboo, g3beige and mac99 could be tested by Alex.
>>  I don't know about MIPS.  If anything is broken, the solution is to
>> replace isa_mmio_{setup,init} with an alias to get_system_io().  This
>> stops using isa_mmio altogether, so it can be done only on those
>> platforms where it's needed.
> 
> For MIPS Jazz, it is currently broken in SCSI emulation when installing
> Windows NT 4.0/MIPS. However, it seems quite a general problem on
> Windows NT 4, as Neozeed also reported the same freeze when running
> NT4/x86 on http://virtuallyfun.superglobalmegacorp.com/?p=3065 .

But this bug could even prevent VGA initialization, so it should happen
much earlier than that.  So the SCSI breakage would not block testing.

Paolo

>> - - hw/pci-host/apb.c: this is SPARC.  Perhaps Mark Cave-Ayland can test
>> it and see if it is broken---again with and without Alexey's patch.
>> There is a small difference.  This file uses DEVICE_NATIVE_ENDIAN and
>> does the byte swap itself in pci_apb_io{read,write}{b,w,l}.
>>
>> There is also Alpha.  It doesn't matter because it's little endian,
>> but anyway rth is removing the indirection.
> 
> Hervé
> 
>
Alexey Kardashevskiy - July 19, 2013, 11:09 a.m.
Hi!

This patch also breaks virtio on powerpc. I thought it was fixed
(reverted?) in the master branch from qemu.org but it is still there. What
did I miss?

The problem comes from "-drive file=virtimg/fc18guest,if=virtio".

My full command line is:
./qemu-system-ppc64 -L "qemu-ppc64-bios/" \
 -trace "events=qemu_trace_events" \
 -nodefaults \
 -chardev "stdio,id=char1,signal=off,mux=on" \
 -device "spapr-vty,id=ser1,chardev=char1" \
 -mon "chardev=char1,mode=readline,id=mon1" \
 -drive file=virtimg/fc18guest,if=virtio \
 -m "1024" \
 -machine "pseries" \
 -nographic \
 -vga "none" \
 -enable-kvm


This is what happens:

SLOF **********************************************************************
QEMU Starting
 Build Date = Apr 30 2013 14:04:00
 FW Version = git-8cfdfc43f4c4c8c8
 Press "s" to enter Open Firmware.

Populating /vdevice methods
Populating /vdevice/nvram@71000000

NVRAM: size=65536, fetch=200E, store=200F
Populating /vdevice/vty@71000001
Populating /pci@800000020000000
 Adapters on 0800000020000000
                     00 0000 (D) : 1af4 1001    virtio [ block ]
No NVRAM common partition, re-initializing...
claim failed!
Using default console: /vdevice/vty@71000001

  Welcome to Open Firmware

  Copyright (c) 2004, 2011 IBM Corporation All rights reserved.
  This program and the accompanying materials are made available
  under the terms of the BSD License available at
  http://www.opensource.org/licenses/bsd-license.php


Trying to load:  from: disk ... virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
virtioblk_read: Access beyond end of device!
[many of those]



On 07/11/2013 10:29 PM, Alexander Graf wrote:
> 
> On 24.06.2013, at 08:07, Jan Kiszka wrote:
> 
>> On 2013-06-23 22:50, Hervé Poussineau wrote:
>>> Jan Kiszka a écrit :
>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>
>>>> The current ioport dispatcher is a complex beast, mostly due to the
>>>> need to deal with old portio interface users. But we can overcome it
>>>> without converting all portio users by embedding the required base
>>>> address of a MemoryRegionPortio access into that data structure. That
>>>> removes the need to have the additional MemoryRegionIORange structure
>>>> in the loop on every access.
>>>>
>>>> To handle old portio memory ops, we simply install dispatching handlers
>>>> for portio memory regions when registering them with the memory core.
>>>> This removes the need for the old_portio field.
>>>>
>>>> We can drop the additional aliasing of ioport regions and also the
>>>> special address space listener. cpu_in and cpu_out now simply call
>>>> address_space_read/write. And we can concentrate portio handling in a
>>>> single source file.
>>>>
>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>> ---
>>>
>>> ...
>>>
>>>> +
>>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>>> +                         unsigned size)
>>>> +{
>>>> +    MemoryRegionPortioList *mrpio = opaque;
>>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>>> true);
>>>> +
>>>> +    if (mrp) {
>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>>> +    } else if (size == 2) {
>>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>>> +        assert(mrp);
>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>>> 8);
>>>> +    }
>>>> +}
>>>> +
>>>> +static const MemoryRegionOps portio_ops = {
>>>> +    .read = portio_read,
>>>> +    .write = portio_write,
>>>> +    .valid.unaligned = true,
>>>> +    .impl.unaligned = true,
>>>> +};
>>>> +
>>>
>>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>>> In portio_write above, you clearly assume that data is in LE format.
>>
>> Anything behind PIO is little endian, of course. Will add this.
> 
> This patch breaks VGA on PPC as it is in master today.
> 
> 
> Alex
> 
>>
>>>
>>> This fixes PPC PReP emulation, which would otherwise be broken with this
>>> patchset.
>>
>> Thanks,
>> Jan
>>
>>
> 
>
Paolo Bonzini - July 19, 2013, 12:49 p.m.
Il 19/07/2013 13:09, Alexey Kardashevskiy ha scritto:
> Hi!
> 
> This patch also breaks virtio on powerpc. I thought it was fixed
> (reverted?) in the master branch from qemu.org but it is still there. What
> did I miss?

It was not reverted, only the "DEVICE_LITTLE_ENDIAN" marking was.

Let me check if I can reproduce this, it looks like a endianness
problems reading virtio-blk config space.

Paolo

> Trying to load:  from: disk ... virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> virtioblk_read: Access beyond end of device!
> [many of those]
> 
> 
> 
> On 07/11/2013 10:29 PM, Alexander Graf wrote:
>>
>> On 24.06.2013, at 08:07, Jan Kiszka wrote:
>>
>>> On 2013-06-23 22:50, Hervé Poussineau wrote:
>>>> Jan Kiszka a écrit :
>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>
>>>>> The current ioport dispatcher is a complex beast, mostly due to the
>>>>> need to deal with old portio interface users. But we can overcome it
>>>>> without converting all portio users by embedding the required base
>>>>> address of a MemoryRegionPortio access into that data structure. That
>>>>> removes the need to have the additional MemoryRegionIORange structure
>>>>> in the loop on every access.
>>>>>
>>>>> To handle old portio memory ops, we simply install dispatching handlers
>>>>> for portio memory regions when registering them with the memory core.
>>>>> This removes the need for the old_portio field.
>>>>>
>>>>> We can drop the additional aliasing of ioport regions and also the
>>>>> special address space listener. cpu_in and cpu_out now simply call
>>>>> address_space_read/write. And we can concentrate portio handling in a
>>>>> single source file.
>>>>>
>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>> ---
>>>>
>>>> ...
>>>>
>>>>> +
>>>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>>>> +                         unsigned size)
>>>>> +{
>>>>> +    MemoryRegionPortioList *mrpio = opaque;
>>>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>>>> true);
>>>>> +
>>>>> +    if (mrp) {
>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>>>> +    } else if (size == 2) {
>>>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>>>> +        assert(mrp);
>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>>>> 8);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>> +static const MemoryRegionOps portio_ops = {
>>>>> +    .read = portio_read,
>>>>> +    .write = portio_write,
>>>>> +    .valid.unaligned = true,
>>>>> +    .impl.unaligned = true,
>>>>> +};
>>>>> +
>>>>
>>>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>>>> In portio_write above, you clearly assume that data is in LE format.
>>>
>>> Anything behind PIO is little endian, of course. Will add this.
>>
>> This patch breaks VGA on PPC as it is in master today.
>>
>>
>> Alex
>>
>>>
>>>>
>>>> This fixes PPC PReP emulation, which would otherwise be broken with this
>>>> patchset.
>>>
>>> Thanks,
>>> Jan
>>>
>>>
>>
>>
> 
>
Alexey Kardashevskiy - July 19, 2013, 3:48 p.m.
Ok. So.

What broke is...
I could try explaining but backtraces are lot better :)

Shortly - virtio_pci_config_ops.endianness was ignored before (was bad but
we had a workaround in spapr_io_ops), now it works so double swap happens
and everything gets broken.

If we talk about VGA (in powerpc, it is all about powerpc), I guess
memory_region_iorange_write() will go through mr->ops->old_portio branch
and won't do any byte swapping (so spapr_io_ops will do the job), so we are
fine here. I do not understand yet why it works on mac99 though, too late
here :)

h_logical_store is a hypercall for system firmware to do cache inhibited
read/write.


This is with the patch applied (git checkout  b40acf9):


#0  virtqueue_init (vq=0x11014ac0) at
/home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:90
#1  0x0000000010371f28 in virtio_queue_set_addr (vdev=0x11019dd0, n=0x0,
addr=0xd0fb0000000)
    at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:662
#2  0x00000000102027f0 in virtio_ioport_write (opaque=0x11019580, addr=0x8,
val=0xd0fb0000)
    at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:278
#3  0x0000000010202f08 in virtio_pci_config_write (opaque=0x11019580,
addr=0x8, val=0xd0fb0000, size=0x4)
    at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:416
#4  0x000000001037e220 in memory_region_write_accessor (opaque=0x11019c78,
addr=0x8, value=0x1fffff0edc00,
    size=0x4, shift=0x0, mask=0xffffffff) at
/home/alexey/pcipassthru/qemu-impreza/memory.c:364
#5  0x000000001037e36c in access_with_adjusted_size (addr=0x8,
value=0x1fffff0edc00, size=0x4,
    access_size_min=0x1, access_size_max=0x4, access=
    @0x1069df40: 0x1037e164 <memory_region_write_accessor>, opaque=0x11019c78)
    at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
#6  0x0000000010380b5c in memory_region_dispatch_write (mr=0x11019c78,
addr=0x8, data=0xd0fb0000, size=0x4)
    at /home/alexey/pcipassthru/qemu-impreza/memory.c:905
#7  0x0000000010383fa4 in io_mem_write (mr=0x11019c78, addr=0x8,
val=0xfbd0, size=0x4)
    at /home/alexey/pcipassthru/qemu-impreza/memory.c:1608
#8  0x00000000102e2fdc in address_space_rw (as=0x10ef4350
<address_space_io>, addr=0x48,
    buf=0x1fffff0edde0 "", len=0x4, is_write=0x1) at
/home/alexey/pcipassthru/qemu-impreza/exec.c:1918
#9  0x00000000102e33c8 in address_space_write (as=0x10ef4350
<address_space_io>, addr=0x48,
    buf=0x1fffff0edde0 "", len=0x4) at
/home/alexey/pcipassthru/qemu-impreza/exec.c:1969
#10 0x0000000010375754 in cpu_outl (addr=0x48, val=0xfbd0)
    at /home/alexey/pcipassthru/qemu-impreza/ioport.c:309
#11 0x0000000010358240 in spapr_io_write (opaque=0x11016a00, addr=0x48,
data=0xfbd0, size=0x4)
    at /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_pci.c:468
#12 0x000000001037e220 in memory_region_write_accessor (opaque=0x110191f8,
addr=0x48, value=0x1fffff0ee060,
    size=0x4, shift=0x0, mask=0xffffffff) at
/home/alexey/pcipassthru/qemu-impreza/memory.c:364
#13 0x000000001037e36c in access_with_adjusted_size (addr=0x48,
value=0x1fffff0ee060, size=0x4,
    access_size_min=0x1, access_size_max=0x4, access=
    @0x1069df40: 0x1037e164 <memory_region_write_accessor>, opaque=0x110191f8)
    at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
#14 0x0000000010380b5c in memory_region_dispatch_write (mr=0x110191f8,
addr=0x48, data=0xfbd0, size=0x4)
    at /home/alexey/pcipassthru/qemu-impreza/memory.c:905
#15 0x0000000010383fa4 in io_mem_write (mr=0x110191f8, addr=0x48,
val=0xd0fb0000, size=0x4)
    at /home/alexey/pcipassthru/qemu-impreza/memory.c:1608
#16 0x00000000102e47ac in stl_phys_internal (addr=0x10080000048,
val=0xd0fb0000, endian=
    DEVICE_NATIVE_ENDIAN) at /home/alexey/pcipassthru/qemu-impreza/exec.c:2420
#17 0x00000000102e48a8 in stl_phys (addr=0x10080000048, val=0xd0fb0000)
    at /home/alexey/pcipassthru/qemu-impreza/exec.c:2442
#18 0x0000000010354f1c in h_logical_store (cpu=0x1fffff0f0010,
spapr=0x10fe9510, opcode=0x40,
    args=0x1ffffffd0030) at
/home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_hcall.c:570



This is without this patch (i.e. git checkout  b40acf9^ ):

#0  virtqueue_init (vq=0x11014ac0) at
/home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:90
#1  0x00000000103720e4 in virtio_queue_set_addr (vdev=0x11019dd0, n=0x0,
addr=0xffe2000)
    at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:662
#2  0x00000000102027f0 in virtio_ioport_write (opaque=0x11019580, addr=0x8,
val=0xffe2)
    at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:278
#3  0x0000000010202f08 in virtio_pci_config_write (opaque=0x11019580,
addr=0x8, val=0xffe2, size=0x4)
    at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:416
#4  0x000000001037dca8 in memory_region_write_accessor (opaque=0x11019c78,
addr=0x8, value=0x1fffff0edca8,
    size=0x4, shift=0x0, mask=0xffffffff) at
/home/alexey/pcipassthru/qemu-impreza/memory.c:364
#5  0x000000001037ddf4 in access_with_adjusted_size (addr=0x8,
value=0x1fffff0edca8, size=0x4,
    access_size_min=0x1, access_size_max=0x4, access=
    @0x1069def8: 0x1037dbec <memory_region_write_accessor>, opaque=0x11019c78)
    at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
#6  0x000000001037e474 in memory_region_iorange_write
(iorange=0x1ffff0005430, offset=0x8, width=0x4,
    data=0xffe2) at /home/alexey/pcipassthru/qemu-impreza/memory.c:475
#7  0x00000000103750d4 in ioport_writel_thunk (opaque=0x1ffff0005430,
addr=0x48, data=0xffe2)
    at /home/alexey/pcipassthru/qemu-impreza/ioport.c:226
#8  0x0000000010374728 in ioport_write (index=0x2, address=0x48, data=0xffe2)
    at /home/alexey/pcipassthru/qemu-impreza/ioport.c:83
#9  0x0000000010375688 in cpu_outl (addr=0x48, val=0xffe2)
    at /home/alexey/pcipassthru/qemu-impreza/ioport.c:296
#10 0x00000000103583fc in spapr_io_write (opaque=0x11016a00, addr=0x48,
data=0xffe2, size=0x4)
    at /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_pci.c:468
#11 0x000000001037dca8 in memory_region_write_accessor (opaque=0x110191f8,
addr=0x48, value=0x1fffff0ee060,
    size=0x4, shift=0x0, mask=0xffffffff) at
/home/alexey/pcipassthru/qemu-impreza/memory.c:364
#12 0x000000001037ddf4 in access_with_adjusted_size (addr=0x48,
value=0x1fffff0ee060, size=0x4,
    access_size_min=0x1, access_size_max=0x4, access=
    @0x1069def8: 0x1037dbec <memory_region_write_accessor>, opaque=0x110191f8)
    at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
#13 0x0000000010380c90 in memory_region_dispatch_write (mr=0x110191f8,
addr=0x48, data=0xffe2, size=0x4)
    at /home/alexey/pcipassthru/qemu-impreza/memory.c:993
#14 0x00000000103840d8 in io_mem_write (mr=0x110191f8, addr=0x48,
val=0xe2ff0000, size=0x4)
    at /home/alexey/pcipassthru/qemu-impreza/memory.c:1696
#15 0x00000000102e4968 in stl_phys_internal (addr=0x10080000048,
val=0xe2ff0000, endian=
    DEVICE_NATIVE_ENDIAN) at /home/alexey/pcipassthru/qemu-impreza/exec.c:2447
#16 0x00000000102e4a64 in stl_phys (addr=0x10080000048, val=0xe2ff0000)
    at /home/alexey/pcipassthru/qemu-impreza/exec.c:2469
#17 0x00000000103550d8 in h_logical_store (cpu=0x1fffff0f0010,
spapr=0x10fe9510, opcode=0x40,
    args=0x1ffffffd0030) at
/home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_hcall.c:570
#18 0x0000000010355698 in spapr_hypercall (cpu=0x1fffff0f0010, opcode=0x40,
args=0x1ffffffd0030)
    at /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_hcall.c:689






On 07/19/2013 10:49 PM, Paolo Bonzini wrote:
> Il 19/07/2013 13:09, Alexey Kardashevskiy ha scritto:
>> Hi!
>>
>> This patch also breaks virtio on powerpc. I thought it was fixed
>> (reverted?) in the master branch from qemu.org but it is still there. What
>> did I miss?
> 
> It was not reverted, only the "DEVICE_LITTLE_ENDIAN" marking was.
> 
> Let me check if I can reproduce this, it looks like a endianness
> problems reading virtio-blk config space.
> 
> Paolo
> 
>> Trying to load:  from: disk ... virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> virtioblk_read: Access beyond end of device!
>> [many of those]
>>
>>
>>
>> On 07/11/2013 10:29 PM, Alexander Graf wrote:
>>>
>>> On 24.06.2013, at 08:07, Jan Kiszka wrote:
>>>
>>>> On 2013-06-23 22:50, Hervé Poussineau wrote:
>>>>> Jan Kiszka a écrit :
>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>
>>>>>> The current ioport dispatcher is a complex beast, mostly due to the
>>>>>> need to deal with old portio interface users. But we can overcome it
>>>>>> without converting all portio users by embedding the required base
>>>>>> address of a MemoryRegionPortio access into that data structure. That
>>>>>> removes the need to have the additional MemoryRegionIORange structure
>>>>>> in the loop on every access.
>>>>>>
>>>>>> To handle old portio memory ops, we simply install dispatching handlers
>>>>>> for portio memory regions when registering them with the memory core.
>>>>>> This removes the need for the old_portio field.
>>>>>>
>>>>>> We can drop the additional aliasing of ioport regions and also the
>>>>>> special address space listener. cpu_in and cpu_out now simply call
>>>>>> address_space_read/write. And we can concentrate portio handling in a
>>>>>> single source file.
>>>>>>
>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>> ---
>>>>>
>>>>> ...
>>>>>
>>>>>> +
>>>>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>>>>> +                         unsigned size)
>>>>>> +{
>>>>>> +    MemoryRegionPortioList *mrpio = opaque;
>>>>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>>>>> true);
>>>>>> +
>>>>>> +    if (mrp) {
>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>>>>> +    } else if (size == 2) {
>>>>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>>>>> +        assert(mrp);
>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>>>>> 8);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +static const MemoryRegionOps portio_ops = {
>>>>>> +    .read = portio_read,
>>>>>> +    .write = portio_write,
>>>>>> +    .valid.unaligned = true,
>>>>>> +    .impl.unaligned = true,
>>>>>> +};
>>>>>> +
>>>>>
>>>>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>>>>> In portio_write above, you clearly assume that data is in LE format.
>>>>
>>>> Anything behind PIO is little endian, of course. Will add this.
>>>
>>> This patch breaks VGA on PPC as it is in master today.
>>>
>>>
>>> Alex
>>>
>>>>
>>>>>
>>>>> This fixes PPC PReP emulation, which would otherwise be broken with this
>>>>> patchset.
>>>>
>>>> Thanks,
>>>> Jan
>>>>
>>>>
>>>
>>>
>>
>>
>
Alexey Kardashevskiy - July 20, 2013, 12:55 a.m.
On 07/20/2013 01:48 AM, Alexey Kardashevskiy wrote:
> Ok. So.
> 
> What broke is...
> I could try explaining but backtraces are lot better :)
> 
> Shortly - virtio_pci_config_ops.endianness was ignored before (was bad but
> we had a workaround in spapr_io_ops), now it works so double swap happens
> and everything gets broken.
> 
> If we talk about VGA (in powerpc, it is all about powerpc), I guess
> memory_region_iorange_write() will go through mr->ops->old_portio branch
> and won't do any byte swapping (so spapr_io_ops will do the job), so we are
> fine here. I do not understand yet why it works on mac99 though, too late
> here :)


I understood. VGA does not work for mac99 either with this command line:
./qemu-system-ppc64 -m "1024" -M "mac99" -vga "std"
So it works for pseries only because of parity bug in spapr_io_ops.

So the right fix is to get rid of spapr_io_ops and every other hack like
that and to add byte swapping to every "if (mr->ops->old_portio)" branch
(should fix VGA and all other old_portio users). Current byte swapping in
memory regions seems to be right.

I would try fixing it but since all my patches were terrible shit so far, I
won't risk :)



> h_logical_store is a hypercall for system firmware to do cache inhibited
> read/write.
> 
> 
> This is with the patch applied (git checkout  b40acf9):
> 
> 
> #0  virtqueue_init (vq=0x11014ac0) at
> /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:90
> #1  0x0000000010371f28 in virtio_queue_set_addr (vdev=0x11019dd0, n=0x0,
> addr=0xd0fb0000000)
>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:662
> #2  0x00000000102027f0 in virtio_ioport_write (opaque=0x11019580, addr=0x8,
> val=0xd0fb0000)
>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:278
> #3  0x0000000010202f08 in virtio_pci_config_write (opaque=0x11019580,
> addr=0x8, val=0xd0fb0000, size=0x4)
>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:416
> #4  0x000000001037e220 in memory_region_write_accessor (opaque=0x11019c78,
> addr=0x8, value=0x1fffff0edc00,
>     size=0x4, shift=0x0, mask=0xffffffff) at
> /home/alexey/pcipassthru/qemu-impreza/memory.c:364
> #5  0x000000001037e36c in access_with_adjusted_size (addr=0x8,
> value=0x1fffff0edc00, size=0x4,
>     access_size_min=0x1, access_size_max=0x4, access=
>     @0x1069df40: 0x1037e164 <memory_region_write_accessor>, opaque=0x11019c78)
>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
> #6  0x0000000010380b5c in memory_region_dispatch_write (mr=0x11019c78,
> addr=0x8, data=0xd0fb0000, size=0x4)
>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:905
> #7  0x0000000010383fa4 in io_mem_write (mr=0x11019c78, addr=0x8,
> val=0xfbd0, size=0x4)
>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:1608
> #8  0x00000000102e2fdc in address_space_rw (as=0x10ef4350
> <address_space_io>, addr=0x48,
>     buf=0x1fffff0edde0 "", len=0x4, is_write=0x1) at
> /home/alexey/pcipassthru/qemu-impreza/exec.c:1918
> #9  0x00000000102e33c8 in address_space_write (as=0x10ef4350
> <address_space_io>, addr=0x48,
>     buf=0x1fffff0edde0 "", len=0x4) at
> /home/alexey/pcipassthru/qemu-impreza/exec.c:1969
> #10 0x0000000010375754 in cpu_outl (addr=0x48, val=0xfbd0)
>     at /home/alexey/pcipassthru/qemu-impreza/ioport.c:309
> #11 0x0000000010358240 in spapr_io_write (opaque=0x11016a00, addr=0x48,
> data=0xfbd0, size=0x4)
>     at /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_pci.c:468
> #12 0x000000001037e220 in memory_region_write_accessor (opaque=0x110191f8,
> addr=0x48, value=0x1fffff0ee060,
>     size=0x4, shift=0x0, mask=0xffffffff) at
> /home/alexey/pcipassthru/qemu-impreza/memory.c:364
> #13 0x000000001037e36c in access_with_adjusted_size (addr=0x48,
> value=0x1fffff0ee060, size=0x4,
>     access_size_min=0x1, access_size_max=0x4, access=
>     @0x1069df40: 0x1037e164 <memory_region_write_accessor>, opaque=0x110191f8)
>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
> #14 0x0000000010380b5c in memory_region_dispatch_write (mr=0x110191f8,
> addr=0x48, data=0xfbd0, size=0x4)
>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:905
> #15 0x0000000010383fa4 in io_mem_write (mr=0x110191f8, addr=0x48,
> val=0xd0fb0000, size=0x4)
>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:1608
> #16 0x00000000102e47ac in stl_phys_internal (addr=0x10080000048,
> val=0xd0fb0000, endian=
>     DEVICE_NATIVE_ENDIAN) at /home/alexey/pcipassthru/qemu-impreza/exec.c:2420
> #17 0x00000000102e48a8 in stl_phys (addr=0x10080000048, val=0xd0fb0000)
>     at /home/alexey/pcipassthru/qemu-impreza/exec.c:2442
> #18 0x0000000010354f1c in h_logical_store (cpu=0x1fffff0f0010,
> spapr=0x10fe9510, opcode=0x40,
>     args=0x1ffffffd0030) at
> /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_hcall.c:570
> 
> 
> 
> This is without this patch (i.e. git checkout  b40acf9^ ):
> 
> #0  virtqueue_init (vq=0x11014ac0) at
> /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:90
> #1  0x00000000103720e4 in virtio_queue_set_addr (vdev=0x11019dd0, n=0x0,
> addr=0xffe2000)
>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:662
> #2  0x00000000102027f0 in virtio_ioport_write (opaque=0x11019580, addr=0x8,
> val=0xffe2)
>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:278
> #3  0x0000000010202f08 in virtio_pci_config_write (opaque=0x11019580,
> addr=0x8, val=0xffe2, size=0x4)
>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:416
> #4  0x000000001037dca8 in memory_region_write_accessor (opaque=0x11019c78,
> addr=0x8, value=0x1fffff0edca8,
>     size=0x4, shift=0x0, mask=0xffffffff) at
> /home/alexey/pcipassthru/qemu-impreza/memory.c:364
> #5  0x000000001037ddf4 in access_with_adjusted_size (addr=0x8,
> value=0x1fffff0edca8, size=0x4,
>     access_size_min=0x1, access_size_max=0x4, access=
>     @0x1069def8: 0x1037dbec <memory_region_write_accessor>, opaque=0x11019c78)
>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
> #6  0x000000001037e474 in memory_region_iorange_write
> (iorange=0x1ffff0005430, offset=0x8, width=0x4,
>     data=0xffe2) at /home/alexey/pcipassthru/qemu-impreza/memory.c:475
> #7  0x00000000103750d4 in ioport_writel_thunk (opaque=0x1ffff0005430,
> addr=0x48, data=0xffe2)
>     at /home/alexey/pcipassthru/qemu-impreza/ioport.c:226
> #8  0x0000000010374728 in ioport_write (index=0x2, address=0x48, data=0xffe2)
>     at /home/alexey/pcipassthru/qemu-impreza/ioport.c:83
> #9  0x0000000010375688 in cpu_outl (addr=0x48, val=0xffe2)
>     at /home/alexey/pcipassthru/qemu-impreza/ioport.c:296
> #10 0x00000000103583fc in spapr_io_write (opaque=0x11016a00, addr=0x48,
> data=0xffe2, size=0x4)
>     at /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_pci.c:468
> #11 0x000000001037dca8 in memory_region_write_accessor (opaque=0x110191f8,
> addr=0x48, value=0x1fffff0ee060,
>     size=0x4, shift=0x0, mask=0xffffffff) at
> /home/alexey/pcipassthru/qemu-impreza/memory.c:364
> #12 0x000000001037ddf4 in access_with_adjusted_size (addr=0x48,
> value=0x1fffff0ee060, size=0x4,
>     access_size_min=0x1, access_size_max=0x4, access=
>     @0x1069def8: 0x1037dbec <memory_region_write_accessor>, opaque=0x110191f8)
>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
> #13 0x0000000010380c90 in memory_region_dispatch_write (mr=0x110191f8,
> addr=0x48, data=0xffe2, size=0x4)
>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:993
> #14 0x00000000103840d8 in io_mem_write (mr=0x110191f8, addr=0x48,
> val=0xe2ff0000, size=0x4)
>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:1696
> #15 0x00000000102e4968 in stl_phys_internal (addr=0x10080000048,
> val=0xe2ff0000, endian=
>     DEVICE_NATIVE_ENDIAN) at /home/alexey/pcipassthru/qemu-impreza/exec.c:2447
> #16 0x00000000102e4a64 in stl_phys (addr=0x10080000048, val=0xe2ff0000)
>     at /home/alexey/pcipassthru/qemu-impreza/exec.c:2469
> #17 0x00000000103550d8 in h_logical_store (cpu=0x1fffff0f0010,
> spapr=0x10fe9510, opcode=0x40,
>     args=0x1ffffffd0030) at
> /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_hcall.c:570
> #18 0x0000000010355698 in spapr_hypercall (cpu=0x1fffff0f0010, opcode=0x40,
> args=0x1ffffffd0030)
>     at /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_hcall.c:689
> 
> 
> 
> 
> 
> 
> On 07/19/2013 10:49 PM, Paolo Bonzini wrote:
>> Il 19/07/2013 13:09, Alexey Kardashevskiy ha scritto:
>>> Hi!
>>>
>>> This patch also breaks virtio on powerpc. I thought it was fixed
>>> (reverted?) in the master branch from qemu.org but it is still there. What
>>> did I miss?
>>
>> It was not reverted, only the "DEVICE_LITTLE_ENDIAN" marking was.
>>
>> Let me check if I can reproduce this, it looks like a endianness
>> problems reading virtio-blk config space.
>>
>> Paolo
>>
>>> Trying to load:  from: disk ... virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> virtioblk_read: Access beyond end of device!
>>> [many of those]
>>>
>>>
>>>
>>> On 07/11/2013 10:29 PM, Alexander Graf wrote:
>>>>
>>>> On 24.06.2013, at 08:07, Jan Kiszka wrote:
>>>>
>>>>> On 2013-06-23 22:50, Hervé Poussineau wrote:
>>>>>> Jan Kiszka a écrit :
>>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>>
>>>>>>> The current ioport dispatcher is a complex beast, mostly due to the
>>>>>>> need to deal with old portio interface users. But we can overcome it
>>>>>>> without converting all portio users by embedding the required base
>>>>>>> address of a MemoryRegionPortio access into that data structure. That
>>>>>>> removes the need to have the additional MemoryRegionIORange structure
>>>>>>> in the loop on every access.
>>>>>>>
>>>>>>> To handle old portio memory ops, we simply install dispatching handlers
>>>>>>> for portio memory regions when registering them with the memory core.
>>>>>>> This removes the need for the old_portio field.
>>>>>>>
>>>>>>> We can drop the additional aliasing of ioport regions and also the
>>>>>>> special address space listener. cpu_in and cpu_out now simply call
>>>>>>> address_space_read/write. And we can concentrate portio handling in a
>>>>>>> single source file.
>>>>>>>
>>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>> ---
>>>>>>
>>>>>> ...
>>>>>>
>>>>>>> +
>>>>>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>>>>>> +                         unsigned size)
>>>>>>> +{
>>>>>>> +    MemoryRegionPortioList *mrpio = opaque;
>>>>>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>>>>>> true);
>>>>>>> +
>>>>>>> +    if (mrp) {
>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>>>>>> +    } else if (size == 2) {
>>>>>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>>>>>> +        assert(mrp);
>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>>>>>> 8);
>>>>>>> +    }
>>>>>>> +}
>>>>>>> +
>>>>>>> +static const MemoryRegionOps portio_ops = {
>>>>>>> +    .read = portio_read,
>>>>>>> +    .write = portio_write,
>>>>>>> +    .valid.unaligned = true,
>>>>>>> +    .impl.unaligned = true,
>>>>>>> +};
>>>>>>> +
>>>>>>
>>>>>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>>>>>> In portio_write above, you clearly assume that data is in LE format.
>>>>>
>>>>> Anything behind PIO is little endian, of course. Will add this.
>>>>
>>>> This patch breaks VGA on PPC as it is in master today.
>>>>
>>>>
>>>> Alex
>>>>
>>>>>
>>>>>>
>>>>>> This fixes PPC PReP emulation, which would otherwise be broken with this
>>>>>> patchset.
>>>>>
>>>>> Thanks,
>>>>> Jan
>>>>>
>>>>>
>>>>
>>>>
>>>
>>>
>>
> 
>
Alexey Kardashevskiy - July 20, 2013, 1:11 a.m.
On 07/20/2013 10:55 AM, Alexey Kardashevskiy wrote:
> On 07/20/2013 01:48 AM, Alexey Kardashevskiy wrote:
>> Ok. So.
>>
>> What broke is...
>> I could try explaining but backtraces are lot better :)
>>
>> Shortly - virtio_pci_config_ops.endianness was ignored before (was bad but
>> we had a workaround in spapr_io_ops), now it works so double swap happens
>> and everything gets broken.
>>
>> If we talk about VGA (in powerpc, it is all about powerpc), I guess
>> memory_region_iorange_write() will go through mr->ops->old_portio branch
>> and won't do any byte swapping (so spapr_io_ops will do the job), so we are
>> fine here. I do not understand yet why it works on mac99 though, too late
>> here :)
> 
> 
> I understood. VGA does not work for mac99 either with this command line:
> ./qemu-system-ppc64 -m "1024" -M "mac99" -vga "std"
> So it works for pseries only because of parity bug in spapr_io_ops.


oops. I am wrong and VGA works on mac99 in upstream because isa_mmio_ops
does the swapping in this case and portio_ops does not swap (in upstream).

Oh. Ah. Uh.

Adding cc:Benh...


> So the right fix is to get rid of spapr_io_ops and every other hack like
> that and to add byte swapping to every "if (mr->ops->old_portio)" branch
> (should fix VGA and all other old_portio users). Current byte swapping in
> memory regions seems to be right.
> 
> I would try fixing it but since all my patches were terrible shit so far, I
> won't risk :)
> 
> 
> 
>> h_logical_store is a hypercall for system firmware to do cache inhibited
>> read/write.
>>
>>
>> This is with the patch applied (git checkout  b40acf9):
>>
>>
>> #0  virtqueue_init (vq=0x11014ac0) at
>> /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:90
>> #1  0x0000000010371f28 in virtio_queue_set_addr (vdev=0x11019dd0, n=0x0,
>> addr=0xd0fb0000000)
>>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:662
>> #2  0x00000000102027f0 in virtio_ioport_write (opaque=0x11019580, addr=0x8,
>> val=0xd0fb0000)
>>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:278
>> #3  0x0000000010202f08 in virtio_pci_config_write (opaque=0x11019580,
>> addr=0x8, val=0xd0fb0000, size=0x4)
>>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:416
>> #4  0x000000001037e220 in memory_region_write_accessor (opaque=0x11019c78,
>> addr=0x8, value=0x1fffff0edc00,
>>     size=0x4, shift=0x0, mask=0xffffffff) at
>> /home/alexey/pcipassthru/qemu-impreza/memory.c:364
>> #5  0x000000001037e36c in access_with_adjusted_size (addr=0x8,
>> value=0x1fffff0edc00, size=0x4,
>>     access_size_min=0x1, access_size_max=0x4, access=
>>     @0x1069df40: 0x1037e164 <memory_region_write_accessor>, opaque=0x11019c78)
>>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
>> #6  0x0000000010380b5c in memory_region_dispatch_write (mr=0x11019c78,
>> addr=0x8, data=0xd0fb0000, size=0x4)
>>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:905
>> #7  0x0000000010383fa4 in io_mem_write (mr=0x11019c78, addr=0x8,
>> val=0xfbd0, size=0x4)
>>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:1608
>> #8  0x00000000102e2fdc in address_space_rw (as=0x10ef4350
>> <address_space_io>, addr=0x48,
>>     buf=0x1fffff0edde0 "", len=0x4, is_write=0x1) at
>> /home/alexey/pcipassthru/qemu-impreza/exec.c:1918
>> #9  0x00000000102e33c8 in address_space_write (as=0x10ef4350
>> <address_space_io>, addr=0x48,
>>     buf=0x1fffff0edde0 "", len=0x4) at
>> /home/alexey/pcipassthru/qemu-impreza/exec.c:1969
>> #10 0x0000000010375754 in cpu_outl (addr=0x48, val=0xfbd0)
>>     at /home/alexey/pcipassthru/qemu-impreza/ioport.c:309
>> #11 0x0000000010358240 in spapr_io_write (opaque=0x11016a00, addr=0x48,
>> data=0xfbd0, size=0x4)
>>     at /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_pci.c:468
>> #12 0x000000001037e220 in memory_region_write_accessor (opaque=0x110191f8,
>> addr=0x48, value=0x1fffff0ee060,
>>     size=0x4, shift=0x0, mask=0xffffffff) at
>> /home/alexey/pcipassthru/qemu-impreza/memory.c:364
>> #13 0x000000001037e36c in access_with_adjusted_size (addr=0x48,
>> value=0x1fffff0ee060, size=0x4,
>>     access_size_min=0x1, access_size_max=0x4, access=
>>     @0x1069df40: 0x1037e164 <memory_region_write_accessor>, opaque=0x110191f8)
>>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
>> #14 0x0000000010380b5c in memory_region_dispatch_write (mr=0x110191f8,
>> addr=0x48, data=0xfbd0, size=0x4)
>>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:905
>> #15 0x0000000010383fa4 in io_mem_write (mr=0x110191f8, addr=0x48,
>> val=0xd0fb0000, size=0x4)
>>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:1608
>> #16 0x00000000102e47ac in stl_phys_internal (addr=0x10080000048,
>> val=0xd0fb0000, endian=
>>     DEVICE_NATIVE_ENDIAN) at /home/alexey/pcipassthru/qemu-impreza/exec.c:2420
>> #17 0x00000000102e48a8 in stl_phys (addr=0x10080000048, val=0xd0fb0000)
>>     at /home/alexey/pcipassthru/qemu-impreza/exec.c:2442
>> #18 0x0000000010354f1c in h_logical_store (cpu=0x1fffff0f0010,
>> spapr=0x10fe9510, opcode=0x40,
>>     args=0x1ffffffd0030) at
>> /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_hcall.c:570
>>
>>
>>
>> This is without this patch (i.e. git checkout  b40acf9^ ):
>>
>> #0  virtqueue_init (vq=0x11014ac0) at
>> /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:90
>> #1  0x00000000103720e4 in virtio_queue_set_addr (vdev=0x11019dd0, n=0x0,
>> addr=0xffe2000)
>>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio.c:662
>> #2  0x00000000102027f0 in virtio_ioport_write (opaque=0x11019580, addr=0x8,
>> val=0xffe2)
>>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:278
>> #3  0x0000000010202f08 in virtio_pci_config_write (opaque=0x11019580,
>> addr=0x8, val=0xffe2, size=0x4)
>>     at /home/alexey/pcipassthru/qemu-impreza/hw/virtio/virtio-pci.c:416
>> #4  0x000000001037dca8 in memory_region_write_accessor (opaque=0x11019c78,
>> addr=0x8, value=0x1fffff0edca8,
>>     size=0x4, shift=0x0, mask=0xffffffff) at
>> /home/alexey/pcipassthru/qemu-impreza/memory.c:364
>> #5  0x000000001037ddf4 in access_with_adjusted_size (addr=0x8,
>> value=0x1fffff0edca8, size=0x4,
>>     access_size_min=0x1, access_size_max=0x4, access=
>>     @0x1069def8: 0x1037dbec <memory_region_write_accessor>, opaque=0x11019c78)
>>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
>> #6  0x000000001037e474 in memory_region_iorange_write
>> (iorange=0x1ffff0005430, offset=0x8, width=0x4,
>>     data=0xffe2) at /home/alexey/pcipassthru/qemu-impreza/memory.c:475
>> #7  0x00000000103750d4 in ioport_writel_thunk (opaque=0x1ffff0005430,
>> addr=0x48, data=0xffe2)
>>     at /home/alexey/pcipassthru/qemu-impreza/ioport.c:226
>> #8  0x0000000010374728 in ioport_write (index=0x2, address=0x48, data=0xffe2)
>>     at /home/alexey/pcipassthru/qemu-impreza/ioport.c:83
>> #9  0x0000000010375688 in cpu_outl (addr=0x48, val=0xffe2)
>>     at /home/alexey/pcipassthru/qemu-impreza/ioport.c:296
>> #10 0x00000000103583fc in spapr_io_write (opaque=0x11016a00, addr=0x48,
>> data=0xffe2, size=0x4)
>>     at /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_pci.c:468
>> #11 0x000000001037dca8 in memory_region_write_accessor (opaque=0x110191f8,
>> addr=0x48, value=0x1fffff0ee060,
>>     size=0x4, shift=0x0, mask=0xffffffff) at
>> /home/alexey/pcipassthru/qemu-impreza/memory.c:364
>> #12 0x000000001037ddf4 in access_with_adjusted_size (addr=0x48,
>> value=0x1fffff0ee060, size=0x4,
>>     access_size_min=0x1, access_size_max=0x4, access=
>>     @0x1069def8: 0x1037dbec <memory_region_write_accessor>, opaque=0x110191f8)
>>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:396
>> #13 0x0000000010380c90 in memory_region_dispatch_write (mr=0x110191f8,
>> addr=0x48, data=0xffe2, size=0x4)
>>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:993
>> #14 0x00000000103840d8 in io_mem_write (mr=0x110191f8, addr=0x48,
>> val=0xe2ff0000, size=0x4)
>>     at /home/alexey/pcipassthru/qemu-impreza/memory.c:1696
>> #15 0x00000000102e4968 in stl_phys_internal (addr=0x10080000048,
>> val=0xe2ff0000, endian=
>>     DEVICE_NATIVE_ENDIAN) at /home/alexey/pcipassthru/qemu-impreza/exec.c:2447
>> #16 0x00000000102e4a64 in stl_phys (addr=0x10080000048, val=0xe2ff0000)
>>     at /home/alexey/pcipassthru/qemu-impreza/exec.c:2469
>> #17 0x00000000103550d8 in h_logical_store (cpu=0x1fffff0f0010,
>> spapr=0x10fe9510, opcode=0x40,
>>     args=0x1ffffffd0030) at
>> /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_hcall.c:570
>> #18 0x0000000010355698 in spapr_hypercall (cpu=0x1fffff0f0010, opcode=0x40,
>> args=0x1ffffffd0030)
>>     at /home/alexey/pcipassthru/qemu-impreza/hw/ppc/spapr_hcall.c:689
>>
>>
>>
>>
>>
>>
>> On 07/19/2013 10:49 PM, Paolo Bonzini wrote:
>>> Il 19/07/2013 13:09, Alexey Kardashevskiy ha scritto:
>>>> Hi!
>>>>
>>>> This patch also breaks virtio on powerpc. I thought it was fixed
>>>> (reverted?) in the master branch from qemu.org but it is still there. What
>>>> did I miss?
>>>
>>> It was not reverted, only the "DEVICE_LITTLE_ENDIAN" marking was.
>>>
>>> Let me check if I can reproduce this, it looks like a endianness
>>> problems reading virtio-blk config space.
>>>
>>> Paolo
>>>
>>>> Trying to load:  from: disk ... virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> virtioblk_read: Access beyond end of device!
>>>> [many of those]
>>>>
>>>>
>>>>
>>>> On 07/11/2013 10:29 PM, Alexander Graf wrote:
>>>>>
>>>>> On 24.06.2013, at 08:07, Jan Kiszka wrote:
>>>>>
>>>>>> On 2013-06-23 22:50, Hervé Poussineau wrote:
>>>>>>> Jan Kiszka a écrit :
>>>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>>>
>>>>>>>> The current ioport dispatcher is a complex beast, mostly due to the
>>>>>>>> need to deal with old portio interface users. But we can overcome it
>>>>>>>> without converting all portio users by embedding the required base
>>>>>>>> address of a MemoryRegionPortio access into that data structure. That
>>>>>>>> removes the need to have the additional MemoryRegionIORange structure
>>>>>>>> in the loop on every access.
>>>>>>>>
>>>>>>>> To handle old portio memory ops, we simply install dispatching handlers
>>>>>>>> for portio memory regions when registering them with the memory core.
>>>>>>>> This removes the need for the old_portio field.
>>>>>>>>
>>>>>>>> We can drop the additional aliasing of ioport regions and also the
>>>>>>>> special address space listener. cpu_in and cpu_out now simply call
>>>>>>>> address_space_read/write. And we can concentrate portio handling in a
>>>>>>>> single source file.
>>>>>>>>
>>>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>>> ---
>>>>>>>
>>>>>>> ...
>>>>>>>
>>>>>>>> +
>>>>>>>> +static void portio_write(void *opaque, hwaddr addr, uint64_t data,
>>>>>>>> +                         unsigned size)
>>>>>>>> +{
>>>>>>>> +    MemoryRegionPortioList *mrpio = opaque;
>>>>>>>> +    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size,
>>>>>>>> true);
>>>>>>>> +
>>>>>>>> +    if (mrp) {
>>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
>>>>>>>> +    } else if (size == 2) {
>>>>>>>> +        mrp = find_portio(mrpio, addr, 1, true);
>>>>>>>> +        assert(mrp);
>>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
>>>>>>>> +        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data
>>>>>>>>>> 8);
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static const MemoryRegionOps portio_ops = {
>>>>>>>> +    .read = portio_read,
>>>>>>>> +    .write = portio_write,
>>>>>>>> +    .valid.unaligned = true,
>>>>>>>> +    .impl.unaligned = true,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>
>>>>>>> You need to mark these operations as DEVICE_LITTLE_ENDIAN.
>>>>>>> In portio_write above, you clearly assume that data is in LE format.
>>>>>>
>>>>>> Anything behind PIO is little endian, of course. Will add this.
>>>>>
>>>>> This patch breaks VGA on PPC as it is in master today.
>>>>>
>>>>>
>>>>> Alex
>>>>>
>>>>>>
>>>>>>>
>>>>>>> This fixes PPC PReP emulation, which would otherwise be broken with this
>>>>>>> patchset.
>>>>>>
>>>>>> Thanks,
>>>>>> Jan
>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>
>>
>>
> 
>
Paolo Bonzini - July 20, 2013, 10:11 a.m.
Il 20/07/2013 03:11, Alexey Kardashevskiy ha scritto:
> On 07/20/2013 10:55 AM, Alexey Kardashevskiy wrote:
>> On 07/20/2013 01:48 AM, Alexey Kardashevskiy wrote:
>>> Ok. So.
>>>
>>> What broke is...
>>> I could try explaining but backtraces are lot better :)
>>>
>>> Shortly - virtio_pci_config_ops.endianness was ignored before (was bad but
>>> we had a workaround in spapr_io_ops), now it works so double swap happens
>>> and everything gets broken.
>>>
>>> If we talk about VGA (in powerpc, it is all about powerpc), I guess
>>> memory_region_iorange_write() will go through mr->ops->old_portio branch
>>> and won't do any byte swapping (so spapr_io_ops will do the job), so we are
>>> fine here. I do not understand yet why it works on mac99 though, too late
>>> here :)
>>
>> I understood. VGA does not work for mac99 either with this command line:
>> ./qemu-system-ppc64 -m "1024" -M "mac99" -vga "std"
>> So it works for pseries only because of parity bug in spapr_io_ops.

Yes, this is what Herve was fixing for PREP too.

> oops. I am wrong and VGA works on mac99 in upstream because isa_mmio_ops
> does the swapping in this case and portio_ops does not swap (in upstream).

Uff... I guess we have to look at all cases for big-endian machines, and
make sure there is an odd number of exchanges.  Aurelien/Alex/Herve,
where can I find images for mips malta, mips jazz, ppc4xx and prep?
Edgar, what about sh4eb (IIRC one of the two machine types supported PCI)?

Also, is all the firmware included in pc-bios/?

For g3beige and mac99 I can use mintppc, I think, that's what I
installed on my old G4 PowerBook.

Paolo
Edgar Iglesias - July 20, 2013, 8:53 p.m.
On Sat, Jul 20, 2013 at 12:11:34PM +0200, Paolo Bonzini wrote:
> Il 20/07/2013 03:11, Alexey Kardashevskiy ha scritto:
> > On 07/20/2013 10:55 AM, Alexey Kardashevskiy wrote:
> >> On 07/20/2013 01:48 AM, Alexey Kardashevskiy wrote:
> >>> Ok. So.
> >>>
> >>> What broke is...
> >>> I could try explaining but backtraces are lot better :)
> >>>
> >>> Shortly - virtio_pci_config_ops.endianness was ignored before (was bad but
> >>> we had a workaround in spapr_io_ops), now it works so double swap happens
> >>> and everything gets broken.
> >>>
> >>> If we talk about VGA (in powerpc, it is all about powerpc), I guess
> >>> memory_region_iorange_write() will go through mr->ops->old_portio branch
> >>> and won't do any byte swapping (so spapr_io_ops will do the job), so we are
> >>> fine here. I do not understand yet why it works on mac99 though, too late
> >>> here :)
> >>
> >> I understood. VGA does not work for mac99 either with this command line:
> >> ./qemu-system-ppc64 -m "1024" -M "mac99" -vga "std"
> >> So it works for pseries only because of parity bug in spapr_io_ops.
> 
> Yes, this is what Herve was fixing for PREP too.
> 
> > oops. I am wrong and VGA works on mac99 in upstream because isa_mmio_ops
> > does the swapping in this case and portio_ops does not swap (in upstream).
> 
> Uff... I guess we have to look at all cases for big-endian machines, and
> make sure there is an odd number of exchanges.  Aurelien/Alex/Herve,
> where can I find images for mips malta, mips jazz, ppc4xx and prep?
> Edgar, what about sh4eb (IIRC one of the two machine types supported PCI)?

Sorry, I don't know (I haven't done much SH).

Best regards,
Edgar
Hervé Poussineau - July 21, 2013, 3:13 p.m.
Paolo Bonzini a écrit :
  >> oops. I am wrong and VGA works on mac99 in upstream because 
isa_mmio_ops
>> does the swapping in this case and portio_ops does not swap (in upstream).
> 
> Uff... I guess we have to look at all cases for big-endian machines, and
> make sure there is an odd number of exchanges.  Aurelien/Alex/Herve,
> where can I find images for mips malta, mips jazz, ppc4xx and prep?
> Edgar, what about sh4eb (IIRC one of the two machine types supported PCI)?

For PReP, you can find a kernel at 
http://www.juneau-lug.org/zImage.initrd.sandalfoot
Run it with qemu-system-ppc -M prep -kernel zImage.initrd.sandalfoot

Hervé
Paolo Bonzini - July 22, 2013, 10:25 a.m.
Il 21/07/2013 17:13, Hervé Poussineau ha scritto:
> Paolo Bonzini a écrit :
>  >> oops. I am wrong and VGA works on mac99 in upstream because
> isa_mmio_ops
>>> does the swapping in this case and portio_ops does not swap (in
>>> upstream).
>>
>> Uff... I guess we have to look at all cases for big-endian machines, and
>> make sure there is an odd number of exchanges.  Aurelien/Alex/Herve,
>> where can I find images for mips malta, mips jazz, ppc4xx and prep?
>> Edgar, what about sh4eb (IIRC one of the two machine types supported
>> PCI)?
> 
> For PReP, you can find a kernel at
> http://www.juneau-lug.org/zImage.initrd.sandalfoot
> Run it with qemu-system-ppc -M prep -kernel zImage.initrd.sandalfoot

Thanks.  I can also use qtest to fix bugs, but this will help making a
more complete test.

Paolo

Patch

diff --git a/exec.c b/exec.c
index 0b0118b..87c7ef3 100644
--- a/exec.c
+++ b/exec.c
@@ -1760,26 +1760,6 @@  static void core_log_global_stop(MemoryListener *listener)
     cpu_physical_memory_set_dirty_tracking(0);
 }
 
-static void io_region_add(MemoryListener *listener,
-                          MemoryRegionSection *section)
-{
-    MemoryRegionIORange *mrio = g_new(MemoryRegionIORange, 1);
-
-    mrio->mr = section->mr;
-    mrio->offset = section->offset_within_region;
-    iorange_init(&mrio->iorange, &memory_region_iorange_ops,
-                 section->offset_within_address_space,
-                 int128_get64(section->size));
-    ioport_register(&mrio->iorange);
-}
-
-static void io_region_del(MemoryListener *listener,
-                          MemoryRegionSection *section)
-{
-    isa_unassign_ioport(section->offset_within_address_space,
-                        int128_get64(section->size));
-}
-
 static MemoryListener core_memory_listener = {
     .begin = core_begin,
     .log_global_start = core_log_global_start,
@@ -1787,12 +1767,6 @@  static MemoryListener core_memory_listener = {
     .priority = 1,
 };
 
-static MemoryListener io_memory_listener = {
-    .region_add = io_region_add,
-    .region_del = io_region_del,
-    .priority = 0,
-};
-
 static MemoryListener tcg_memory_listener = {
     .commit = tcg_commit,
 };
@@ -1834,7 +1808,6 @@  static void memory_map_init(void)
     address_space_init(&address_space_io, system_io, "I/O");
 
     memory_listener_register(&core_memory_listener, &address_space_memory);
-    memory_listener_register(&io_memory_listener, &address_space_io);
     memory_listener_register(&tcg_memory_listener, &address_space_memory);
 }
 
diff --git a/include/exec/ioport.h b/include/exec/ioport.h
index eb99ffe..b476857 100644
--- a/include/exec/ioport.h
+++ b/include/exec/ioport.h
@@ -56,7 +56,6 @@  typedef struct PortioList {
     struct MemoryRegion *address_space;
     unsigned nr;
     struct MemoryRegion **regions;
-    struct MemoryRegion **aliases;
     void *opaque;
     const char *name;
 } PortioList;
diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h
index 26689fe..d0e0633 100644
--- a/include/exec/memory-internal.h
+++ b/include/exec/memory-internal.h
@@ -119,8 +119,6 @@  static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start,
 void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
                                      int dirty_flags);
 
-extern const IORangeOps memory_region_iorange_ops;
-
 #endif
 
 #endif
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 3598c4f..7c1e6da 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -124,10 +124,6 @@  struct MemoryRegionOps {
          bool unaligned;
     } impl;
 
-    /* If .read and .write are not present, old_portio may be used for
-     * backwards compatibility with old portio registration
-     */
-    const MemoryRegionPortio *old_portio;
     /* If .read and .write are not present, old_mmio may be used for
      * backwards compatibility with old mmio registration
      */
@@ -183,6 +179,7 @@  struct MemoryRegionPortio {
     unsigned size;
     IOPortReadFunc *read;
     IOPortWriteFunc *write;
+    uint32_t base; /* private field */
 };
 
 #define PORTIO_END_OF_LIST() { }
diff --git a/ioport.c b/ioport.c
index 56470c5..e34f6b2 100644
--- a/ioport.c
+++ b/ioport.c
@@ -28,6 +28,7 @@ 
 #include "exec/ioport.h"
 #include "trace.h"
 #include "exec/memory.h"
+#include "exec/address-spaces.h"
 
 /***********************************************************/
 /* IO Port */
@@ -47,6 +48,12 @@ 
 #  define LOG_IOPORT(...) do { } while (0)
 #endif
 
+typedef struct MemoryRegionPortioList {
+    MemoryRegion mr;
+    void *portio_opaque;
+    MemoryRegionPortio ports[];
+} MemoryRegionPortioList;
+
 /* XXX: use a two level table to limit memory usage */
 
 static void *ioport_opaque[MAX_IOPORTS];
@@ -279,27 +286,34 @@  void cpu_outb(pio_addr_t addr, uint8_t val)
 {
     LOG_IOPORT("outb: %04"FMT_pioaddr" %02"PRIx8"\n", addr, val);
     trace_cpu_out(addr, val);
-    ioport_write(0, addr, val);
+    address_space_write(&address_space_io, addr, &val, 1);
 }
 
 void cpu_outw(pio_addr_t addr, uint16_t val)
 {
+    uint8_t buf[2];
+
     LOG_IOPORT("outw: %04"FMT_pioaddr" %04"PRIx16"\n", addr, val);
     trace_cpu_out(addr, val);
-    ioport_write(1, addr, val);
+    stw_p(buf, val);
+    address_space_write(&address_space_io, addr, buf, 2);
 }
 
 void cpu_outl(pio_addr_t addr, uint32_t val)
 {
+    uint8_t buf[4];
+
     LOG_IOPORT("outl: %04"FMT_pioaddr" %08"PRIx32"\n", addr, val);
     trace_cpu_out(addr, val);
-    ioport_write(2, addr, val);
+    stl_p(buf, val);
+    address_space_write(&address_space_io, addr, buf, 4);
 }
 
 uint8_t cpu_inb(pio_addr_t addr)
 {
     uint8_t val;
-    val = ioport_read(0, addr);
+
+    address_space_read(&address_space_io, addr, &val, 1);
     trace_cpu_in(addr, val);
     LOG_IOPORT("inb : %04"FMT_pioaddr" %02"PRIx8"\n", addr, val);
     return val;
@@ -307,8 +321,11 @@  uint8_t cpu_inb(pio_addr_t addr)
 
 uint16_t cpu_inw(pio_addr_t addr)
 {
+    uint8_t buf[2];
     uint16_t val;
-    val = ioport_read(1, addr);
+
+    address_space_read(&address_space_io, addr, buf, 2);
+    val = lduw_p(buf);
     trace_cpu_in(addr, val);
     LOG_IOPORT("inw : %04"FMT_pioaddr" %04"PRIx16"\n", addr, val);
     return val;
@@ -316,8 +333,11 @@  uint16_t cpu_inw(pio_addr_t addr)
 
 uint32_t cpu_inl(pio_addr_t addr)
 {
+    uint8_t buf[4];
     uint32_t val;
-    val = ioport_read(2, addr);
+
+    address_space_read(&address_space_io, addr, buf, 4);
+    val = ldl_p(buf);
     trace_cpu_in(addr, val);
     LOG_IOPORT("inl : %04"FMT_pioaddr" %08"PRIx32"\n", addr, val);
     return val;
@@ -336,7 +356,6 @@  void portio_list_init(PortioList *piolist,
     piolist->ports = callbacks;
     piolist->nr = 0;
     piolist->regions = g_new0(MemoryRegion *, n);
-    piolist->aliases = g_new0(MemoryRegion *, n);
     piolist->address_space = NULL;
     piolist->opaque = opaque;
     piolist->name = name;
@@ -345,46 +364,95 @@  void portio_list_init(PortioList *piolist,
 void portio_list_destroy(PortioList *piolist)
 {
     g_free(piolist->regions);
-    g_free(piolist->aliases);
 }
 
+static const MemoryRegionPortio *find_portio(MemoryRegionPortioList *mrpio,
+                                             uint64_t offset, unsigned size,
+                                             bool write)
+{
+    const MemoryRegionPortio *mrp;
+
+    for (mrp = mrpio->ports; mrp->size; ++mrp) {
+        if (offset >= mrp->offset && offset < mrp->offset + mrp->len &&
+            size == mrp->size &&
+            (write ? (bool)mrp->write : (bool)mrp->read)) {
+            return mrp;
+        }
+    }
+    return NULL;
+}
+
+static uint64_t portio_read(void *opaque, hwaddr addr, unsigned size)
+{
+    MemoryRegionPortioList *mrpio = opaque;
+    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size, false);
+    uint64_t data;
+
+    data = ((uint64_t)1 << (size * 8)) - 1;
+    if (mrp) {
+        data = mrp->read(mrpio->portio_opaque, mrp->base + addr);
+    } else if (size == 2) {
+        mrp = find_portio(mrpio, addr, 1, false);
+        assert(mrp);
+        data = mrp->read(mrpio->portio_opaque, mrp->base + addr) |
+                (mrp->read(mrpio->portio_opaque, mrp->base + addr + 1) << 8);
+    }
+    return data;
+}
+
+static void portio_write(void *opaque, hwaddr addr, uint64_t data,
+                         unsigned size)
+{
+    MemoryRegionPortioList *mrpio = opaque;
+    const MemoryRegionPortio *mrp = find_portio(mrpio, addr, size, true);
+
+    if (mrp) {
+        mrp->write(mrpio->portio_opaque, mrp->base + addr, data);
+    } else if (size == 2) {
+        mrp = find_portio(mrpio, addr, 1, true);
+        assert(mrp);
+        mrp->write(mrpio->portio_opaque, mrp->base + addr, data & 0xff);
+        mrp->write(mrpio->portio_opaque, mrp->base + addr + 1, data >> 8);
+    }
+}
+
+static const MemoryRegionOps portio_ops = {
+    .read = portio_read,
+    .write = portio_write,
+    .valid.unaligned = true,
+    .impl.unaligned = true,
+};
+
 static void portio_list_add_1(PortioList *piolist,
                               const MemoryRegionPortio *pio_init,
                               unsigned count, unsigned start,
                               unsigned off_low, unsigned off_high)
 {
-    MemoryRegionPortio *pio;
-    MemoryRegionOps *ops;
-    MemoryRegion *region, *alias;
+    MemoryRegionPortioList *mrpio;
     unsigned i;
 
     /* Copy the sub-list and null-terminate it.  */
-    pio = g_new(MemoryRegionPortio, count + 1);
-    memcpy(pio, pio_init, sizeof(MemoryRegionPortio) * count);
-    memset(pio + count, 0, sizeof(MemoryRegionPortio));
+    mrpio = g_malloc0(sizeof(MemoryRegionPortioList) +
+                      sizeof(MemoryRegionPortio) * (count + 1));
+    mrpio->portio_opaque = piolist->opaque;
+    memcpy(mrpio->ports, pio_init, sizeof(MemoryRegionPortio) * count);
+    memset(mrpio->ports + count, 0, sizeof(MemoryRegionPortio));
 
     /* Adjust the offsets to all be zero-based for the region.  */
     for (i = 0; i < count; ++i) {
-        pio[i].offset -= off_low;
+        mrpio->ports[i].offset -= off_low;
+        mrpio->ports[i].base = start + off_low;
     }
 
-    ops = g_new0(MemoryRegionOps, 1);
-    ops->old_portio = pio;
-
-    region = g_new(MemoryRegion, 1);
-    alias = g_new(MemoryRegion, 1);
     /*
      * Use an alias so that the callback is called with an absolute address,
      * rather than an offset relative to to start + off_low.
      */
-    memory_region_init_io(region, ops, piolist->opaque, piolist->name,
-                          INT64_MAX);
-    memory_region_init_alias(alias, piolist->name,
-                             region, start + off_low, off_high - off_low);
+    memory_region_init_io(&mrpio->mr, &portio_ops, mrpio, piolist->name,
+                          off_high - off_low);
     memory_region_add_subregion(piolist->address_space,
-                                start + off_low, alias);
-    piolist->regions[piolist->nr] = region;
-    piolist->aliases[piolist->nr] = alias;
+                                start + off_low, &mrpio->mr);
+    piolist->regions[piolist->nr] = &mrpio->mr;
     ++piolist->nr;
 }
 
@@ -427,19 +495,14 @@  void portio_list_add(PortioList *piolist,
 
 void portio_list_del(PortioList *piolist)
 {
-    MemoryRegion *mr, *alias;
+    MemoryRegionPortioList *mrpio;
     unsigned i;
 
     for (i = 0; i < piolist->nr; ++i) {
-        mr = piolist->regions[i];
-        alias = piolist->aliases[i];
-        memory_region_del_subregion(piolist->address_space, alias);
-        memory_region_destroy(alias);
-        memory_region_destroy(mr);
-        g_free((MemoryRegionOps *)mr->ops);
-        g_free(mr);
-        g_free(alias);
+        mrpio = container_of(piolist->regions[i], MemoryRegionPortioList, mr);
+        memory_region_del_subregion(piolist->address_space, &mrpio->mr);
+        memory_region_destroy(&mrpio->mr);
+        g_free(mrpio);
         piolist->regions[i] = NULL;
-        piolist->aliases[i] = NULL;
     }
 }
diff --git a/memory.c b/memory.c
index 47b005a..df07b24 100644
--- a/memory.c
+++ b/memory.c
@@ -401,94 +401,6 @@  static void access_with_adjusted_size(hwaddr addr,
     }
 }
 
-static const MemoryRegionPortio *find_portio(MemoryRegion *mr, uint64_t offset,
-                                             unsigned width, bool write)
-{
-    const MemoryRegionPortio *mrp;
-
-    for (mrp = mr->ops->old_portio; mrp->size; ++mrp) {
-        if (offset >= mrp->offset && offset < mrp->offset + mrp->len
-            && width == mrp->size
-            && (write ? (bool)mrp->write : (bool)mrp->read)) {
-            return mrp;
-        }
-    }
-    return NULL;
-}
-
-static void memory_region_iorange_read(IORange *iorange,
-                                       uint64_t offset,
-                                       unsigned width,
-                                       uint64_t *data)
-{
-    MemoryRegionIORange *mrio
-        = container_of(iorange, MemoryRegionIORange, iorange);
-    MemoryRegion *mr = mrio->mr;
-
-    offset += mrio->offset;
-    if (mr->ops->old_portio) {
-        const MemoryRegionPortio *mrp = find_portio(mr, offset - mrio->offset,
-                                                    width, false);
-
-        *data = ((uint64_t)1 << (width * 8)) - 1;
-        if (mrp) {
-            *data = mrp->read(mr->opaque, offset);
-        } else if (width == 2) {
-            mrp = find_portio(mr, offset - mrio->offset, 1, false);
-            assert(mrp);
-            *data = mrp->read(mr->opaque, offset) |
-                    (mrp->read(mr->opaque, offset + 1) << 8);
-        }
-        return;
-    }
-    *data = 0;
-    access_with_adjusted_size(offset, data, width,
-                              mr->ops->impl.min_access_size,
-                              mr->ops->impl.max_access_size,
-                              memory_region_read_accessor, mr);
-}
-
-static void memory_region_iorange_write(IORange *iorange,
-                                        uint64_t offset,
-                                        unsigned width,
-                                        uint64_t data)
-{
-    MemoryRegionIORange *mrio
-        = container_of(iorange, MemoryRegionIORange, iorange);
-    MemoryRegion *mr = mrio->mr;
-
-    offset += mrio->offset;
-    if (mr->ops->old_portio) {
-        const MemoryRegionPortio *mrp = find_portio(mr, offset - mrio->offset,
-                                                    width, true);
-
-        if (mrp) {
-            mrp->write(mr->opaque, offset, data);
-        } else if (width == 2) {
-            mrp = find_portio(mr, offset - mrio->offset, 1, true);
-            assert(mrp);
-            mrp->write(mr->opaque, offset, data & 0xff);
-            mrp->write(mr->opaque, offset + 1, data >> 8);
-        }
-        return;
-    }
-    access_with_adjusted_size(offset, &data, width,
-                              mr->ops->impl.min_access_size,
-                              mr->ops->impl.max_access_size,
-                              memory_region_write_accessor, mr);
-}
-
-static void memory_region_iorange_destructor(IORange *iorange)
-{
-    g_free(container_of(iorange, MemoryRegionIORange, iorange));
-}
-
-const IORangeOps memory_region_iorange_ops = {
-    .read = memory_region_iorange_read,
-    .write = memory_region_iorange_write,
-    .destructor = memory_region_iorange_destructor,
-};
-
 static AddressSpace *memory_region_to_address_space(MemoryRegion *mr)
 {
     AddressSpace *as;