diff mbox

virtio-pci: implement cfg capability

Message ID 1435842022-13980-1-git-send-email-mst@redhat.com
State New
Headers show

Commit Message

Michael S. Tsirkin July 2, 2015, 1 p.m. UTC
spec says we must, so let's do it!

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

untested at this point.
Paolo, could you pls take a look and confirm this
memory API usage is ok?

 hw/virtio/virtio-pci.h |  3 ++
 hw/virtio/virtio-pci.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 78 insertions(+), 1 deletion(-)

Comments

Paolo Bonzini July 2, 2015, 6:48 p.m. UTC | #1
On 02/07/2015 15:00, Michael S. Tsirkin wrote:
> +        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
> +        off = le32_to_cpu(cfg->cap.offset);
> +        len = le32_to_cpu(cfg->cap.length);
> +
> +        if ((len == 1 || len == 2 || len == 4)) {
> +            address_space_write(&proxy->modern_as, off,
> +                                MEMTXATTRS_UNSPECIFIED,
> +                                cfg->pci_cfg_data, len);
> +        }

This parses pci_cfg_data in target endianness I think.  You just want to
move the little-endian value from the config cap to the little-endian
value in the modern_as, so you need to use ldl_le_p and
address_space_stl_le.

> +    }
> +}
> +
> +static uint32_t virtio_read_config(PCIDevice *pci_dev,
> +                                   uint32_t address, int len)
> +{
> +    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
> +    struct virtio_pci_cfg_cap *cfg;
> +
> +    if (proxy->config_cap &&
> +        ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap,
> +                                                                  pci_cfg_data),
> +                       sizeof cfg->pci_cfg_data)) {
> +        uint32_t off;
> +        uint32_t len;
> +
> +        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
> +        off = le32_to_cpu(cfg->cap.offset);
> +        len = le32_to_cpu(cfg->cap.length);
> +
> +        if ((len == 1 || len == 2 || len == 4)) {
> +            address_space_read(&proxy->modern_as, off,
> +                                MEMTXATTRS_UNSPECIFIED,
> +                                cfg->pci_cfg_data, len);

Same here, use address_space_ldl_le to read into an int, and stl_le_p to
write into cfg->pci_cfg_data.

The best way to check it, of course, is to write a unit test! :)  But
you could also use a Linux BE guest on LE host.

Everything else looks good.

Paolo
Michael S. Tsirkin July 2, 2015, 7 p.m. UTC | #2
On Thu, Jul 02, 2015 at 08:48:14PM +0200, Paolo Bonzini wrote:
> 
> 
> On 02/07/2015 15:00, Michael S. Tsirkin wrote:
> > +        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
> > +        off = le32_to_cpu(cfg->cap.offset);
> > +        len = le32_to_cpu(cfg->cap.length);
> > +
> > +        if ((len == 1 || len == 2 || len == 4)) {
> > +            address_space_write(&proxy->modern_as, off,
> > +                                MEMTXATTRS_UNSPECIFIED,
> > +                                cfg->pci_cfg_data, len);
> > +        }
> 
> This parses pci_cfg_data in target endianness I think.  You just want to
> move the little-endian value from the config cap to the little-endian
> value in the modern_as, so you need to use ldl_le_p and
> address_space_stl_le.

but isn't that two byteswaps? why isn't this same as memcpy?

> > +    }
> > +}
> > +
> > +static uint32_t virtio_read_config(PCIDevice *pci_dev,
> > +                                   uint32_t address, int len)
> > +{
> > +    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
> > +    struct virtio_pci_cfg_cap *cfg;
> > +
> > +    if (proxy->config_cap &&
> > +        ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap,
> > +                                                                  pci_cfg_data),
> > +                       sizeof cfg->pci_cfg_data)) {
> > +        uint32_t off;
> > +        uint32_t len;
> > +
> > +        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
> > +        off = le32_to_cpu(cfg->cap.offset);
> > +        len = le32_to_cpu(cfg->cap.length);
> > +
> > +        if ((len == 1 || len == 2 || len == 4)) {
> > +            address_space_read(&proxy->modern_as, off,
> > +                                MEMTXATTRS_UNSPECIFIED,
> > +                                cfg->pci_cfg_data, len);
> 
> Same here, use address_space_ldl_le to read into an int, and stl_le_p to
> write into cfg->pci_cfg_data.
> 
> The best way to check it, of course, is to write a unit test! :)  But
> you could also use a Linux BE guest on LE host.
> 
> Everything else looks good.
> 
> Paolo
Paolo Bonzini July 2, 2015, 7:02 p.m. UTC | #3
On 02/07/2015 21:00, Michael S. Tsirkin wrote:
> On Thu, Jul 02, 2015 at 08:48:14PM +0200, Paolo Bonzini wrote:
>>
>>
>> On 02/07/2015 15:00, Michael S. Tsirkin wrote:
>>> +        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
>>> +        off = le32_to_cpu(cfg->cap.offset);
>>> +        len = le32_to_cpu(cfg->cap.length);
>>> +
>>> +        if ((len == 1 || len == 2 || len == 4)) {
>>> +            address_space_write(&proxy->modern_as, off,
>>> +                                MEMTXATTRS_UNSPECIFIED,
>>> +                                cfg->pci_cfg_data, len);
>>> +        }
>>
>> This parses pci_cfg_data in target endianness I think.  You just want to
>> move the little-endian value from the config cap to the little-endian
>> value in the modern_as, so you need to use ldl_le_p and
>> address_space_stl_le.
> 
> but isn't that two byteswaps? why isn't this same as memcpy?

It is a memcpy if you write to RAM, but the MMIO ops take an unsigned
integer, so you can still have one byteswap hiding; you have to be
careful when you have this kind of "forwarder", and the simplest way to
do it is to use an ldl/stl pair with the same endianness.

Paolo

>>> +    }
>>> +}
>>> +
>>> +static uint32_t virtio_read_config(PCIDevice *pci_dev,
>>> +                                   uint32_t address, int len)
>>> +{
>>> +    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
>>> +    struct virtio_pci_cfg_cap *cfg;
>>> +
>>> +    if (proxy->config_cap &&
>>> +        ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap,
>>> +                                                                  pci_cfg_data),
>>> +                       sizeof cfg->pci_cfg_data)) {
>>> +        uint32_t off;
>>> +        uint32_t len;
>>> +
>>> +        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
>>> +        off = le32_to_cpu(cfg->cap.offset);
>>> +        len = le32_to_cpu(cfg->cap.length);
>>> +
>>> +        if ((len == 1 || len == 2 || len == 4)) {
>>> +            address_space_read(&proxy->modern_as, off,
>>> +                                MEMTXATTRS_UNSPECIFIED,
>>> +                                cfg->pci_cfg_data, len);
>>
>> Same here, use address_space_ldl_le to read into an int, and stl_le_p to
>> write into cfg->pci_cfg_data.
>>
>> The best way to check it, of course, is to write a unit test! :)  But
>> you could also use a Linux BE guest on LE host.
>>
>> Everything else looks good.
>>
>> Paolo
Gerd Hoffmann July 3, 2015, 9:34 a.m. UTC | #4
On Do, 2015-07-02 at 15:00 +0200, Michael S. Tsirkin wrote:
> spec says we must, so let's do it!
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> 
> untested at this point.

Tested-by: Gerd Hoffmann <kraxel@redhat.com>

[ seabios patches follow in a moment ]

cheers,
  Gerd
Michael S. Tsirkin July 4, 2015, 9:19 p.m. UTC | #5
On Thu, Jul 02, 2015 at 09:02:58PM +0200, Paolo Bonzini wrote:
> 
> 
> On 02/07/2015 21:00, Michael S. Tsirkin wrote:
> > On Thu, Jul 02, 2015 at 08:48:14PM +0200, Paolo Bonzini wrote:
> >>
> >>
> >> On 02/07/2015 15:00, Michael S. Tsirkin wrote:
> >>> +        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
> >>> +        off = le32_to_cpu(cfg->cap.offset);
> >>> +        len = le32_to_cpu(cfg->cap.length);
> >>> +
> >>> +        if ((len == 1 || len == 2 || len == 4)) {
> >>> +            address_space_write(&proxy->modern_as, off,
> >>> +                                MEMTXATTRS_UNSPECIFIED,
> >>> +                                cfg->pci_cfg_data, len);
> >>> +        }
> >>
> >> This parses pci_cfg_data in target endianness I think.  You just want to
> >> move the little-endian value from the config cap to the little-endian
> >> value in the modern_as, so you need to use ldl_le_p and
> >> address_space_stl_le.
> > 
> > but isn't that two byteswaps? why isn't this same as memcpy?
> 
> It is a memcpy if you write to RAM, but the MMIO ops take an unsigned
> integer, so you can still have one byteswap hiding; you have to be
> careful when you have this kind of "forwarder", and the simplest way to
> do it is to use an ldl/stl pair with the same endianness.
> 
> Paolo

The fact that address_space_write/_read actually does a byteswap if
host!=target endian should probably be documented.

Or maybe it should be changed: it seems likely that non-target-specific devices
that use it do this incorrectly ATM. In particular, dma_memory_rw_relaxed calls
address_space_rw and since DMA originates with devices I think there's very
little chance that these actually want a different behaviour depending on the
target endian-ness.

Most likely, these only work correctly because DMA outside RAM
is highly unusual.







> >>> +    }
> >>> +}
> >>> +
> >>> +static uint32_t virtio_read_config(PCIDevice *pci_dev,
> >>> +                                   uint32_t address, int len)
> >>> +{
> >>> +    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
> >>> +    struct virtio_pci_cfg_cap *cfg;
> >>> +
> >>> +    if (proxy->config_cap &&
> >>> +        ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap,
> >>> +                                                                  pci_cfg_data),
> >>> +                       sizeof cfg->pci_cfg_data)) {
> >>> +        uint32_t off;
> >>> +        uint32_t len;
> >>> +
> >>> +        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
> >>> +        off = le32_to_cpu(cfg->cap.offset);
> >>> +        len = le32_to_cpu(cfg->cap.length);
> >>> +
> >>> +        if ((len == 1 || len == 2 || len == 4)) {
> >>> +            address_space_read(&proxy->modern_as, off,
> >>> +                                MEMTXATTRS_UNSPECIFIED,
> >>> +                                cfg->pci_cfg_data, len);
> >>
> >> Same here, use address_space_ldl_le to read into an int, and stl_le_p to
> >> write into cfg->pci_cfg_data.
> >>
> >> The best way to check it, of course, is to write a unit test! :)  But
> >> you could also use a Linux BE guest on LE host.
> >>
> >> Everything else looks good.
> >>
> >> Paolo
Paolo Bonzini July 6, 2015, 7:46 a.m. UTC | #6
On 04/07/2015 23:19, Michael S. Tsirkin wrote:
> The fact that address_space_write/_read actually does a byteswap if
> host!=target endian should probably be documented.

FWIW, it's not if host != target endian.  It's if memory region
endianness != target endianness.  See memory_region_wrong_endianness.

> Or maybe it should be changed: it seems likely that non-target-specific devices
> that use it do this incorrectly ATM. In particular, dma_memory_rw_relaxed calls
> address_space_rw and since DMA originates with devices I think there's very
> little chance that these actually want a different behaviour depending on the
> target endian-ness.
> 
> Most likely, these only work correctly because DMA outside RAM
> is highly unusual.

They work correctly because of that, and because most devices *and*
targets are little endian so you have no swap.  On ppc64, which has
TARGET_WORDS_BIGENDIAN, it probably wouldn't work correctly.

Paolo
Michael S. Tsirkin July 6, 2015, 8:33 a.m. UTC | #7
On Mon, Jul 06, 2015 at 09:46:14AM +0200, Paolo Bonzini wrote:
> 
> 
> On 04/07/2015 23:19, Michael S. Tsirkin wrote:
> > The fact that address_space_write/_read actually does a byteswap if
> > host!=target endian should probably be documented.
> 
> FWIW, it's not if host != target endian.  It's if memory region
> endianness != target endianness.  See memory_region_wrong_endianness.

Right. What I meant is that if device != host then we want a byteswap
anyway.  E.g. if device says it's BE and system is LE, then it does want
to get byte 0 as MSB in the uint32 it gets.

IOW we want byteswap if host != MR.
Instead we get a byteswap if target != MR.

This is the same unless host != target.


> > Or maybe it should be changed: it seems likely that non-target-specific devices
> > that use it do this incorrectly ATM. In particular, dma_memory_rw_relaxed calls
> > address_space_rw and since DMA originates with devices I think there's very
> > little chance that these actually want a different behaviour depending on the
> > target endian-ness.
> > 
> > Most likely, these only work correctly because DMA outside RAM
> > is highly unusual.
> 
> They work correctly because of that, and because most devices *and*
> targets are little endian so you have no swap.  On ppc64, which has
> TARGET_WORDS_BIGENDIAN, it probably wouldn't work correctly.
> 
> Paolo

Also, by luck, some values work the same whatever the endian-ness.
E.g. dma_memory_set fills the buffer with a given pattern, so
nothing changes if you byte-swap it.

Here's an example that's wrong: dp8393x. Typically it's accessing
memory for DMA, so there's no byteswap. Works fine.

But should device attempt to access another device memory,
it would break because MIPS target is BE.

Cc Hervé for confirmation.

I conclude that virtio is not so special in needing a variant
of address_space_rw that assumes host endian format for the data.
Paolo Bonzini July 6, 2015, 8:46 a.m. UTC | #8
On 06/07/2015 10:33, Michael S. Tsirkin wrote:
> Also, by luck, some values work the same whatever the endian-ness.
> E.g. dma_memory_set fills the buffer with a given pattern, so
> nothing changes if you byte-swap it.
> 
> Here's an example that's wrong: dp8393x. Typically it's accessing
> memory for DMA, so there's no byteswap. Works fine.
> 
> But should device attempt to access another device memory,
> it would break because MIPS target is BE.
> 
> Cc Hervé for confirmation.
> 
> I conclude that virtio is not so special in needing a variant
> of address_space_rw that assumes host endian format for the data.

Why host endian and not device (in this case little) endian?

Paolo
Michael S. Tsirkin July 6, 2015, 9:06 a.m. UTC | #9
On Mon, Jul 06, 2015 at 10:46:31AM +0200, Paolo Bonzini wrote:
> 
> 
> On 06/07/2015 10:33, Michael S. Tsirkin wrote:
> > Also, by luck, some values work the same whatever the endian-ness.
> > E.g. dma_memory_set fills the buffer with a given pattern, so
> > nothing changes if you byte-swap it.
> > 
> > Here's an example that's wrong: dp8393x. Typically it's accessing
> > memory for DMA, so there's no byteswap. Works fine.
> > 
> > But should device attempt to access another device memory,
> > it would break because MIPS target is BE.
> > 
> > Cc Hervé for confirmation.
> > 
> > I conclude that virtio is not so special in needing a variant
> > of address_space_rw that assumes host endian format for the data.
> 
> Why host endian and not device (in this case little) endian?
> 
> Paolo

It's the endian of the originator of the transaction.
And emulated device code is all compiled in host endian-ness.
Peter Maydell July 6, 2015, 9:11 a.m. UTC | #10
On 6 July 2015 at 10:06, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Jul 06, 2015 at 10:46:31AM +0200, Paolo Bonzini wrote:
>> Why host endian and not device (in this case little) endian?

> It's the endian of the originator of the transaction.
> And emulated device code is all compiled in host endian-ness.

But address_space_rw() is just the "memcpy bytes to the
target's memory" operation -- if you have a pile of bytes
then there are no endianness concerns. If you don't have
a pile of bytes then you need to know the structure of
the data you're DMAing around, and you should probably
have a loop doing things with the specify-the-width functions.

thanks
-- PMM
Michael S. Tsirkin July 6, 2015, 10:03 a.m. UTC | #11
On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote:
> On 6 July 2015 at 10:06, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Jul 06, 2015 at 10:46:31AM +0200, Paolo Bonzini wrote:
> >> Why host endian and not device (in this case little) endian?
> 
> > It's the endian of the originator of the transaction.
> > And emulated device code is all compiled in host endian-ness.
> 
> But address_space_rw() is just the "memcpy bytes to the
> target's memory" operation -- if you have a pile of bytes
> then there are no endianness concerns. If you don't have
> a pile of bytes then you need to know the structure of
> the data you're DMAing around, and you should probably
> have a loop doing things with the specify-the-width functions.
> 
> thanks
> -- PMM

Absolutely. But what if DMA happens to target another device
and not memory? Device needs some endian-ness so it needs
to be converted to that.
Peter Maydell July 6, 2015, 10:04 a.m. UTC | #12
On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote:
>> But address_space_rw() is just the "memcpy bytes to the
>> target's memory" operation -- if you have a pile of bytes
>> then there are no endianness concerns. If you don't have
>> a pile of bytes then you need to know the structure of
>> the data you're DMAing around, and you should probably
>> have a loop doing things with the specify-the-width functions.

> Absolutely. But what if DMA happens to target another device
> and not memory? Device needs some endian-ness so it needs
> to be converted to that.

Yes, and address_space_rw() already deals with conversion to
that device's specified endianness.

-- PMM
Michael S. Tsirkin July 6, 2015, 10:31 a.m. UTC | #13
On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote:
> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote:
> >> But address_space_rw() is just the "memcpy bytes to the
> >> target's memory" operation -- if you have a pile of bytes
> >> then there are no endianness concerns. If you don't have
> >> a pile of bytes then you need to know the structure of
> >> the data you're DMAing around, and you should probably
> >> have a loop doing things with the specify-the-width functions.
> 
> > Absolutely. But what if DMA happens to target another device
> > and not memory? Device needs some endian-ness so it needs
> > to be converted to that.
> 
> Yes, and address_space_rw() already deals with conversion to
> that device's specified endianness.
> 
> -- PMM

Yes, but incorrectly if target endian != host endian.
For example, LE target and LE device on BE host.

IO callbacks always get a native endian format so they expect to get
byte 0 of the buffer in MSB on this host.
Peter Maydell July 6, 2015, 11:50 a.m. UTC | #14
On 6 July 2015 at 11:31, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote:
>> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote:
>> > On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote:
>> >> But address_space_rw() is just the "memcpy bytes to the
>> >> target's memory" operation -- if you have a pile of bytes
>> >> then there are no endianness concerns. If you don't have
>> >> a pile of bytes then you need to know the structure of
>> >> the data you're DMAing around, and you should probably
>> >> have a loop doing things with the specify-the-width functions.
>>
>> > Absolutely. But what if DMA happens to target another device
>> > and not memory? Device needs some endian-ness so it needs
>> > to be converted to that.
>>
>> Yes, and address_space_rw() already deals with conversion to
>> that device's specified endianness.

> Yes, but incorrectly if target endian != host endian.
> For example, LE target and LE device on BE host.

Having walked through the code, got confused, talked to
bonzini on IRC about it and got unconfused again, I believe
we do get this correct.

 * address_space_rw() takes a pointer to a pile of bytes
 * if the destination is RAM, we just memcpy them (because
   guest RAM is also a pile of bytes)
 * if the destination is a device, then we read a value
   out of the pile of bytes at whatever width the target
   device can handle. The functions we use for this are
   ldq_p/ldl_p/etc, which do "load target endianness"
   (ie "interpret this set of 4 bytes as if it were an
   integer in the target-endianness") because the API of
   memory_region_dispatch_write() is that it takes a uint64_t
   data whose contents are the value to write in target
   endianness order. (This is regrettably undocumented.)
 * memory_region_dispatch_write() then calls adjust_endianness(),
   converting a target-endian value to the endianness the
   device says it requires
 * we then call the device's read/write functions, whose API
   is that they get a value in the endianness they asked for.

> IO callbacks always get a native endian format so they expect to get
> byte 0 of the buffer in MSB on this host.

IO callbacks get the format they asked for (which might
be BE, LE or target endianness). They will get byte 0 of
the buffer in the MSB if they said they were BE devices
(or if they said they were target-endian on a BE target).

thanks
-- PMM
Paolo Bonzini July 6, 2015, 12:04 p.m. UTC | #15
On 06/07/2015 13:50, Peter Maydell wrote:
> On 6 July 2015 at 11:31, Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote:
>>> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>> On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote:
>>>>> But address_space_rw() is just the "memcpy bytes to the
>>>>> target's memory" operation -- if you have a pile of bytes
>>>>> then there are no endianness concerns. If you don't have
>>>>> a pile of bytes then you need to know the structure of
>>>>> the data you're DMAing around, and you should probably
>>>>> have a loop doing things with the specify-the-width functions.
>>>
>>>> Absolutely. But what if DMA happens to target another device
>>>> and not memory? Device needs some endian-ness so it needs
>>>> to be converted to that.
>>>
>>> Yes, and address_space_rw() already deals with conversion to
>>> that device's specified endianness.
> 
>> Yes, but incorrectly if target endian != host endian.
>> For example, LE target and LE device on BE host.
> 
> Having walked through the code, got confused, talked to
> bonzini on IRC about it and got unconfused again,

Ah, *that discussion*.  So it was yet another XY question, :) but for
the better because it also helped me abstract Michael's question.

Peter's analysis below summarizes the implementation very well.

>  I believe
> we do get this correct.
> 
>  * address_space_rw() takes a pointer to a pile of bytes
>  * if the destination is RAM, we just memcpy them (because
>    guest RAM is also a pile of bytes)
>  * if the destination is a device, then we read a value
>    out of the pile of bytes at whatever width the target
>    device can handle. The functions we use for this are
>    ldq_p/ldl_p/etc, which do "load target endianness"
>    (ie "interpret this set of 4 bytes as if it were an
>    integer in the target-endianness") because the API of
>    memory_region_dispatch_write() is that it takes a uint64_t
>    data whose contents are the value to write in target
>    endianness order. (This is regrettably undocumented.)

^^ And this is the part where "the endianness of the CPU->device
bus/link" enters the picture.  But it doesn't matter if the source is
instead another device.  What matters is that address_space_rw() manages
conversion from a pile of bytes, and the device doing DMA provides
that---a pile of bytes.

In the patch at the beginning of this thread, problems arose because
what you passed to address_space_write wasn't just a "pile of bytes"
coming from a network packet or a disk sector.  Instead, it was the
outcome of a previous conversion from "pile of bytes" to "bytes
representing an integer in little-endian format".  This conversion could
have possibly included a byteswap.

Once you have established that the bytes represent an integer the right
way to access them is to use ld*_p/st*_p and
address_space_ld*/address_space_st*.  This ensures that you do an even
number of further byteswaps; for *_le_p and address_space_*_le, there
will be 0 further byteswaps on little-endian hosts and 2 on big-endian
hosts.

Paolo

>  * memory_region_dispatch_write() then calls adjust_endianness(),
>    converting a target-endian value to the endianness the
>    device says it requires
>  * we then call the device's read/write functions, whose API
>    is that they get a value in the endianness they asked for.
> 
>> IO callbacks always get a native endian format so they expect to get
>> byte 0 of the buffer in MSB on this host.
> 
> IO callbacks get the format they asked for (which might
> be BE, LE or target endianness). They will get byte 0 of
> the buffer in the MSB if they said they were BE devices
> (or if they said they were target-endian on a BE target).
Michael S. Tsirkin July 6, 2015, 12:12 p.m. UTC | #16
On Mon, Jul 06, 2015 at 12:50:43PM +0100, Peter Maydell wrote:
> On 6 July 2015 at 11:31, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote:
> >> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> > On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote:
> >> >> But address_space_rw() is just the "memcpy bytes to the
> >> >> target's memory" operation -- if you have a pile of bytes
> >> >> then there are no endianness concerns. If you don't have
> >> >> a pile of bytes then you need to know the structure of
> >> >> the data you're DMAing around, and you should probably
> >> >> have a loop doing things with the specify-the-width functions.
> >>
> >> > Absolutely. But what if DMA happens to target another device
> >> > and not memory? Device needs some endian-ness so it needs
> >> > to be converted to that.
> >>
> >> Yes, and address_space_rw() already deals with conversion to
> >> that device's specified endianness.
> 
> > Yes, but incorrectly if target endian != host endian.
> > For example, LE target and LE device on BE host.
> 
> Having walked through the code, got confused, talked to
> bonzini on IRC about it and got unconfused again, I believe
> we do get this correct.
> 
>  * address_space_rw() takes a pointer to a pile of bytes
>  * if the destination is RAM, we just memcpy them (because
>    guest RAM is also a pile of bytes)
>  * if the destination is a device, then we read a value
>    out of the pile of bytes at whatever width the target
>    device can handle. The functions we use for this are
>    ldq_p/ldl_p/etc, which do "load target endianness"
>    (ie "interpret this set of 4 bytes as if it were an
>    integer in the target-endianness") because the API of
>    memory_region_dispatch_write() is that it takes a uint64_t
>    data whose contents are the value to write in target
>    endianness order. (This is regrettably undocumented.)
>  * memory_region_dispatch_write() then calls adjust_endianness(),
>    converting a target-endian value to the endianness the
>    device says it requires
>  * we then call the device's read/write functions, whose API
>    is that they get a value in the endianness they asked for.

IMO what's missing here is that device read/write callbacks actually want
host endian-ness. See below for examples.

> > IO callbacks always get a native endian format so they expect to get
> > byte 0 of the buffer in MSB on this host.
> 
> IO callbacks get the format they asked for (which might
> be BE, LE or target endianness). They will get byte 0 of
> the buffer in the MSB if they said they were BE devices
> (or if they said they were target-endian on a BE target).
> 
> thanks
> -- PMM

I believe this last point is wrong.

For example typical PCI devices use little endian. This does not mean
they want an LE value in their callback.

This means guest will give them LE values and they want data
converted *from that* to the natural host endian, for
convenience.

Check e.g. e1000_mmio_write as a typical example.

Imagine that we have another device read data (e.g. from RAM
using address_space_rw) and then write it out to another device
usng address_space_rw.

Run it with x86 target on BE host.
	What you get is that byte 0 goes to bits 0-7 in the value.

Run it with x86 target on LE host.
	What you get is that byte 0 goes to bits 0-7 in the value.

The effect on the device will be very different, and clearly
this is a bug since host endian mustn't affect how
devices work.
Michael S. Tsirkin July 6, 2015, 12:15 p.m. UTC | #17
On Mon, Jul 06, 2015 at 02:04:12PM +0200, Paolo Bonzini wrote:
> 
> 
> On 06/07/2015 13:50, Peter Maydell wrote:
> > On 6 July 2015 at 11:31, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote:
> >>> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote:
> >>>> On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote:
> >>>>> But address_space_rw() is just the "memcpy bytes to the
> >>>>> target's memory" operation -- if you have a pile of bytes
> >>>>> then there are no endianness concerns. If you don't have
> >>>>> a pile of bytes then you need to know the structure of
> >>>>> the data you're DMAing around, and you should probably
> >>>>> have a loop doing things with the specify-the-width functions.
> >>>
> >>>> Absolutely. But what if DMA happens to target another device
> >>>> and not memory? Device needs some endian-ness so it needs
> >>>> to be converted to that.
> >>>
> >>> Yes, and address_space_rw() already deals with conversion to
> >>> that device's specified endianness.
> > 
> >> Yes, but incorrectly if target endian != host endian.
> >> For example, LE target and LE device on BE host.
> > 
> > Having walked through the code, got confused, talked to
> > bonzini on IRC about it and got unconfused again,
> 
> Ah, *that discussion*.  So it was yet another XY question, :) but for
> the better because it also helped me abstract Michael's question.
> 
> Peter's analysis below summarizes the implementation very well.
> 
>  I believe
> > we do get this correct.
> > 
> >  * address_space_rw() takes a pointer to a pile of bytes
> >  * if the destination is RAM, we just memcpy them (because
> >    guest RAM is also a pile of bytes)
> >  * if the destination is a device, then we read a value
> >    out of the pile of bytes at whatever width the target
> >    device can handle. The functions we use for this are
> >    ldq_p/ldl_p/etc, which do "load target endianness"
> >    (ie "interpret this set of 4 bytes as if it were an
> >    integer in the target-endianness") because the API of
> >    memory_region_dispatch_write() is that it takes a uint64_t
> >    data whose contents are the value to write in target
> >    endianness order. (This is regrettably undocumented.)
> 
> ^^ And this is the part where "the endianness of the CPU->device
> bus/link" enters the picture.  But it doesn't matter if the source is
> instead another device.  What matters is that address_space_rw() manages
> conversion from a pile of bytes, and the device doing DMA provides
> that---a pile of bytes.
> 
> In the patch at the beginning of this thread, problems arose because
> what you passed to address_space_write wasn't just a "pile of bytes"
> coming from a network packet or a disk sector.  Instead, it was the
> outcome of a previous conversion from "pile of bytes" to "bytes
> representing an integer in little-endian format".  This conversion could
> have possibly included a byteswap.

Well, not really. It's a pile of bytes from guest POV.
And same thing happens if you read a pile of bytes
from RAM using address_space_read.

> Once you have established that the bytes represent an integer the right
> way to access them is to use ld*_p/st*_p and
> address_space_ld*/address_space_st*.  This ensures that you do an even
> number of further byteswaps; for *_le_p and address_space_*_le, there
> will be 0 further byteswaps on little-endian hosts and 2 on big-endian
> hosts.
> 
> Paolo

I believe this summarizes the implementation correctly.
I also argue that many devices use address_space_rw incorrectly
assuming it converts from host endian, simply because most
devices are written in host endian.



> >  * memory_region_dispatch_write() then calls adjust_endianness(),
> >    converting a target-endian value to the endianness the
> >    device says it requires
> >  * we then call the device's read/write functions, whose API
> >    is that they get a value in the endianness they asked for.
> > 
> >> IO callbacks always get a native endian format so they expect to get
> >> byte 0 of the buffer in MSB on this host.
> > 
> > IO callbacks get the format they asked for (which might
> > be BE, LE or target endianness). They will get byte 0 of
> > the buffer in the MSB if they said they were BE devices
> > (or if they said they were target-endian on a BE target).
Paolo Bonzini July 6, 2015, 12:23 p.m. UTC | #18
On 06/07/2015 14:12, Michael S. Tsirkin wrote:
> On Mon, Jul 06, 2015 at 12:50:43PM +0100, Peter Maydell wrote:
>> On 6 July 2015 at 11:31, Michael S. Tsirkin <mst@redhat.com> wrote:
>>> On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote:
>>>> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>> On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote:
>>>>>> But address_space_rw() is just the "memcpy bytes to the
>>>>>> target's memory" operation -- if you have a pile of bytes
>>>>>> then there are no endianness concerns. If you don't have
>>>>>> a pile of bytes then you need to know the structure of
>>>>>> the data you're DMAing around, and you should probably
>>>>>> have a loop doing things with the specify-the-width functions.
>>>>
>>>>> Absolutely. But what if DMA happens to target another device
>>>>> and not memory? Device needs some endian-ness so it needs
>>>>> to be converted to that.
>>>>
>>>> Yes, and address_space_rw() already deals with conversion to
>>>> that device's specified endianness.
>>
>>> Yes, but incorrectly if target endian != host endian.
>>> For example, LE target and LE device on BE host.
>>
>> Having walked through the code, got confused, talked to
>> bonzini on IRC about it and got unconfused again, I believe
>> we do get this correct.
>>
>>  * address_space_rw() takes a pointer to a pile of bytes
>>  * if the destination is RAM, we just memcpy them (because
>>    guest RAM is also a pile of bytes)
>>  * if the destination is a device, then we read a value
>>    out of the pile of bytes at whatever width the target
>>    device can handle. The functions we use for this are
>>    ldq_p/ldl_p/etc, which do "load target endianness"
>>    (ie "interpret this set of 4 bytes as if it were an
>>    integer in the target-endianness") because the API of
>>    memory_region_dispatch_write() is that it takes a uint64_t
>>    data whose contents are the value to write in target
>>    endianness order. (This is regrettably undocumented.)
>>  * memory_region_dispatch_write() then calls adjust_endianness(),
>>    converting a target-endian value to the endianness the
>>    device says it requires
>>  * we then call the device's read/write functions, whose API
>>    is that they get a value in the endianness they asked for.
> 
> IMO what's missing here is that device read/write callbacks actually want
> host endian-ness. See below for examples.

Absolutely not.  The endianness of the device read/write callbacks is
included in the MemoryRegionOps.

>>> IO callbacks always get a native endian format so they expect to get
>>> byte 0 of the buffer in MSB on this host.
>>
>> IO callbacks get the format they asked for (which might
>> be BE, LE or target endianness). They will get byte 0 of
>> the buffer in the MSB if they said they were BE devices
>> (or if they said they were target-endian on a BE target).
>>
>> thanks
>> -- PMM
> 
> I believe this last point is wrong.
> 
> For example typical PCI devices use little endian. This does not mean
> they want an LE value in their callback.
> 
> This means guest will give them LE values and they want data
> converted *from that* to the natural host endian, for
> convenience.

An X-bit unsigned integer, such as the one passed to e1000_mmio_write,
doesn't have an endianness--neither big nor little.  It's just a number
between 0 and 2^X-1.  Endianness becomes a property of uintX_t only when
you read it as bytes through memcpy, which e1000_mmio_write doesn't do
(mac_reg is an array of uint32_t).

> Imagine that we have another device read data (e.g. from RAM
> using address_space_rw) and then write it out to another device
> usng address_space_rw.
> 
> Run it with x86 target on BE host.
> 	What do you get is byte 0 goes to bits 0-7 in the value.

You mean byte 7.

> Run it with x86 target on LE host.
> 	What do you get is byte 0 goes to bits 0-7 in the value.
> 
> The effect on the device will be very different, and clearly
> this is a bug since host endian mustn't affect how
> devices work.

I fail to see how host endianness enters the picture.  As Peter said,
there isn't a single access based on host endianness in the MMIO
dispatch path.

If you think there is a bug, please write a testcase.  Then we can
discuss based on something concrete, or just fix it.  See the recent
problems with access clamping, where based on concrete testcases we were
able to fix all the bugs and actually show that the patch was desirable
despite the number of bugs it unveiled.

Paolo
diff mbox

Patch

diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index 05d9d24..b6c442f 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -112,9 +112,12 @@  struct VirtIOPCIProxy {
     VirtIOPCIRegion device;
     VirtIOPCIRegion notify;
     MemoryRegion modern_bar;
+    MemoryRegion modern_cfg;
+    AddressSpace modern_as;
     uint32_t legacy_io_bar;
     uint32_t msix_bar;
     uint32_t modern_mem_bar;
+    int config_cap;
     uint32_t flags;
     uint32_t class_code;
     uint32_t nvectors;
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 6a0174e..7890b00 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -448,6 +448,7 @@  static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
 {
     VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
     VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    struct virtio_pci_cfg_cap *cfg;
 
     pci_default_write_config(pci_dev, address, val, len);
 
@@ -456,6 +457,51 @@  static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
         virtio_pci_stop_ioeventfd(proxy);
         virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
     }
+
+    if (proxy->config_cap &&
+        ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap,
+                                                                  pci_cfg_data),
+                       sizeof cfg->pci_cfg_data)) {
+        uint32_t off;
+        uint32_t len;
+
+        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
+        off = le32_to_cpu(cfg->cap.offset);
+        len = le32_to_cpu(cfg->cap.length);
+
+        if ((len == 1 || len == 2 || len == 4)) {
+            address_space_write(&proxy->modern_as, off,
+                                MEMTXATTRS_UNSPECIFIED,
+                                cfg->pci_cfg_data, len);
+        }
+    }
+}
+
+static uint32_t virtio_read_config(PCIDevice *pci_dev,
+                                   uint32_t address, int len)
+{
+    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
+    struct virtio_pci_cfg_cap *cfg;
+
+    if (proxy->config_cap &&
+        ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap,
+                                                                  pci_cfg_data),
+                       sizeof cfg->pci_cfg_data)) {
+        uint32_t off;
+        uint32_t len;
+
+        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
+        off = le32_to_cpu(cfg->cap.offset);
+        len = le32_to_cpu(cfg->cap.length);
+
+        if ((len == 1 || len == 2 || len == 4)) {
+            address_space_read(&proxy->modern_as, off,
+                                MEMTXATTRS_UNSPECIFIED,
+                                cfg->pci_cfg_data, len);
+        }
+    }
+
+    return pci_default_read_config(pci_dev, address, len);
 }
 
 static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
@@ -942,7 +988,7 @@  static int virtio_pci_query_nvectors(DeviceState *d)
     return proxy->nvectors;
 }
 
-static void virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
+static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
                                    struct virtio_pci_cap *cap)
 {
     PCIDevice *dev = &proxy->pci_dev;
@@ -954,6 +1000,8 @@  static void virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
     assert(cap->cap_len >= sizeof *cap);
     memcpy(dev->config + offset + PCI_CAP_FLAGS, &cap->cap_len,
            cap->cap_len - PCI_CAP_FLAGS);
+
+    return offset;
 }
 
 static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr,
@@ -1329,6 +1377,11 @@  static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
             .notify_off_multiplier =
                 cpu_to_le32(QEMU_VIRTIO_PCI_QUEUE_MEM_MULT),
         };
+        struct virtio_pci_cfg_cap cfg = {
+            .cap.cap_len = sizeof cfg,
+            .cap.cfg_type = VIRTIO_PCI_CAP_PCI_CFG,
+        };
+        struct virtio_pci_cfg_cap *cfg_mask;
 
         /* TODO: add io access for speed */
 
@@ -1338,11 +1391,19 @@  static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
         virtio_pci_modern_region_map(proxy, &proxy->isr, &cap);
         virtio_pci_modern_region_map(proxy, &proxy->device, &cap);
         virtio_pci_modern_region_map(proxy, &proxy->notify, &notify.cap);
+
         pci_register_bar(&proxy->pci_dev, proxy->modern_mem_bar,
                          PCI_BASE_ADDRESS_SPACE_MEMORY |
                          PCI_BASE_ADDRESS_MEM_PREFETCH |
                          PCI_BASE_ADDRESS_MEM_TYPE_64,
                          &proxy->modern_bar);
+
+        proxy->config_cap = virtio_pci_add_mem_cap(proxy, &cfg.cap);
+        cfg_mask = (void *)(proxy->pci_dev.wmask + proxy->config_cap);
+        pci_set_byte(&cfg_mask->cap.bar, ~0x0);
+        pci_set_long((uint8_t *)&cfg_mask->cap.offset, ~0x0);
+        pci_set_long((uint8_t *)&cfg_mask->cap.length, ~0x0);
+        pci_set_long(cfg_mask->pci_cfg_data, ~0x0);
     }
 
     if (proxy->nvectors &&
@@ -1354,6 +1415,7 @@  static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
     }
 
     proxy->pci_dev.config_write = virtio_write_config;
+    proxy->pci_dev.config_read = virtio_read_config;
 
     if (legacy) {
         size = VIRTIO_PCI_REGION_SIZE(&proxy->pci_dev)
@@ -1424,6 +1486,15 @@  static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
                        2 * QEMU_VIRTIO_PCI_QUEUE_MEM_MULT *
                        VIRTIO_QUEUE_MAX);
 
+    memory_region_init_alias(&proxy->modern_cfg,
+                             OBJECT(proxy),
+                             "virtio-pci-cfg",
+                             &proxy->modern_bar,
+                             0,
+                             memory_region_size(&proxy->modern_bar));
+
+    address_space_init(&proxy->modern_as, &proxy->modern_cfg, "virtio-pci-cfg-as");
+
     virtio_pci_bus_new(&proxy->bus, sizeof(proxy->bus), proxy);
     if (k->realize) {
         k->realize(proxy, errp);
@@ -1432,7 +1503,10 @@  static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
 
 static void virtio_pci_exit(PCIDevice *pci_dev)
 {
+    VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev);
+
     msix_uninit_exclusive_bar(pci_dev);
+    address_space_destroy(&proxy->modern_as);
 }
 
 static void virtio_pci_reset(DeviceState *qdev)