Message ID | 1435842022-13980-1-git-send-email-mst@redhat.com |
---|---|
State | New |
Headers | show |
On 02/07/2015 15:00, Michael S. Tsirkin wrote: > + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); > + off = le32_to_cpu(cfg->cap.offset); > + len = le32_to_cpu(cfg->cap.length); > + > + if ((len == 1 || len == 2 || len == 4)) { > + address_space_write(&proxy->modern_as, off, > + MEMTXATTRS_UNSPECIFIED, > + cfg->pci_cfg_data, len); > + } This parses pci_cfg_data in target endianness I think. You just want to move the little-endian value from the config cap to the little-endian value in the modern_as, so you need to use ldl_le_p and address_space_stl_le. > + } > +} > + > +static uint32_t virtio_read_config(PCIDevice *pci_dev, > + uint32_t address, int len) > +{ > + VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev); > + struct virtio_pci_cfg_cap *cfg; > + > + if (proxy->config_cap && > + ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap, > + pci_cfg_data), > + sizeof cfg->pci_cfg_data)) { > + uint32_t off; > + uint32_t len; > + > + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); > + off = le32_to_cpu(cfg->cap.offset); > + len = le32_to_cpu(cfg->cap.length); > + > + if ((len == 1 || len == 2 || len == 4)) { > + address_space_read(&proxy->modern_as, off, > + MEMTXATTRS_UNSPECIFIED, > + cfg->pci_cfg_data, len); Same here, use address_space_ldl_le to read into an int, and stl_le_p to write into cfg->pci_cfg_data. The best way to check it, of course, is to write a unit test! :) But you could also use a Linux BE guest on LE host. Everything else looks good. Paolo
On Thu, Jul 02, 2015 at 08:48:14PM +0200, Paolo Bonzini wrote: > > > On 02/07/2015 15:00, Michael S. Tsirkin wrote: > > + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); > > + off = le32_to_cpu(cfg->cap.offset); > > + len = le32_to_cpu(cfg->cap.length); > > + > > + if ((len == 1 || len == 2 || len == 4)) { > > + address_space_write(&proxy->modern_as, off, > > + MEMTXATTRS_UNSPECIFIED, > > + cfg->pci_cfg_data, len); > > + } > > This parses pci_cfg_data in target endianness I think. You just want to > move the little-endian value from the config cap to the little-endian > value in the modern_as, so you need to use ldl_le_p and > address_space_stl_le. but isn't that two byteswaps? why isn't this same as memcpy? > > + } > > +} > > + > > +static uint32_t virtio_read_config(PCIDevice *pci_dev, > > + uint32_t address, int len) > > +{ > > + VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev); > > + struct virtio_pci_cfg_cap *cfg; > > + > > + if (proxy->config_cap && > > + ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap, > > + pci_cfg_data), > > + sizeof cfg->pci_cfg_data)) { > > + uint32_t off; > > + uint32_t len; > > + > > + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); > > + off = le32_to_cpu(cfg->cap.offset); > > + len = le32_to_cpu(cfg->cap.length); > > + > > + if ((len == 1 || len == 2 || len == 4)) { > > + address_space_read(&proxy->modern_as, off, > > + MEMTXATTRS_UNSPECIFIED, > > + cfg->pci_cfg_data, len); > > Same here, use address_space_ldl_le to read into an int, and stl_le_p to > write into cfg->pci_cfg_data. > > The best way to check it, of course, is to write a unit test! :) But > you could also use a Linux BE guest on LE host. > > Everything else looks good. > > Paolo
On 02/07/2015 21:00, Michael S. Tsirkin wrote: > On Thu, Jul 02, 2015 at 08:48:14PM +0200, Paolo Bonzini wrote: >> >> >> On 02/07/2015 15:00, Michael S. Tsirkin wrote: >>> + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); >>> + off = le32_to_cpu(cfg->cap.offset); >>> + len = le32_to_cpu(cfg->cap.length); >>> + >>> + if ((len == 1 || len == 2 || len == 4)) { >>> + address_space_write(&proxy->modern_as, off, >>> + MEMTXATTRS_UNSPECIFIED, >>> + cfg->pci_cfg_data, len); >>> + } >> >> This parses pci_cfg_data in target endianness I think. You just want to >> move the little-endian value from the config cap to the little-endian >> value in the modern_as, so you need to use ldl_le_p and >> address_space_stl_le. > > but isn't that two byteswaps? why isn't this same as memcpy? It is a memcpy if you write to RAM, but the MMIO ops take an unsigned integer, so you can still have one byteswap hiding; you have to be careful when you have this kind of "forwarder", and the simplest way to do it is to use an ldl/stl pair with the same endianness. Paolo >>> + } >>> +} >>> + >>> +static uint32_t virtio_read_config(PCIDevice *pci_dev, >>> + uint32_t address, int len) >>> +{ >>> + VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev); >>> + struct virtio_pci_cfg_cap *cfg; >>> + >>> + if (proxy->config_cap && >>> + ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap, >>> + pci_cfg_data), >>> + sizeof cfg->pci_cfg_data)) { >>> + uint32_t off; >>> + uint32_t len; >>> + >>> + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); >>> + off = le32_to_cpu(cfg->cap.offset); >>> + len = le32_to_cpu(cfg->cap.length); >>> + >>> + if ((len == 1 || len == 2 || len == 4)) { >>> + address_space_read(&proxy->modern_as, off, >>> + MEMTXATTRS_UNSPECIFIED, >>> + cfg->pci_cfg_data, len); >> >> Same here, use address_space_ldl_le to read into an int, and stl_le_p to >> write into cfg->pci_cfg_data. 
>> >> The best way to check it, of course, is to write a unit test! :) But >> you could also use a Linux BE guest on LE host. >> >> Everything else looks good. >> >> Paolo
On Do, 2015-07-02 at 15:00 +0200, Michael S. Tsirkin wrote: > spec says we must, so let's do it! > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > --- > > untested at this point. Tested-by: Gerd Hoffmann <kraxel@redhat.com> [ seabios patches follow in a moment ] cheers, Gerd
On Thu, Jul 02, 2015 at 09:02:58PM +0200, Paolo Bonzini wrote: > > > On 02/07/2015 21:00, Michael S. Tsirkin wrote: > > On Thu, Jul 02, 2015 at 08:48:14PM +0200, Paolo Bonzini wrote: > >> > >> > >> On 02/07/2015 15:00, Michael S. Tsirkin wrote: > >>> + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); > >>> + off = le32_to_cpu(cfg->cap.offset); > >>> + len = le32_to_cpu(cfg->cap.length); > >>> + > >>> + if ((len == 1 || len == 2 || len == 4)) { > >>> + address_space_write(&proxy->modern_as, off, > >>> + MEMTXATTRS_UNSPECIFIED, > >>> + cfg->pci_cfg_data, len); > >>> + } > >> > >> This parses pci_cfg_data in target endianness I think. You just want to > >> move the little-endian value from the config cap to the little-endian > >> value in the modern_as, so you need to use ldl_le_p and > >> address_space_stl_le. > > > > but isn't that two byteswaps? why isn't this same as memcpy? > > It is a memcpy if you write to RAM, but the MMIO ops take an unsigned > integer, so you can still have one byteswap hiding; you have to be > careful when you have this kind of "forwarder", and the simplest way to > do it is to use an ldl/stl pair with the same endianness. > > Paolo The fact that address_space_write/_read actually does a byteswap if host!=target endian should probably be documented. Or maybe it should be changed: it seems likely that non-target-specific devices that use it do this incorrectly ATM. In particular, dma_memory_rw_relaxed calls address_space_rw and since DMA originates with devices I think there's very little chance that these actually want a different behaviour depending on the target endian-ness. Most likely, these only work correctly because DMA outside RAM is highly unusual. 
> >>> + } > >>> +} > >>> + > >>> +static uint32_t virtio_read_config(PCIDevice *pci_dev, > >>> + uint32_t address, int len) > >>> +{ > >>> + VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev); > >>> + struct virtio_pci_cfg_cap *cfg; > >>> + > >>> + if (proxy->config_cap && > >>> + ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap, > >>> + pci_cfg_data), > >>> + sizeof cfg->pci_cfg_data)) { > >>> + uint32_t off; > >>> + uint32_t len; > >>> + > >>> + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); > >>> + off = le32_to_cpu(cfg->cap.offset); > >>> + len = le32_to_cpu(cfg->cap.length); > >>> + > >>> + if ((len == 1 || len == 2 || len == 4)) { > >>> + address_space_read(&proxy->modern_as, off, > >>> + MEMTXATTRS_UNSPECIFIED, > >>> + cfg->pci_cfg_data, len); > >> > >> Same here, use address_space_ldl_le to read into an int, and stl_le_p to > >> write into cfg->pci_cfg_data. > >> > >> The best way to check it, of course, is to write a unit test! :) But > >> you could also use a Linux BE guest on LE host. > >> > >> Everything else looks good. > >> > >> Paolo
On 04/07/2015 23:19, Michael S. Tsirkin wrote: > The fact that address_space_write/_read actually does a byteswap if > host!=target endian should probably be documented. FWIW, it's not if host != target endian. It's if memory region endianness != target endianness. See memory_region_wrong_endianness. > Or maybe it should be changed: it seems likely that non-target-specific devices > that use it do this incorrectly ATM. In particular, dma_memory_rw_relaxed calls > address_space_rw and since DMA originates with devices I think there's very > little chance that these actually want a different behaviour depending on the > target endian-ness. > > Most likely, these only work correctly because DMA outside RAM > is highly unusual. They work correctly because of that, and because most devices *and* targets are little endian so you have no swap. On ppc64, which has TARGET_WORDS_BIGENDIAN, it probably wouldn't work correctly. Paolo
On Mon, Jul 06, 2015 at 09:46:14AM +0200, Paolo Bonzini wrote: > > > On 04/07/2015 23:19, Michael S. Tsirkin wrote: > > The fact that address_space_write/_read actually does a byteswap if > > host!=target endian should probably be documented. > > FWIW, it's not if host != target endian. It's if memory region > endianness != target endianness. See memory_region_wrong_endianness. Right. What I meant is that if device != host then we want a byteswap anyway. E.g. if device says it's BE and system is LE, then it does want to get byte 0 as MSB in the uint32 it gets. IOW we want byteswap if host != MR. Instead we get a byteswap if target != MR. This is the same unless host != target. > > Or maybe it should be changed: it seems likely that non-target-specific devices > > that use it do this incorrectly ATM. In particular, dma_memory_rw_relaxed calls > > address_space_rw and since DMA originates with devices I think there's very > > little chance that these actually want a different behaviour depending on the > > target endian-ness. > > > > Most likely, these only work correctly because DMA outside RAM > > is highly unusual. > > They work correctly because of that, and because most devices *and* > targets are little endian so you have no swap. On ppc64, which has > TARGET_WORDS_BIGENDIAN, it probably wouldn't work correctly. > > Paolo Also, by luck, some values work the same whatever the endian-ness. E.g. dma_memory_set fills the buffer with a given pattern, so nothing changes if you byte-swap it. Here's an example that's wrong: dp8393x. Typically it's accessing memory for DMA, so there's no byteswap. Works fine. But should device attempt to access another device memory, it would break because MIPS target is BE. Cc Hervé for confirmation. I conclude that virtio is not so special in needing a variant of address_space_rw that assumes host endian format for the data.
On 06/07/2015 10:33, Michael S. Tsirkin wrote: > Also, by luck, some values work the same whatever the endian-ness. > E.g. dma_memory_set fills the buffer with a given pattern, so > nothing changes if you byte-swap it. > > Here's an example that's wrong: dp8393x. Typically it's accessing > memory for DMA, so there's no byteswap. Works fine. > > But should device attempt to access another device memory, > it would break because MIPS target is BE. > > Cc Hervé for confirmation. > > I conclude that virtio is not so special in needing a variant > of address_space_rw that assumes host endian format for the data. Why host endian and not device (in this case little) endian? Paolo
On Mon, Jul 06, 2015 at 10:46:31AM +0200, Paolo Bonzini wrote: > > > On 06/07/2015 10:33, Michael S. Tsirkin wrote: > > Also, by luck, some values work the same whatever the endian-ness. > > E.g. dma_memory_set fills the buffer with a given pattern, so > > nothing changes if you byte-swap it. > > > > Here's an example that's wrong: dp8393x. Typically it's accessing > > memory for DMA, so there's no byteswap. Works fine. > > > > But should device attempt to access another device memory, > > it would break because MIPS target is BE. > > > > Cc Hervé for confirmation. > > > > I conclude that virtio is not so special in needing a variant > > of address_space_rw that assumes host endian format for the data. > > Why host endian and not device (in this case little) endian? > > Paolo It's the endian of the originator of the transaction. And emulated device code is all compiled in host endian-ness.
On 6 July 2015 at 10:06, Michael S. Tsirkin <mst@redhat.com> wrote: > On Mon, Jul 06, 2015 at 10:46:31AM +0200, Paolo Bonzini wrote: >> Why host endian and not device (in this case little) endian? > It's the endian of the originator of the transaction. > And emulated device code is all compiled in host endian-ness. But address_space_rw() is just the "memcpy bytes to the target's memory" operation -- if you have a pile of bytes then there are no endianness concerns. If you don't have a pile of bytes then you need to know the structure of the data you're DMAing around, and you should probably have a loop doing things with the specify-the-width functions. thanks -- PMM
On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote: > On 6 July 2015 at 10:06, Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, Jul 06, 2015 at 10:46:31AM +0200, Paolo Bonzini wrote: > >> Why host endian and not device (in this case little) endian? > > > It's the endian of the originator of the transaction. > > And emulated device code is all compiled in host endian-ness. > > But address_space_rw() is just the "memcpy bytes to the > target's memory" operation -- if you have a pile of bytes > then there are no endianness concerns. If you don't have > a pile of bytes then you need to know the structure of > the data you're DMAing around, and you should probably > have a loop doing things with the specify-the-width functions. > > thanks > -- PMM Absolutely. But what if DMA happens to target another device and not memory? Device needs some endian-ness so it needs to be converted to that.
On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote: > On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote: >> But address_space_rw() is just the "memcpy bytes to the >> target's memory" operation -- if you have a pile of bytes >> then there are no endianness concerns. If you don't have >> a pile of bytes then you need to know the structure of >> the data you're DMAing around, and you should probably >> have a loop doing things with the specify-the-width functions. > Absolutely. But what if DMA happens to target another device > and not memory? Device needs some endian-ness so it needs > to be converted to that. Yes, and address_space_rw() already deals with conversion to that device's specified endianness. -- PMM
On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote: > On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote: > >> But address_space_rw() is just the "memcpy bytes to the > >> target's memory" operation -- if you have a pile of bytes > >> then there are no endianness concerns. If you don't have > >> a pile of bytes then you need to know the structure of > >> the data you're DMAing around, and you should probably > >> have a loop doing things with the specify-the-width functions. > > > Absolutely. But what if DMA happens to target another device > > and not memory? Device needs some endian-ness so it needs > > to be converted to that. > > Yes, and address_space_rw() already deals with conversion to > that device's specified endianness. > > -- PMM Yes, but incorrectly if target endian != host endian. For example, LE target and LE device on BE host. IO callbacks always get a native endian format so they expect to get byte 0 of the buffer in MSB on this host.
On 6 July 2015 at 11:31, Michael S. Tsirkin <mst@redhat.com> wrote: > On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote: >> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote: >> > On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote: >> >> But address_space_rw() is just the "memcpy bytes to the >> >> target's memory" operation -- if you have a pile of bytes >> >> then there are no endianness concerns. If you don't have >> >> a pile of bytes then you need to know the structure of >> >> the data you're DMAing around, and you should probably >> >> have a loop doing things with the specify-the-width functions. >> >> > Absolutely. But what if DMA happens to target another device >> > and not memory? Device needs some endian-ness so it needs >> > to be converted to that. >> >> Yes, and address_space_rw() already deals with conversion to >> that device's specified endianness. > Yes, but incorrectly if target endian != host endian. > For example, LE target and LE device on BE host. Having walked through the code, got confused, talked to bonzini on IRC about it and got unconfused again, I believe we do get this correct. * address_space_rw() takes a pointer to a pile of bytes * if the destination is RAM, we just memcpy them (because guest RAM is also a pile of bytes) * if the destination is a device, then we read a value out of the pile of bytes at whatever width the target device can handle. The functions we use for this are ldl_q/ldl_p/etc, which do "load target endianness" (ie "interpret this set of 4 bytes as if it were an integer in the target-endianness") because the API of memory_region_dispatch_write() is that it takes a uint64_t data whose contents are the value to write in target endianness order. (This is regrettably undocumented.) 
* memory_region_dispatch_write() then calls adjust_endianness(), converting a target-endian value to the endianness the device says it requires * we then call the device's read/write functions, whose API is that they get a value in the endianness they asked for. > IO callbacks always get a native endian format so they expect to get > byte 0 of the buffer in MSB on this host. IO callbacks get the format they asked for (which might be BE, LE or target endianness). They will get byte 0 of the buffer in the MSB if they said they were BE devices (or if they said they were target-endian on a BE target). thanks -- PMM
On 06/07/2015 13:50, Peter Maydell wrote: > On 6 July 2015 at 11:31, Michael S. Tsirkin <mst@redhat.com> wrote: >> On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote: >>> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote: >>>> On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote: >>>>> But address_space_rw() is just the "memcpy bytes to the >>>>> target's memory" operation -- if you have a pile of bytes >>>>> then there are no endianness concerns. If you don't have >>>>> a pile of bytes then you need to know the structure of >>>>> the data you're DMAing around, and you should probably >>>>> have a loop doing things with the specify-the-width functions. >>> >>>> Absolutely. But what if DMA happens to target another device >>>> and not memory? Device needs some endian-ness so it needs >>>> to be converted to that. >>> >>> Yes, and address_space_rw() already deals with conversion to >>> that device's specified endianness. > >> Yes, but incorrectly if target endian != host endian. >> For example, LE target and LE device on BE host. > > Having walked through the code, got confused, talked to > bonzini on IRC about it and got unconfused again, Ah, *that discussion*. So it was yet another XY question, :) but for the better because it also helped me abstract Michael's question. Peter's analysis below summarizes the implementation very well. I believe > we do get this correct. > > * address_space_rw() takes a pointer to a pile of bytes > * if the destination is RAM, we just memcpy them (because > guest RAM is also a pile of bytes) > * if the destination is a device, then we read a value > out of the pile of bytes at whatever width the target > device can handle. 
The functions we use for this are > ldl_q/ldl_p/etc, which do "load target endianness" > (ie "interpret this set of 4 bytes as if it were an > integer in the target-endianness") because the API of > memory_region_dispatch_write() is that it takes a uint64_t > data whose contents are the value to write in target > endianness order. (This is regrettably undocumented.) ^^ And this is the part where "the endianness of the CPU->device bus/link" enters the picture. But it doesn't matter if the source is instead another device. What matters is that address_space_rw() manages conversion from a pile of bytes, and the device doing DMA provides that---a pile of bytes. In the patch at the beginning of this thread, problems arose because what you passed to address_space_write wasn't just a "pile of bytes" coming from a network packet or a disk sector. Instead, it was the outcome of a previous conversion from "pile of bytes" to "bytes representing an integer in little-endian format". This conversion could have possibly included a byteswap. Once you have established that the bytes represent an integer the right way to access them is to use ld*_p/st*_p and address_space_ld*/address_space_st*. This ensures that you do an even number of further byteswaps; for *_le_p and address_space_*_le, there will be 0 further byteswaps on little-endian hosts and 2 on big-endian hosts. Paolo > * memory_region_dispatch_write() then calls adjust_endianness(), > converting a target-endian value to the endianness the > device says it requires > * we then call the device's read/write functions, whose API > is that they get a value in the endianness they asked for. > >> IO callbacks always get a native endian format so they expect to get >> byte 0 of the buffer in MSB on this host. > > IO callbacks get the format they asked for (which might > be BE, LE or target endianness). 
They will get byte 0 of > the buffer in the MSB if they said they were BE devices > (or if they said they were target-endian on a BE target).
On Mon, Jul 06, 2015 at 12:50:43PM +0100, Peter Maydell wrote: > On 6 July 2015 at 11:31, Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote: > >> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote: > >> > On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote: > >> >> But address_space_rw() is just the "memcpy bytes to the > >> >> target's memory" operation -- if you have a pile of bytes > >> >> then there are no endianness concerns. If you don't have > >> >> a pile of bytes then you need to know the structure of > >> >> the data you're DMAing around, and you should probably > >> >> have a loop doing things with the specify-the-width functions. > >> > >> > Absolutely. But what if DMA happens to target another device > >> > and not memory? Device needs some endian-ness so it needs > >> > to be converted to that. > >> > >> Yes, and address_space_rw() already deals with conversion to > >> that device's specified endianness. > > > Yes, but incorrectly if target endian != host endian. > > For example, LE target and LE device on BE host. > > Having walked through the code, got confused, talked to > bonzini on IRC about it and got unconfused again, I believe > we do get this correct. > > * address_space_rw() takes a pointer to a pile of bytes > * if the destination is RAM, we just memcpy them (because > guest RAM is also a pile of bytes) > * if the destination is a device, then we read a value > out of the pile of bytes at whatever width the target > device can handle. The functions we use for this are > ldl_q/ldl_p/etc, which do "load target endianness" > (ie "interpret this set of 4 bytes as if it were an > integer in the target-endianness") because the API of > memory_region_dispatch_write() is that it takes a uint64_t > data whose contents are the value to write in target > endianness order. (This is regrettably undocumented.) 
> * memory_region_dispatch_write() then calls adjust_endianness(), > converting a target-endian value to the endianness the > device says it requires > * we then call the device's read/write functions, whose API > is that they get a value in the endianness they asked for. IMO what's missing here is that device read/write callbacks actually want host endian-ness. See below for examples. > > IO callbacks always get a native endian format so they expect to get > > byte 0 of the buffer in MSB on this host. > > IO callbacks get the format they asked for (which might > be BE, LE or target endianness). They will get byte 0 of > the buffer in the MSB if they said they were BE devices > (or if they said they were target-endian on a BE target). > > thanks > -- PMM I believe this last point is wrong. For example typical PCI devices use little endian. This does not mean they want an LE value in their callback. This means guest will give them LE values and they want data converted *from that* to the natural host endian, for convenience. Check e.g. e1000_mmio_write as a typical example. Imagine that we have another device read data (e.g. from RAM using address_space_rw) and then write it out to another device using address_space_rw. Run it with x86 target on BE host. What you get is byte 0 goes to bits 0-7 in the value. Run it with x86 target on LE host. What you get is byte 0 goes to bits 0-7 in the value. The effect on the device will be very different, and clearly this is a bug since host endian mustn't affect how devices work.
On Mon, Jul 06, 2015 at 02:04:12PM +0200, Paolo Bonzini wrote: > > > On 06/07/2015 13:50, Peter Maydell wrote: > > On 6 July 2015 at 11:31, Michael S. Tsirkin <mst@redhat.com> wrote: > >> On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote: > >>> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote: > >>>> On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote: > >>>>> But address_space_rw() is just the "memcpy bytes to the > >>>>> target's memory" operation -- if you have a pile of bytes > >>>>> then there are no endianness concerns. If you don't have > >>>>> a pile of bytes then you need to know the structure of > >>>>> the data you're DMAing around, and you should probably > >>>>> have a loop doing things with the specify-the-width functions. > >>> > >>>> Absolutely. But what if DMA happens to target another device > >>>> and not memory? Device needs some endian-ness so it needs > >>>> to be converted to that. > >>> > >>> Yes, and address_space_rw() already deals with conversion to > >>> that device's specified endianness. > > > >> Yes, but incorrectly if target endian != host endian. > >> For example, LE target and LE device on BE host. > > > > Having walked through the code, got confused, talked to > > bonzini on IRC about it and got unconfused again, > > Ah, *that discussion*. So it was yet another XY question, :) but for > the better because it also helped me abstract Michael's question. > > Peter's analysis below summarizes the implementation very well. > > I believe > > we do get this correct. > > > > * address_space_rw() takes a pointer to a pile of bytes > > * if the destination is RAM, we just memcpy them (because > > guest RAM is also a pile of bytes) > > * if the destination is a device, then we read a value > > out of the pile of bytes at whatever width the target > > device can handle. 
The functions we use for this are > > ldl_q/ldl_p/etc, which do "load target endianness" > > (ie "interpret this set of 4 bytes as if it were an > > integer in the target-endianness") because the API of > > memory_region_dispatch_write() is that it takes a uint64_t > > data whose contents are the value to write in target > > endianness order. (This is regrettably undocumented.) > > ^^ And this is the part where "the endianness of the CPU->device > bus/link" enters the picture. But it doesn't matter if the source is > instead another device. What matters is that address_space_rw() manages > conversion from a pile of bytes, and the device doing DMA provides > that---a pile of bytes. > > In the patch at the beginning of this thread, problems arose because > what you passed to address_space_write wasn't just a "pile of bytes" > coming from a network packet or a disk sector. Instead, it was the > outcome of a previous conversion from "pile of bytes" to "bytes > representing an integer in little-endian format". This conversion could > have possibly included a byteswap. Well, not really. It's a pile of bytes from guest POV. And same thing happens if you read a pile of bytes from RAM using address_space_read. > Once you have established that the bytes represent an integer the right > way to access them is to use ld*_p/st*_p and > address_space_ld*/address_space_st*. This ensures that you do an even > number of further byteswaps; for *_le_p and address_space_*_le, there > will be 0 further byteswaps on little-endian hosts and 2 on big-endian > hosts. > > Paolo I believe this summarizes the implementation correctly. I also argue that many devices use address_space_rw incorrectly assuming it converts from host endian, simply because most devices are written in host endian. 
> > * memory_region_dispatch_write() then calls adjust_endianness(), > > converting a target-endian value to the endianness the > > device says it requires > > * we then call the device's read/write functions, whose API > > is that they get a value in the endianness they asked for. > > > >> IO callbacks always get a native endian format so they expect to get > >> byte 0 of the buffer in MSB on this host. > > > > IO callbacks get the format they asked for (which might > > be BE, LE or target endianness). They will get byte 0 of > > the buffer in the MSB if they said they were BE devices > > (or if they said they were target-endian on a BE target).
On 06/07/2015 14:12, Michael S. Tsirkin wrote: > On Mon, Jul 06, 2015 at 12:50:43PM +0100, Peter Maydell wrote: >> On 6 July 2015 at 11:31, Michael S. Tsirkin <mst@redhat.com> wrote: >>> On Mon, Jul 06, 2015 at 11:04:24AM +0100, Peter Maydell wrote: >>>> On 6 July 2015 at 11:03, Michael S. Tsirkin <mst@redhat.com> wrote: >>>>> On Mon, Jul 06, 2015 at 10:11:18AM +0100, Peter Maydell wrote: >>>>>> But address_space_rw() is just the "memcpy bytes to the >>>>>> target's memory" operation -- if you have a pile of bytes >>>>>> then there are no endianness concerns. If you don't have >>>>>> a pile of bytes then you need to know the structure of >>>>>> the data you're DMAing around, and you should probably >>>>>> have a loop doing things with the specify-the-width functions. >>>> >>>>> Absolutely. But what if DMA happens to target another device >>>>> and not memory? Device needs some endian-ness so it needs >>>>> to be converted to that. >>>> >>>> Yes, and address_space_rw() already deals with conversion to >>>> that device's specified endianness. >> >>> Yes, but incorrectly if target endian != host endian. >>> For example, LE target and LE device on BE host. >> >> Having walked through the code, got confused, talked to >> bonzini on IRC about it and got unconfused again, I believe >> we do get this correct. >> >> * address_space_rw() takes a pointer to a pile of bytes >> * if the destination is RAM, we just memcpy them (because >> guest RAM is also a pile of bytes) >> * if the destination is a device, then we read a value >> out of the pile of bytes at whatever width the target >> device can handle. The functions we use for this are >> ldl_q/ldl_p/etc, which do "load target endianness" >> (ie "interpret this set of 4 bytes as if it were an >> integer in the target-endianness") because the API of >> memory_region_dispatch_write() is that it takes a uint64_t >> data whose contents are the value to write in target >> endianness order. (This is regrettably undocumented.) 
>> * memory_region_dispatch_write() then calls adjust_endianness(), >> converting a target-endian value to the endianness the >> device says it requires >> * we then call the device's read/write functions, whose API >> is that they get a value in the endianness they asked for. > > IMO what's missing here is that device read/write callbacks actually want > host endian-ness. See below for examples. Absolutely not. The endianness of the device read/write callbacks is included in the MemoryRegionOps. >>> IO callbacks always get a native endian format so they expect to get >>> byte 0 of the buffer in MSB on this host. >> >> IO callbacks get the format they asked for (which might >> be BE, LE or target endianness). They will get byte 0 of >> the buffer in the MSB if they said they were BE devices >> (or if they said they were target-endian on a BE target). >> >> thanks >> -- PMM > > I believe this last point is wrong. > > For example typical PCI devices use little endian. This does not mean > they want an LE value in their callback. > > This means guest will give them LE values and they want data > converted *from that* to the natural host endian, for > convenience. An X-bit unsigned integer, such as the one passed to e1000_mmio_write, doesn't have an endianness--neither big nor little. It's just a number between 0 and 2^X-1. Endianness becomes a property of uintX_t only when you read it as bytes through memcpy, which e1000_mmio_write doesn't do (mac_reg is an array of uint32_t). > Imagine that we have another device read data (e.g. from RAM > using address_space_rw) and then write it out to another device > using address_space_rw. > Run it with x86 target on BE host. > What do you get is byte 0 goes to bits 0-7 in the value. You mean byte 7. > Run it with x86 target on LE host. > What do you get is byte 0 goes to bits 0-7 in the value. > > The effect on the device will be very different, and clearly > this is a bug since host endian mustn't affect how > devices work. 
I fail to see how host endianness enters the picture. As Peter said, there isn't a single access based on host endianness in the MMIO dispatch path. If you think there is a bug, please write a testcase. Then we can discuss based on something concrete, or just fix it. See the recent problems with access clamping, where based on concrete testcases we were able to fix all the bugs and actually show that the patch was desirable despite the number of bugs it unveiled. Paolo
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h index 05d9d24..b6c442f 100644 --- a/hw/virtio/virtio-pci.h +++ b/hw/virtio/virtio-pci.h @@ -112,9 +112,12 @@ struct VirtIOPCIProxy { VirtIOPCIRegion device; VirtIOPCIRegion notify; MemoryRegion modern_bar; + MemoryRegion modern_cfg; + AddressSpace modern_as; uint32_t legacy_io_bar; uint32_t msix_bar; uint32_t modern_mem_bar; + int config_cap; uint32_t flags; uint32_t class_code; uint32_t nvectors; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 6a0174e..7890b00 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -448,6 +448,7 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address, { VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev); VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + struct virtio_pci_cfg_cap *cfg; pci_default_write_config(pci_dev, address, val, len); @@ -456,6 +457,51 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address, virtio_pci_stop_ioeventfd(proxy); virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK); } + + if (proxy->config_cap && + ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap, + pci_cfg_data), + sizeof cfg->pci_cfg_data)) { + uint32_t off; + uint32_t len; + + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); + off = le32_to_cpu(cfg->cap.offset); + len = le32_to_cpu(cfg->cap.length); + + if ((len == 1 || len == 2 || len == 4)) { + address_space_write(&proxy->modern_as, off, + MEMTXATTRS_UNSPECIFIED, + cfg->pci_cfg_data, len); + } + } +} + +static uint32_t virtio_read_config(PCIDevice *pci_dev, + uint32_t address, int len) +{ + VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev); + struct virtio_pci_cfg_cap *cfg; + + if (proxy->config_cap && + ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap, + pci_cfg_data), + sizeof cfg->pci_cfg_data)) { + uint32_t off; + uint32_t len; + + 
cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); + off = le32_to_cpu(cfg->cap.offset); + len = le32_to_cpu(cfg->cap.length); + + if ((len == 1 || len == 2 || len == 4)) { + address_space_read(&proxy->modern_as, off, + MEMTXATTRS_UNSPECIFIED, + cfg->pci_cfg_data, len); + } + } + + return pci_default_read_config(pci_dev, address, len); } static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, @@ -942,7 +988,7 @@ static int virtio_pci_query_nvectors(DeviceState *d) return proxy->nvectors; } -static void virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, +static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, struct virtio_pci_cap *cap) { PCIDevice *dev = &proxy->pci_dev; @@ -954,6 +1000,8 @@ static void virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, assert(cap->cap_len >= sizeof *cap); memcpy(dev->config + offset + PCI_CAP_FLAGS, &cap->cap_len, cap->cap_len - PCI_CAP_FLAGS); + + return offset; } static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, @@ -1329,6 +1377,11 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) .notify_off_multiplier = cpu_to_le32(QEMU_VIRTIO_PCI_QUEUE_MEM_MULT), }; + struct virtio_pci_cfg_cap cfg = { + .cap.cap_len = sizeof cfg, + .cap.cfg_type = VIRTIO_PCI_CAP_PCI_CFG, + }; + struct virtio_pci_cfg_cap *cfg_mask; /* TODO: add io access for speed */ @@ -1338,11 +1391,19 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) virtio_pci_modern_region_map(proxy, &proxy->isr, &cap); virtio_pci_modern_region_map(proxy, &proxy->device, &cap); virtio_pci_modern_region_map(proxy, &proxy->notify, &notify.cap); + pci_register_bar(&proxy->pci_dev, proxy->modern_mem_bar, PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_PREFETCH | PCI_BASE_ADDRESS_MEM_TYPE_64, &proxy->modern_bar); + + proxy->config_cap = virtio_pci_add_mem_cap(proxy, &cfg.cap); + cfg_mask = (void *)(proxy->pci_dev.wmask + proxy->config_cap); + pci_set_byte(&cfg_mask->cap.bar, ~0x0); + pci_set_long((uint8_t 
*)&cfg_mask->cap.offset, ~0x0); + pci_set_long((uint8_t *)&cfg_mask->cap.length, ~0x0); + pci_set_long(cfg_mask->pci_cfg_data, ~0x0); } if (proxy->nvectors && @@ -1354,6 +1415,7 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) } proxy->pci_dev.config_write = virtio_write_config; + proxy->pci_dev.config_read = virtio_read_config; if (legacy) { size = VIRTIO_PCI_REGION_SIZE(&proxy->pci_dev) @@ -1424,6 +1486,15 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) 2 * QEMU_VIRTIO_PCI_QUEUE_MEM_MULT * VIRTIO_QUEUE_MAX); + memory_region_init_alias(&proxy->modern_cfg, + OBJECT(proxy), + "virtio-pci-cfg", + &proxy->modern_bar, + 0, + memory_region_size(&proxy->modern_bar)); + + address_space_init(&proxy->modern_as, &proxy->modern_cfg, "virtio-pci-cfg-as"); + virtio_pci_bus_new(&proxy->bus, sizeof(proxy->bus), proxy); if (k->realize) { k->realize(proxy, errp); @@ -1432,7 +1503,10 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) static void virtio_pci_exit(PCIDevice *pci_dev) { + VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev); + msix_uninit_exclusive_bar(pci_dev); + address_space_destroy(&proxy->modern_as); } static void virtio_pci_reset(DeviceState *qdev)
spec says we must, so let's do it! Signed-off-by: Michael S. Tsirkin <mst@redhat.com> --- untested at this point. Paolo, could you pls take a look and confirm this memory API usage is ok? hw/virtio/virtio-pci.h | 3 ++ hw/virtio/virtio-pci.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 1 deletion(-)