diff mbox

msi/msix: added API to set MSI message address and data

Message ID 4FE307DE.5070002@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy June 21, 2012, 11:39 a.m. UTC
Added (msi|msix)_set_message() functions.

Currently msi_notify()/msix_notify() write to these vectors to
signal the guest about an interrupt so the correct values have to
be written there by the guest or QEMU.

For example, POWER guest never initializes MSI/MSIX vectors, instead
it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
POWER we have to initialize MSI/MSIX message from QEMU.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 hw/msi.c  |   13 +++++++++++++
 hw/msi.h  |    1 +
 hw/msix.c |    9 +++++++++
 hw/msix.h |    2 ++
 4 files changed, 25 insertions(+)

Comments

Jan Kiszka June 21, 2012, 11:49 a.m. UTC | #1
On 2012-06-21 13:39, Alexey Kardashevskiy wrote:
> Added (msi|msix)_set_message() functions.
> 
> Currently msi_notify()/msix_notify() write to these vectors to
> signal the guest about an interrupt so the correct values have to
> written there by the guest or QEMU.
> 
> For example, POWER guest never initializes MSI/MSIX vectors, instead
> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
> POWER we have to initialize MSI/MSIX message from QEMU.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  hw/msi.c  |   13 +++++++++++++
>  hw/msi.h  |    1 +
>  hw/msix.c |    9 +++++++++
>  hw/msix.h |    2 ++
>  4 files changed, 25 insertions(+)
> 
> diff --git a/hw/msi.c b/hw/msi.c
> index 5233204..cc6102f 100644
> --- a/hw/msi.c
> +++ b/hw/msi.c
> @@ -105,6 +105,19 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
>  }
>  
> +void msi_set_message(PCIDevice *dev, MSIMessage msg)
> +{
> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> +
> +    if (msi64bit) {
> +        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
> +    } else {
> +        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
> +    }
> +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
> +}
> +
>  bool msi_enabled(const PCIDevice *dev)
>  {
>      return msi_present(dev) &&
> diff --git a/hw/msi.h b/hw/msi.h
> index 75747ab..6ec1f99 100644
> --- a/hw/msi.h
> +++ b/hw/msi.h
> @@ -31,6 +31,7 @@ struct MSIMessage {
>  
>  extern bool msi_supported;
>  
> +void msi_set_message(PCIDevice *dev, MSIMessage msg);
>  bool msi_enabled(const PCIDevice *dev);
>  int msi_init(struct PCIDevice *dev, uint8_t offset,
>               unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
> diff --git a/hw/msix.c b/hw/msix.c
> index ded3c55..5f7d6d3 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -45,6 +45,15 @@ static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>      return msg;
>  }
>  
> +void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
> +{
> +    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
> +
> +    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
> +    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
> +    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
> +}
> +
>  /* Add MSI-X capability to the config space for the device. */
>  /* Given a bar and its size, add MSI-X table on top of it
>   * and fill MSI-X capability in the config space.
> diff --git a/hw/msix.h b/hw/msix.h
> index 50aee82..26a437e 100644
> --- a/hw/msix.h
> +++ b/hw/msix.h
> @@ -4,6 +4,8 @@
>  #include "qemu-common.h"
>  #include "pci.h"
>  
> +void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
> +
>  int msix_init(PCIDevice *pdev, unsigned short nentries,
>                MemoryRegion *bar,
>                unsigned bar_nr, unsigned bar_size);
> 

Interface looks good as far as I can tell (can't assess the POWER need
for clearing the mask bit on msix_set_message).

> -- 
> 1.7.10
> 
> ps. double '-' and git version is an end-of-patch scissor as I read somewhere, cannot recall where exactly 

Check man git-am.

Jan
Alexey Kardashevskiy June 22, 2012, 1:03 a.m. UTC | #2
On 21/06/12 21:49, Jan Kiszka wrote:
> On 2012-06-21 13:39, Alexey Kardashevskiy wrote:
>> Added (msi|msix)_set_message() functions.
>>
>> Currently msi_notify()/msix_notify() write to these vectors to
>> signal the guest about an interrupt so the correct values have to
>> written there by the guest or QEMU.
>>
>> For example, POWER guest never initializes MSI/MSIX vectors, instead
>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
>> POWER we have to initialize MSI/MSIX message from QEMU.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  hw/msi.c  |   13 +++++++++++++
>>  hw/msi.h  |    1 +
>>  hw/msix.c |    9 +++++++++
>>  hw/msix.h |    2 ++
>>  4 files changed, 25 insertions(+)
>>
>> diff --git a/hw/msi.c b/hw/msi.c
>> index 5233204..cc6102f 100644
>> --- a/hw/msi.c
>> +++ b/hw/msi.c
>> @@ -105,6 +105,19 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
>>  }
>>  
>> +void msi_set_message(PCIDevice *dev, MSIMessage msg)
>> +{
>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
>> +
>> +    if (msi64bit) {
>> +        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
>> +    } else {
>> +        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
>> +    }
>> +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
>> +}
>> +
>>  bool msi_enabled(const PCIDevice *dev)
>>  {
>>      return msi_present(dev) &&
>> diff --git a/hw/msi.h b/hw/msi.h
>> index 75747ab..6ec1f99 100644
>> --- a/hw/msi.h
>> +++ b/hw/msi.h
>> @@ -31,6 +31,7 @@ struct MSIMessage {
>>  
>>  extern bool msi_supported;
>>  
>> +void msi_set_message(PCIDevice *dev, MSIMessage msg);
>>  bool msi_enabled(const PCIDevice *dev);
>>  int msi_init(struct PCIDevice *dev, uint8_t offset,
>>               unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
>> diff --git a/hw/msix.c b/hw/msix.c
>> index ded3c55..5f7d6d3 100644
>> --- a/hw/msix.c
>> +++ b/hw/msix.c
>> @@ -45,6 +45,15 @@ static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>>      return msg;
>>  }
>>  
>> +void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
>> +{
>> +    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
>> +
>> +    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
>> +    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
>> +    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
>> +}
>> +
>>  /* Add MSI-X capability to the config space for the device. */
>>  /* Given a bar and its size, add MSI-X table on top of it
>>   * and fill MSI-X capability in the config space.
>> diff --git a/hw/msix.h b/hw/msix.h
>> index 50aee82..26a437e 100644
>> --- a/hw/msix.h
>> +++ b/hw/msix.h
>> @@ -4,6 +4,8 @@
>>  #include "qemu-common.h"
>>  #include "pci.h"
>>  
>> +void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
>> +
>>  int msix_init(PCIDevice *pdev, unsigned short nentries,
>>                MemoryRegion *bar,
>>                unsigned bar_nr, unsigned bar_size);
>>
> 
> Interface looks good as fas as I can tell (can't asses the POWER need
> for clearing the mask bit on msix_set_message).


I do not know exactly how x86 works (who/how allocates addresses for MSI/MSIX). On POWER at the
moment I did the following thing in QEMU:

- registered memory_region_init_io at some big address which the guest won't use, it is just for QEMU
- put address from the previous step to the MSIX BAR via msix_set_message() when msi is being configured
- then the sequence looks like:
	- vfio_msi_interrupt() calls msix_notify()
	- msix_notify() checks if it is masked via msix_is_masked() - and here PCI_MSIX_ENTRY_CTRL_MASKBIT
must be unset
	- stl_le_phys() - here I get a notification in my MemoryRegionOps::write() and do qemu_irq_pulse()

2 reasons to do that:
1) I did not have to change either msix or vfio - cool for submitting patches;
2) neither the POWER guest nor QEMU changes the MSI or MSI-X PCI config (it is done by a different
mechanism called RTAS), so I have to do this myself to support 1), and I do not have to care about
someone breaking my settings


>> -- 
>> 1.7.10
>>
>> ps. double '-' and git version is an end-of-patch scissor as I read somewhere, cannot recall where exactly 
> 
> Check man git-am.

Ahhh. Confused end-of-message with end-of-patch. I'll repost it.
Michael S. Tsirkin July 18, 2012, 12:43 p.m. UTC | #3
On Thu, Jun 21, 2012 at 09:39:10PM +1000, Alexey Kardashevskiy wrote:
> Added (msi|msix)_set_message() functions.
> 
> Currently msi_notify()/msix_notify() write to these vectors to
> signal the guest about an interrupt so the correct values have to
> written there by the guest or QEMU.
> 
> For example, POWER guest never initializes MSI/MSIX vectors, instead
> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
> POWER we have to initialize MSI/MSIX message from QEMU.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

So guests do enable MSI through config space, but do
not fill in vectors? Very strange. Are you sure it's not
just a guest bug? How does it work for other PCI devices?
Can't we just fix guest drivers to program the vectors properly?

Also pls address the comment below.

Thanks!

> ---
>  hw/msi.c  |   13 +++++++++++++
>  hw/msi.h  |    1 +
>  hw/msix.c |    9 +++++++++
>  hw/msix.h |    2 ++
>  4 files changed, 25 insertions(+)
> 
> diff --git a/hw/msi.c b/hw/msi.c
> index 5233204..cc6102f 100644
> --- a/hw/msi.c
> +++ b/hw/msi.c
> @@ -105,6 +105,19 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
>  }
>  
> +void msi_set_message(PCIDevice *dev, MSIMessage msg)
> +{
> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> +
> +    if (msi64bit) {
> +        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
> +    } else {
> +        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
> +    }
> +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
> +}
> +

Please add documentation. Something like

/*
 * Special API for POWER to configure the vectors through
 * a side channel. Should never be used by devices.
 */

>  bool msi_enabled(const PCIDevice *dev)
>  {
>      return msi_present(dev) &&
> diff --git a/hw/msi.h b/hw/msi.h
> index 75747ab..6ec1f99 100644
> --- a/hw/msi.h
> +++ b/hw/msi.h
> @@ -31,6 +31,7 @@ struct MSIMessage {
>  
>  extern bool msi_supported;
>  
> +void msi_set_message(PCIDevice *dev, MSIMessage msg);
>  bool msi_enabled(const PCIDevice *dev);
>  int msi_init(struct PCIDevice *dev, uint8_t offset,
>               unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
> diff --git a/hw/msix.c b/hw/msix.c
> index ded3c55..5f7d6d3 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -45,6 +45,15 @@ static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>      return msg;
>  }
>  
> +void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
> +{
> +    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
> +
> +    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
> +    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
> +    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
> +}
> +
>  /* Add MSI-X capability to the config space for the device. */
>  /* Given a bar and its size, add MSI-X table on top of it
>   * and fill MSI-X capability in the config space.
> diff --git a/hw/msix.h b/hw/msix.h
> index 50aee82..26a437e 100644
> --- a/hw/msix.h
> +++ b/hw/msix.h
> @@ -4,6 +4,8 @@
>  #include "qemu-common.h"
>  #include "pci.h"
>  
> +void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
> +
>  int msix_init(PCIDevice *pdev, unsigned short nentries,
>                MemoryRegion *bar,
>                unsigned bar_nr, unsigned bar_size);
> -- 
> 1.7.10
> 
> ps. double '-' and git version is an end-of-patch scissor as I read somewhere, cannot recall where exactly :)
> 
> 
> 
> 
> 
> 
> On 21/06/12 20:56, Jan Kiszka wrote:
> > On 2012-06-21 12:50, Alexey Kardashevskiy wrote:
> >> On 21/06/12 20:38, Jan Kiszka wrote:
> >>> On 2012-06-21 12:28, Alexey Kardashevskiy wrote:
> >>>> On 21/06/12 17:39, Jan Kiszka wrote:
> >>>>> On 2012-06-21 09:18, Alexey Kardashevskiy wrote:
> >>>>>>
> >>>>>> agrhhh. sha1 of the patch changed after rebasing :)
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> Added (msi|msix)_(set|get)_message() function for whoever might
> >>>>>> want to use them.
> >>>>>>
> >>>>>> Currently msi_notify()/msix_notify() write to these vectors to
> >>>>>> signal the guest about an interrupt so the correct values have to
> >>>>>> written there by the guest or QEMU.
> >>>>>>
> >>>>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
> >>>>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
> >>>>>> POWER we have to initialize MSI/MSIX message from QEMU.
> >>>>>>
> >>>>>> As only set* function are required by now, the "get" functions were added
> >>>>>> or made public for a symmetry.
> >>>>>>
> >>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>>> ---
> >>>>>>  hw/msi.c  |   29 +++++++++++++++++++++++++++++
> >>>>>>  hw/msi.h  |    2 ++
> >>>>>>  hw/msix.c |   11 ++++++++++-
> >>>>>>  hw/msix.h |    3 +++
> >>>>>>  4 files changed, 44 insertions(+), 1 deletion(-)
> >>>>>>
> >>>>>> diff --git a/hw/msi.c b/hw/msi.c
> >>>>>> index 5233204..9ad84a4 100644
> >>>>>> --- a/hw/msi.c
> >>>>>> +++ b/hw/msi.c
> >>>>>> @@ -105,6 +105,35 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
> >>>>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
> >>>>>>  }
> >>>>>>  
> >>>>>> +MSIMessage msi_get_message(PCIDevice *dev)
> >>>>>
> >>>>> MSIMessage msi_get_message(PCIDevice *dev, unsigned vector)
> >>>>
> >>>>
> >>>> Who/how/why is going to calculate the vector here?
> >>>>
> >>>>>
> >>>>>> +{
> >>>>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> >>>>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> >>>>>> +    MSIMessage msg;
> >>>>>> +
> >>>>>> +    if (msi64bit) {
> >>>>>> +        msg.address = pci_get_quad(dev->config + msi_address_lo_off(dev));
> >>>>>> +    } else {
> >>>>>> +        msg.address = pci_get_long(dev->config + msi_address_lo_off(dev));
> >>>>>> +    }
> >>>>>> +    msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
> >>>>>
> >>>>> And I have this here in addition:
> >>>>>
> >>>>>     unsigned int nr_vectors = msi_nr_vectors(flags);
> >>>>>     ...
> >>>>>
> >>>>>     if (nr_vectors > 1) {
> >>>>>         msg.data &= ~(nr_vectors - 1);
> >>>>>         msg.data |= vector;
> >>>>>     }
> >>>>>
> >>>>> See PCI spec and existing code.
> >>>>
> >>>>
> >>>> What for? I really do not get it why someone might want to read something but not real value.
> >>>> What PCI code should I look?
> >>>
> >>> I'm not sure what your use case for reading the message is. For KVM
> >>> device assignment it is preparing an alternative message delivery path
> >>> for MSI vectors. And for this we will need vector notifier support for
> >>> MSI as well. You can check the MSI-X code for corresponding use cases of
> >>> msix_get_message.
> >>
> >>> And when we already have msi_get_message, another logical use case is
> >>> msi_notify. See msix.c again.
> >>
> >> Aaaa.
> >>
> >> I have no case for reading the message. All I need is writing. And I want it public as I want to use
> >> it from hw/spapr_pci.c. You suggested to add reading, I added "get" to be _symmetric_ to "set"
> >> ("get" returns what "set" wrote). You want a different thing which I can do but it is not
> >> msi_get_message(), it is something like msi_prepare_message(MSImessage msg) or
> >> msi_set_vector(uint16_t data) or simply internal kitchen of msi_notify().
> >>
> >> Still can do what you suggested, it just does not seem right.
> > 
> > It is right - when looking at it from a different angle. ;)
> > 
> > I don't mind if you add msi_get_message now or leave this to me. Likely
> > the latter is better as you have no use case for msi_get_message (and
> > also msix_get_message!) outside of their modules, thus we should not
> > export those functions anyway.
> > 
> > Jan
> > 
> 
> 
> -- 
> Alexey
Alexey Kardashevskiy July 18, 2012, 1:17 p.m. UTC | #4
On 18/07/12 22:43, Michael S. Tsirkin wrote:
> On Thu, Jun 21, 2012 at 09:39:10PM +1000, Alexey Kardashevskiy wrote:
>> Added (msi|msix)_set_message() functions.
>>
>> Currently msi_notify()/msix_notify() write to these vectors to
>> signal the guest about an interrupt so the correct values have to
>> written there by the guest or QEMU.
>>
>> For example, POWER guest never initializes MSI/MSIX vectors, instead
>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
>> POWER we have to initialize MSI/MSIX message from QEMU.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> 
> So guests do enable MSI through config space, but do
> not fill in vectors? 

Yes. msix_capability_init() calls arch_setup_msi_irqs() which does everything it needs to do (i.e. calls hypervisor) before msix_capability_init() writes PCI_MSIX_FLAGS_ENABLE to the PCI_MSIX_FLAGS register.

These vectors are PCI bus addresses; the way they are set is specific to the PCI host controller, so I do not see why the current scheme is a bug.


> Very strange. Are you sure it's not
> just a guest bug? How does it work for other PCI devices?

Did not get the question. It works the same for every PCI device under POWER guest.


> Can't we just fix guest drivers to program the vectors properly?
> 
> Also pls address the comment below.

Comment below.

> Thanks!
> 
>> ---
>>  hw/msi.c  |   13 +++++++++++++
>>  hw/msi.h  |    1 +
>>  hw/msix.c |    9 +++++++++
>>  hw/msix.h |    2 ++
>>  4 files changed, 25 insertions(+)
>>
>> diff --git a/hw/msi.c b/hw/msi.c
>> index 5233204..cc6102f 100644
>> --- a/hw/msi.c
>> +++ b/hw/msi.c
>> @@ -105,6 +105,19 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
>>  }
>>  
>> +void msi_set_message(PCIDevice *dev, MSIMessage msg)
>> +{
>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
>> +
>> +    if (msi64bit) {
>> +        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
>> +    } else {
>> +        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
>> +    }
>> +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
>> +}
>> +
> 
> Please add documentation. Something like
> 
> /*
>  * Special API for POWER to configure the vectors through
>  * a side channel. Should never be used by devices.
>  */


It is useful for any para-virtualized environment, I believe, isn't it?
For s390 as well - provided it supports PCI, which I am not sure it does :)



>>  bool msi_enabled(const PCIDevice *dev)
>>  {
>>      return msi_present(dev) &&
>> diff --git a/hw/msi.h b/hw/msi.h
>> index 75747ab..6ec1f99 100644
>> --- a/hw/msi.h
>> +++ b/hw/msi.h
>> @@ -31,6 +31,7 @@ struct MSIMessage {
>>  
>>  extern bool msi_supported;
>>  
>> +void msi_set_message(PCIDevice *dev, MSIMessage msg);
>>  bool msi_enabled(const PCIDevice *dev);
>>  int msi_init(struct PCIDevice *dev, uint8_t offset,
>>               unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
>> diff --git a/hw/msix.c b/hw/msix.c
>> index ded3c55..5f7d6d3 100644
>> --- a/hw/msix.c
>> +++ b/hw/msix.c
>> @@ -45,6 +45,15 @@ static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>>      return msg;
>>  }
>>  
>> +void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
>> +{
>> +    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
>> +
>> +    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
>> +    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
>> +    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
>> +}
>> +
>>  /* Add MSI-X capability to the config space for the device. */
>>  /* Given a bar and its size, add MSI-X table on top of it
>>   * and fill MSI-X capability in the config space.
>> diff --git a/hw/msix.h b/hw/msix.h
>> index 50aee82..26a437e 100644
>> --- a/hw/msix.h
>> +++ b/hw/msix.h
>> @@ -4,6 +4,8 @@
>>  #include "qemu-common.h"
>>  #include "pci.h"
>>  
>> +void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
>> +
>>  int msix_init(PCIDevice *pdev, unsigned short nentries,
>>                MemoryRegion *bar,
>>                unsigned bar_nr, unsigned bar_size);
>> -- 
>> 1.7.10
>>
>> ps. double '-' and git version is an end-of-patch scissor as I read somewhere, cannot recall where exactly :)
>>
>>
>>
>>
>>
>>
>> On 21/06/12 20:56, Jan Kiszka wrote:
>>> On 2012-06-21 12:50, Alexey Kardashevskiy wrote:
>>>> On 21/06/12 20:38, Jan Kiszka wrote:
>>>>> On 2012-06-21 12:28, Alexey Kardashevskiy wrote:
>>>>>> On 21/06/12 17:39, Jan Kiszka wrote:
>>>>>>> On 2012-06-21 09:18, Alexey Kardashevskiy wrote:
>>>>>>>>
>>>>>>>> agrhhh. sha1 of the patch changed after rebasing :)
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> Added (msi|msix)_(set|get)_message() function for whoever might
>>>>>>>> want to use them.
>>>>>>>>
>>>>>>>> Currently msi_notify()/msix_notify() write to these vectors to
>>>>>>>> signal the guest about an interrupt so the correct values have to
>>>>>>>> written there by the guest or QEMU.
>>>>>>>>
>>>>>>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
>>>>>>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
>>>>>>>> POWER we have to initialize MSI/MSIX message from QEMU.
>>>>>>>>
>>>>>>>> As only set* function are required by now, the "get" functions were added
>>>>>>>> or made public for a symmetry.
>>>>>>>>
>>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>>> ---
>>>>>>>>  hw/msi.c  |   29 +++++++++++++++++++++++++++++
>>>>>>>>  hw/msi.h  |    2 ++
>>>>>>>>  hw/msix.c |   11 ++++++++++-
>>>>>>>>  hw/msix.h |    3 +++
>>>>>>>>  4 files changed, 44 insertions(+), 1 deletion(-)
>>>>>>>>
>>>>>>>> diff --git a/hw/msi.c b/hw/msi.c
>>>>>>>> index 5233204..9ad84a4 100644
>>>>>>>> --- a/hw/msi.c
>>>>>>>> +++ b/hw/msi.c
>>>>>>>> @@ -105,6 +105,35 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
>>>>>>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
>>>>>>>>  }
>>>>>>>>  
>>>>>>>> +MSIMessage msi_get_message(PCIDevice *dev)
>>>>>>>
>>>>>>> MSIMessage msi_get_message(PCIDevice *dev, unsigned vector)
>>>>>>
>>>>>>
>>>>>> Who/how/why is going to calculate the vector here?
>>>>>>
>>>>>>>
>>>>>>>> +{
>>>>>>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
>>>>>>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
>>>>>>>> +    MSIMessage msg;
>>>>>>>> +
>>>>>>>> +    if (msi64bit) {
>>>>>>>> +        msg.address = pci_get_quad(dev->config + msi_address_lo_off(dev));
>>>>>>>> +    } else {
>>>>>>>> +        msg.address = pci_get_long(dev->config + msi_address_lo_off(dev));
>>>>>>>> +    }
>>>>>>>> +    msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
>>>>>>>
>>>>>>> And I have this here in addition:
>>>>>>>
>>>>>>>     unsigned int nr_vectors = msi_nr_vectors(flags);
>>>>>>>     ...
>>>>>>>
>>>>>>>     if (nr_vectors > 1) {
>>>>>>>         msg.data &= ~(nr_vectors - 1);
>>>>>>>         msg.data |= vector;
>>>>>>>     }
>>>>>>>
>>>>>>> See PCI spec and existing code.
>>>>>>
>>>>>>
>>>>>> What for? I really do not get it why someone might want to read something but not real value.
>>>>>> What PCI code should I look?
>>>>>
>>>>> I'm not sure what your use case for reading the message is. For KVM
>>>>> device assignment it is preparing an alternative message delivery path
>>>>> for MSI vectors. And for this we will need vector notifier support for
>>>>> MSI as well. You can check the MSI-X code for corresponding use cases of
>>>>> msix_get_message.
>>>>
>>>>> And when we already have msi_get_message, another logical use case is
>>>>> msi_notify. See msix.c again.
>>>>
>>>> Aaaa.
>>>>
>>>> I have no case for reading the message. All I need is writing. And I want it public as I want to use
>>>> it from hw/spapr_pci.c. You suggested to add reading, I added "get" to be _symmetric_ to "set"
>>>> ("get" returns what "set" wrote). You want a different thing which I can do but it is not
>>>> msi_get_message(), it is something like msi_prepare_message(MSImessage msg) or
>>>> msi_set_vector(uint16_t data) or simply internal kitchen of msi_notify().
>>>>
>>>> Still can do what you suggested, it just does not seem right.
>>>
>>> It is right - when looking at it from a different angle. ;)
>>>
>>> I don't mind if you add msi_get_message now or leave this to me. Likely
>>> the latter is better as you have no use case for msi_get_message (and
>>> also msix_get_message!) outside of their modules, thus we should not
>>> export those functions anyway.
Michael S. Tsirkin July 18, 2012, 3:23 p.m. UTC | #5
On Wed, Jul 18, 2012 at 11:17:12PM +1000, Alexey Kardashevskiy wrote:
> On 18/07/12 22:43, Michael S. Tsirkin wrote:
> > On Thu, Jun 21, 2012 at 09:39:10PM +1000, Alexey Kardashevskiy wrote:
> >> Added (msi|msix)_set_message() functions.
> >>
> >> Currently msi_notify()/msix_notify() write to these vectors to
> >> signal the guest about an interrupt so the correct values have to
> >> written there by the guest or QEMU.
> >>
> >> For example, POWER guest never initializes MSI/MSIX vectors, instead
> >> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
> >> POWER we have to initialize MSI/MSIX message from QEMU.
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > 
> > So guests do enable MSI through config space, but do
> > not fill in vectors? 
> 
> Yes. msix_capability_init() calls arch_setup_msi_irqs() which does everything it needs to do (i.e. calls hypervisor) before msix_capability_init() writes PCI_MSIX_FLAGS_ENABLE to the PCI_MSIX_FLAGS register.
> 
> These vectors are the PCI bus addresses, the way they are set is specific for a PCI host controller, I do not see why the current scheme is a bug.

It won't work with any real PCI device, will it? Real pci devices expect
vectors to be written into their memory.

> > Very strange. Are you sure it's not
> > just a guest bug? How does it work for other PCI devices?
> 
> Did not get the question. It works the same for every PCI device under POWER guest.

I mean for real PCI devices.

> > Can't we just fix guest drivers to program the vectors properly?
> > 
> > Also pls address the comment below.
> 
> Comment below.
> 
> > Thanks!
> > 
> >> ---
> >>  hw/msi.c  |   13 +++++++++++++
> >>  hw/msi.h  |    1 +
> >>  hw/msix.c |    9 +++++++++
> >>  hw/msix.h |    2 ++
> >>  4 files changed, 25 insertions(+)
> >>
> >> diff --git a/hw/msi.c b/hw/msi.c
> >> index 5233204..cc6102f 100644
> >> --- a/hw/msi.c
> >> +++ b/hw/msi.c
> >> @@ -105,6 +105,19 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
> >>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
> >>  }
> >>  
> >> +void msi_set_message(PCIDevice *dev, MSIMessage msg)
> >> +{
> >> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> >> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> >> +
> >> +    if (msi64bit) {
> >> +        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
> >> +    } else {
> >> +        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
> >> +    }
> >> +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
> >> +}
> >> +
> > 
> > Please add documentation. Something like
> > 
> > /*
> >  * Special API for POWER to configure the vectors through
> >  * a side channel. Should never be used by devices.
> >  */
> 
> 
> It is useful for any para-virtualized environment I believe, is not it?
> For s390 as well. Of course, if it supports PCI, for example, what I am not sure it does though :)

I expect the normal guest to program the address into MSI register using
config accesses, same way that it enables MSI/MSIX.
Why POWER does it differently I did not yet figure out but I hope
this weirdness is not so widespread.

> >>  bool msi_enabled(const PCIDevice *dev)
> >>  {
> >>      return msi_present(dev) &&
> >> diff --git a/hw/msi.h b/hw/msi.h
> >> index 75747ab..6ec1f99 100644
> >> --- a/hw/msi.h
> >> +++ b/hw/msi.h
> >> @@ -31,6 +31,7 @@ struct MSIMessage {
> >>  
> >>  extern bool msi_supported;
> >>  
> >> +void msi_set_message(PCIDevice *dev, MSIMessage msg);
> >>  bool msi_enabled(const PCIDevice *dev);
> >>  int msi_init(struct PCIDevice *dev, uint8_t offset,
> >>               unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
> >> diff --git a/hw/msix.c b/hw/msix.c
> >> index ded3c55..5f7d6d3 100644
> >> --- a/hw/msix.c
> >> +++ b/hw/msix.c
> >> @@ -45,6 +45,15 @@ static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
> >>      return msg;
> >>  }
> >>  
> >> +void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
> >> +{
> >> +    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
> >> +
> >> +    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
> >> +    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
> >> +    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
> >> +}
> >> +
> >>  /* Add MSI-X capability to the config space for the device. */
> >>  /* Given a bar and its size, add MSI-X table on top of it
> >>   * and fill MSI-X capability in the config space.
> >> diff --git a/hw/msix.h b/hw/msix.h
> >> index 50aee82..26a437e 100644
> >> --- a/hw/msix.h
> >> +++ b/hw/msix.h
> >> @@ -4,6 +4,8 @@
> >>  #include "qemu-common.h"
> >>  #include "pci.h"
> >>  
> >> +void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
> >> +
> >>  int msix_init(PCIDevice *pdev, unsigned short nentries,
> >>                MemoryRegion *bar,
> >>                unsigned bar_nr, unsigned bar_size);
> >> -- 
> >> 1.7.10
> >>
> >> ps. double '-' and git version is an end-of-patch scissor as I read somewhere, cannot recall where exactly :)
> >>
> >>
> >>
> >>
> >>
> >>
> >> On 21/06/12 20:56, Jan Kiszka wrote:
> >>> On 2012-06-21 12:50, Alexey Kardashevskiy wrote:
> >>>> On 21/06/12 20:38, Jan Kiszka wrote:
> >>>>> On 2012-06-21 12:28, Alexey Kardashevskiy wrote:
> >>>>>> On 21/06/12 17:39, Jan Kiszka wrote:
> >>>>>>> On 2012-06-21 09:18, Alexey Kardashevskiy wrote:
> >>>>>>>>
> >>>>>>>> agrhhh. sha1 of the patch changed after rebasing :)
> >>>>>>>>
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> Added (msi|msix)_(set|get)_message() function for whoever might
> >>>>>>>> want to use them.
> >>>>>>>>
> >>>>>>>> Currently msi_notify()/msix_notify() write to these vectors to
> >>>>>>>> signal the guest about an interrupt so the correct values have to
> >>>>>>>> written there by the guest or QEMU.
> >>>>>>>>
> >>>>>>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
> >>>>>>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
> >>>>>>>> POWER we have to initialize MSI/MSIX message from QEMU.
> >>>>>>>>
> >>>>>>>> As only set* function are required by now, the "get" functions were added
> >>>>>>>> or made public for a symmetry.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>>>>> ---
> >>>>>>>>  hw/msi.c  |   29 +++++++++++++++++++++++++++++
> >>>>>>>>  hw/msi.h  |    2 ++
> >>>>>>>>  hw/msix.c |   11 ++++++++++-
> >>>>>>>>  hw/msix.h |    3 +++
> >>>>>>>>  4 files changed, 44 insertions(+), 1 deletion(-)
> >>>>>>>>
> >>>>>>>> diff --git a/hw/msi.c b/hw/msi.c
> >>>>>>>> index 5233204..9ad84a4 100644
> >>>>>>>> --- a/hw/msi.c
> >>>>>>>> +++ b/hw/msi.c
> >>>>>>>> @@ -105,6 +105,35 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
> >>>>>>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
> >>>>>>>>  }
> >>>>>>>>  
> >>>>>>>> +MSIMessage msi_get_message(PCIDevice *dev)
> >>>>>>>
> >>>>>>> MSIMessage msi_get_message(PCIDevice *dev, unsigned vector)
> >>>>>>
> >>>>>>
> >>>>>> Who/how/why is going to calculate the vector here?
> >>>>>>
> >>>>>>>
> >>>>>>>> +{
> >>>>>>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> >>>>>>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> >>>>>>>> +    MSIMessage msg;
> >>>>>>>> +
> >>>>>>>> +    if (msi64bit) {
> >>>>>>>> +        msg.address = pci_get_quad(dev->config + msi_address_lo_off(dev));
> >>>>>>>> +    } else {
> >>>>>>>> +        msg.address = pci_get_long(dev->config + msi_address_lo_off(dev));
> >>>>>>>> +    }
> >>>>>>>> +    msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
> >>>>>>>
> >>>>>>> And I have this here in addition:
> >>>>>>>
> >>>>>>>     unsigned int nr_vectors = msi_nr_vectors(flags);
> >>>>>>>     ...
> >>>>>>>
> >>>>>>>     if (nr_vectors > 1) {
> >>>>>>>         msg.data &= ~(nr_vectors - 1);
> >>>>>>>         msg.data |= vector;
> >>>>>>>     }
> >>>>>>>
> >>>>>>> See PCI spec and existing code.
> >>>>>>
> >>>>>>
> >>>>>> What for? I really do not get it why someone might want to read something but not real value.
> >>>>>> What PCI code should I look?
> >>>>>
> >>>>> I'm not sure what your use case for reading the message is. For KVM
> >>>>> device assignment it is preparing an alternative message delivery path
> >>>>> for MSI vectors. And for this we will need vector notifier support for
> >>>>> MSI as well. You can check the MSI-X code for corresponding use cases of
> >>>>> msix_get_message.
> >>>>
> >>>>> And when we already have msi_get_message, another logical use case is
> >>>>> msi_notify. See msix.c again.
> >>>>
> >>>> Aaaa.
> >>>>
> >>>> I have no case for reading the message. All I need is writing. And I want it public as I want to use
> >>>> it from hw/spapr_pci.c. You suggested to add reading, I added "get" to be _symmetric_ to "set"
> >>>> ("get" returns what "set" wrote). You want a different thing which I can do but it is not
> >>>> msi_get_message(), it is something like msi_prepare_message(MSImessage msg) or
> >>>> msi_set_vector(uint16_t data) or simply internal kitchen of msi_notify().
> >>>>
> >>>> Still can do what you suggested, it just does not seem right.
> >>>
> >>> It is right - when looking at it from a different angle. ;)
> >>>
> >>> I don't mind if you add msi_get_message now or leave this to me. Likely
> >>> the latter is better as you have no use case for msi_get_message (and
> >>> also msix_get_message!) outside of their modules, thus we should not
> >>> export those functions anyway.
> 
> 
> 
> -- 
> Alexey
>
Alexey Kardashevskiy July 19, 2012, 12:32 a.m. UTC | #6
On 19/07/12 01:23, Michael S. Tsirkin wrote:
> On Wed, Jul 18, 2012 at 11:17:12PM +1000, Alexey Kardashevskiy wrote:
>> On 18/07/12 22:43, Michael S. Tsirkin wrote:
>>> On Thu, Jun 21, 2012 at 09:39:10PM +1000, Alexey Kardashevskiy wrote:
>>>> Added (msi|msix)_set_message() functions.
>>>>
>>>> Currently msi_notify()/msix_notify() write to these vectors to
>>>> signal the guest about an interrupt so the correct values have to
>>>> written there by the guest or QEMU.
>>>>
>>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
>>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
>>>> POWER we have to initialize MSI/MSIX message from QEMU.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>
>>> So guests do enable MSI through config space, but do
>>> not fill in vectors? 
>>
>> Yes. msix_capability_init() calls arch_setup_msi_irqs() which does everything it needs to do (i.e. calls hypervisor) before msix_capability_init() writes PCI_MSIX_FLAGS_ENABLE to the PCI_MSIX_FLAGS register.
>>
>> These vectors are the PCI bus addresses, the way they are set is specific for a PCI host controller, I do not see why the current scheme is a bug.
> 
> I won't work with any real PCI device, will it? Real pci devices expect
> vectors to be written into their memory.


Yes. And the hypervisor does this. On POWER (at least book3s - server powerpc), the whole config space kitchen is hidden behind RTAS (a kind of BIOS). For the guest, this RTAS is implemented in the hypervisor; for the host, in the system firmware. So powerpc Linux does not have to have PHB drivers. Kinda cool.

A usual powerpc server runs without a host Linux at all; it runs a hypervisor called pHyp. Every guest knows that it is a guest: there is no full machine emulation, it is para-virtualization. In power-kvm, we replace pHyp with the host Linux, and QEMU now plays the hypervisor role. Some day we will move the hypervisor to the host kernel completely (?), but for now it is in QEMU.


>>> Very strange. Are you sure it's not
>>> just a guest bug? How does it work for other PCI devices?
>>
>> Did not get the question. It works the same for every PCI device under POWER guest.
> 
> I mean for real PCI devices.
> 
>>> Can't we just fix guest drivers to program the vectors properly?
>>>
>>> Also pls address the comment below.
>>
>> Comment below.
>>
>>> Thanks!
>>>
>>>> ---
>>>>  hw/msi.c  |   13 +++++++++++++
>>>>  hw/msi.h  |    1 +
>>>>  hw/msix.c |    9 +++++++++
>>>>  hw/msix.h |    2 ++
>>>>  4 files changed, 25 insertions(+)
>>>>
>>>> diff --git a/hw/msi.c b/hw/msi.c
>>>> index 5233204..cc6102f 100644
>>>> --- a/hw/msi.c
>>>> +++ b/hw/msi.c
>>>> @@ -105,6 +105,19 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
>>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
>>>>  }
>>>>  
>>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg)
>>>> +{
>>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
>>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
>>>> +
>>>> +    if (msi64bit) {
>>>> +        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
>>>> +    } else {
>>>> +        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
>>>> +    }
>>>> +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
>>>> +}
>>>> +
>>>
>>> Please add documentation. Something like
>>>
>>> /*
>>>  * Special API for POWER to configure the vectors through
>>>  * a side channel. Should never be used by devices.
>>>  */
>>
>>
>> It is useful for any para-virtualized environment I believe, is not it?
>> For s390 as well. Of course, if it supports PCI, for example, what I am not sure it does though :)
> 
> I expect the normal guest to program the address into MSI register using
> config accesses, same way that it enables MSI/MSIX.
> Why POWER does it differently I did not yet figure out but I hope
> this weirdness is not so widespread.


In para-virt I would expect the guest not to touch config space at all. At the very least it should use one interface rather than two, but this is how it is.


>>>>  bool msi_enabled(const PCIDevice *dev)
>>>>  {
>>>>      return msi_present(dev) &&
>>>> diff --git a/hw/msi.h b/hw/msi.h
>>>> index 75747ab..6ec1f99 100644
>>>> --- a/hw/msi.h
>>>> +++ b/hw/msi.h
>>>> @@ -31,6 +31,7 @@ struct MSIMessage {
>>>>  
>>>>  extern bool msi_supported;
>>>>  
>>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg);
>>>>  bool msi_enabled(const PCIDevice *dev);
>>>>  int msi_init(struct PCIDevice *dev, uint8_t offset,
>>>>               unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
>>>> diff --git a/hw/msix.c b/hw/msix.c
>>>> index ded3c55..5f7d6d3 100644
>>>> --- a/hw/msix.c
>>>> +++ b/hw/msix.c
>>>> @@ -45,6 +45,15 @@ static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>>>>      return msg;
>>>>  }
>>>>  
>>>> +void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
>>>> +{
>>>> +    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
>>>> +
>>>> +    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
>>>> +    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
>>>> +    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
>>>> +}
>>>> +
>>>>  /* Add MSI-X capability to the config space for the device. */
>>>>  /* Given a bar and its size, add MSI-X table on top of it
>>>>   * and fill MSI-X capability in the config space.
>>>> diff --git a/hw/msix.h b/hw/msix.h
>>>> index 50aee82..26a437e 100644
>>>> --- a/hw/msix.h
>>>> +++ b/hw/msix.h
>>>> @@ -4,6 +4,8 @@
>>>>  #include "qemu-common.h"
>>>>  #include "pci.h"
>>>>  
>>>> +void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
>>>> +
>>>>  int msix_init(PCIDevice *pdev, unsigned short nentries,
>>>>                MemoryRegion *bar,
>>>>                unsigned bar_nr, unsigned bar_size);
>>>> -- 
>>>> 1.7.10
>>>>
>>>> ps. double '-' and git version is an end-of-patch scissor as I read somewhere, cannot recall where exactly :)
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>> On 21/06/12 20:56, Jan Kiszka wrote:
>>>>> On 2012-06-21 12:50, Alexey Kardashevskiy wrote:
>>>>>> On 21/06/12 20:38, Jan Kiszka wrote:
>>>>>>> On 2012-06-21 12:28, Alexey Kardashevskiy wrote:
>>>>>>>> On 21/06/12 17:39, Jan Kiszka wrote:
>>>>>>>>> On 2012-06-21 09:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>
>>>>>>>>>> agrhhh. sha1 of the patch changed after rebasing :)
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Added (msi|msix)_(set|get)_message() function for whoever might
>>>>>>>>>> want to use them.
>>>>>>>>>>
>>>>>>>>>> Currently msi_notify()/msix_notify() write to these vectors to
>>>>>>>>>> signal the guest about an interrupt so the correct values have to
>>>>>>>>>> written there by the guest or QEMU.
>>>>>>>>>>
>>>>>>>>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
>>>>>>>>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
>>>>>>>>>> POWER we have to initialize MSI/MSIX message from QEMU.
>>>>>>>>>>
>>>>>>>>>> As only set* function are required by now, the "get" functions were added
>>>>>>>>>> or made public for a symmetry.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>>>>> ---
>>>>>>>>>>  hw/msi.c  |   29 +++++++++++++++++++++++++++++
>>>>>>>>>>  hw/msi.h  |    2 ++
>>>>>>>>>>  hw/msix.c |   11 ++++++++++-
>>>>>>>>>>  hw/msix.h |    3 +++
>>>>>>>>>>  4 files changed, 44 insertions(+), 1 deletion(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/hw/msi.c b/hw/msi.c
>>>>>>>>>> index 5233204..9ad84a4 100644
>>>>>>>>>> --- a/hw/msi.c
>>>>>>>>>> +++ b/hw/msi.c
>>>>>>>>>> @@ -105,6 +105,35 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
>>>>>>>>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
>>>>>>>>>>  }
>>>>>>>>>>  
>>>>>>>>>> +MSIMessage msi_get_message(PCIDevice *dev)
>>>>>>>>>
>>>>>>>>> MSIMessage msi_get_message(PCIDevice *dev, unsigned vector)
>>>>>>>>
>>>>>>>>
>>>>>>>> Who/how/why is going to calculate the vector here?
>>>>>>>>
>>>>>>>>>
>>>>>>>>>> +{
>>>>>>>>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
>>>>>>>>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
>>>>>>>>>> +    MSIMessage msg;
>>>>>>>>>> +
>>>>>>>>>> +    if (msi64bit) {
>>>>>>>>>> +        msg.address = pci_get_quad(dev->config + msi_address_lo_off(dev));
>>>>>>>>>> +    } else {
>>>>>>>>>> +        msg.address = pci_get_long(dev->config + msi_address_lo_off(dev));
>>>>>>>>>> +    }
>>>>>>>>>> +    msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
>>>>>>>>>
>>>>>>>>> And I have this here in addition:
>>>>>>>>>
>>>>>>>>>     unsigned int nr_vectors = msi_nr_vectors(flags);
>>>>>>>>>     ...
>>>>>>>>>
>>>>>>>>>     if (nr_vectors > 1) {
>>>>>>>>>         msg.data &= ~(nr_vectors - 1);
>>>>>>>>>         msg.data |= vector;
>>>>>>>>>     }
>>>>>>>>>
>>>>>>>>> See PCI spec and existing code.
>>>>>>>>
>>>>>>>>
>>>>>>>> What for? I really do not get it why someone might want to read something but not real value.
>>>>>>>> What PCI code should I look?
>>>>>>>
>>>>>>> I'm not sure what your use case for reading the message is. For KVM
>>>>>>> device assignment it is preparing an alternative message delivery path
>>>>>>> for MSI vectors. And for this we will need vector notifier support for
>>>>>>> MSI as well. You can check the MSI-X code for corresponding use cases of
>>>>>>> msix_get_message.
>>>>>>
>>>>>>> And when we already have msi_get_message, another logical use case is
>>>>>>> msi_notify. See msix.c again.
>>>>>>
>>>>>> Aaaa.
>>>>>>
>>>>>> I have no case for reading the message. All I need is writing. And I want it public as I want to use
>>>>>> it from hw/spapr_pci.c. You suggested to add reading, I added "get" to be _symmetric_ to "set"
>>>>>> ("get" returns what "set" wrote). You want a different thing which I can do but it is not
>>>>>> msi_get_message(), it is something like msi_prepare_message(MSImessage msg) or
>>>>>> msi_set_vector(uint16_t data) or simply internal kitchen of msi_notify().
>>>>>>
>>>>>> Still can do what you suggested, it just does not seem right.
>>>>>
>>>>> It is right - when looking at it from a different angle. ;)
>>>>>
>>>>> I don't mind if you add msi_get_message now or leave this to me. Likely
>>>>> the latter is better as you have no use case for msi_get_message (and
>>>>> also msix_get_message!) outside of their modules, thus we should not
>>>>> export those functions anyway.
Michael S. Tsirkin July 19, 2012, 9:27 a.m. UTC | #7
On Thu, Jul 19, 2012 at 10:32:40AM +1000, Alexey Kardashevskiy wrote:
> On 19/07/12 01:23, Michael S. Tsirkin wrote:
> > On Wed, Jul 18, 2012 at 11:17:12PM +1000, Alexey Kardashevskiy wrote:
> >> On 18/07/12 22:43, Michael S. Tsirkin wrote:
> >>> On Thu, Jun 21, 2012 at 09:39:10PM +1000, Alexey Kardashevskiy wrote:
> >>>> Added (msi|msix)_set_message() functions.
> >>>>
> >>>> Currently msi_notify()/msix_notify() write to these vectors to
> >>>> signal the guest about an interrupt so the correct values have to
> >>>> written there by the guest or QEMU.
> >>>>
> >>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
> >>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
> >>>> POWER we have to initialize MSI/MSIX message from QEMU.
> >>>>
> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>
> >>> So guests do enable MSI through config space, but do
> >>> not fill in vectors? 
> >>
> >> Yes. msix_capability_init() calls arch_setup_msi_irqs() which does everything it needs to do (i.e. calls hypervisor) before msix_capability_init() writes PCI_MSIX_FLAGS_ENABLE to the PCI_MSIX_FLAGS register.
> >>
> >> These vectors are the PCI bus addresses, the way they are set is specific for a PCI host controller, I do not see why the current scheme is a bug.
> > 
> > I won't work with any real PCI device, will it? Real pci devices expect
> > vectors to be written into their memory.
> 
> 
> Yes. And the hypervisor does this. On POWER (at least book3s - server powerpc, the whole config space kitchen is hidden behind RTAS (kind of bios). For the guest, this RTAS is implemented in hypervisor, for the host - in the system firmware. So powerpc linux does not have to have PHB drivers. Kinda cool.
> 
> Usual powerpc server is running without the host linux at all, it is running a hypervisor called pHyp. And every guest knows that it is a guest, there is no full machine emulation, it is para-virtualization. In power-kvm, we replace that pHyp with the host linux and now QEMU plays a hypervisor role. Some day We will move the hypervisor to the host kernel completely (?) but now it is in QEMU.

Okay. So it is a POWER-specific weirdness, as I suspected.
Sure, if this is what real hardware does, we pretty much have to
emulate this.

> >>> Very strange. Are you sure it's not
> >>> just a guest bug? How does it work for other PCI devices?
> >>
> >> Did not get the question. It works the same for every PCI device under POWER guest.
> > 
> > I mean for real PCI devices.
> > 
> >>> Can't we just fix guest drivers to program the vectors properly?
> >>>
> >>> Also pls address the comment below.
> >>
> >> Comment below.
> >>
> >>> Thanks!
> >>>
> >>>> ---
> >>>>  hw/msi.c  |   13 +++++++++++++
> >>>>  hw/msi.h  |    1 +
> >>>>  hw/msix.c |    9 +++++++++
> >>>>  hw/msix.h |    2 ++
> >>>>  4 files changed, 25 insertions(+)
> >>>>
> >>>> diff --git a/hw/msi.c b/hw/msi.c
> >>>> index 5233204..cc6102f 100644
> >>>> --- a/hw/msi.c
> >>>> +++ b/hw/msi.c
> >>>> @@ -105,6 +105,19 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
> >>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
> >>>>  }
> >>>>  
> >>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg)
> >>>> +{
> >>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> >>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> >>>> +
> >>>> +    if (msi64bit) {
> >>>> +        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
> >>>> +    } else {
> >>>> +        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
> >>>> +    }
> >>>> +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
> >>>> +}
> >>>> +
> >>>
> >>> Please add documentation. Something like
> >>>
> >>> /*
> >>>  * Special API for POWER to configure the vectors through
> >>>  * a side channel. Should never be used by devices.
> >>>  */
> >>
> >>
> >> It is useful for any para-virtualized environment I believe, is not it?
> >> For s390 as well. Of course, if it supports PCI, for example, what I am not sure it does though :)
> > 
> > I expect the normal guest to program the address into MSI register using
> > config accesses, same way that it enables MSI/MSIX.
> > Why POWER does it differently I did not yet figure out but I hope
> > this weirdness is not so widespread.
> 
> 
> In para-virt I would expect the guest not to touch config space at all. At least it should use one interface rather than two but this is how it is.

It's not new that firmware developers consistently make inconsistent
design decisions :)

> >>>>  bool msi_enabled(const PCIDevice *dev)
> >>>>  {
> >>>>      return msi_present(dev) &&
> >>>> diff --git a/hw/msi.h b/hw/msi.h
> >>>> index 75747ab..6ec1f99 100644
> >>>> --- a/hw/msi.h
> >>>> +++ b/hw/msi.h
> >>>> @@ -31,6 +31,7 @@ struct MSIMessage {
> >>>>  
> >>>>  extern bool msi_supported;
> >>>>  
> >>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg);
> >>>>  bool msi_enabled(const PCIDevice *dev);
> >>>>  int msi_init(struct PCIDevice *dev, uint8_t offset,
> >>>>               unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
> >>>> diff --git a/hw/msix.c b/hw/msix.c
> >>>> index ded3c55..5f7d6d3 100644
> >>>> --- a/hw/msix.c
> >>>> +++ b/hw/msix.c
> >>>> @@ -45,6 +45,15 @@ static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
> >>>>      return msg;
> >>>>  }
> >>>>  
> >>>> +void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
> >>>> +{
> >>>> +    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
> >>>> +
> >>>> +    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
> >>>> +    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
> >>>> +    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
> >>>> +}
> >>>> +
> >>>>  /* Add MSI-X capability to the config space for the device. */
> >>>>  /* Given a bar and its size, add MSI-X table on top of it
> >>>>   * and fill MSI-X capability in the config space.
> >>>> diff --git a/hw/msix.h b/hw/msix.h
> >>>> index 50aee82..26a437e 100644
> >>>> --- a/hw/msix.h
> >>>> +++ b/hw/msix.h
> >>>> @@ -4,6 +4,8 @@
> >>>>  #include "qemu-common.h"
> >>>>  #include "pci.h"
> >>>>  
> >>>> +void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
> >>>> +
> >>>>  int msix_init(PCIDevice *pdev, unsigned short nentries,
> >>>>                MemoryRegion *bar,
> >>>>                unsigned bar_nr, unsigned bar_size);
> >>>> -- 
> >>>> 1.7.10
> >>>>
> >>>> ps. double '-' and git version is an end-of-patch scissor as I read somewhere, cannot recall where exactly :)
> >>>>
> >>>>
> >>>>
> >>>>
> >>>>
> >>>>
> >>>> On 21/06/12 20:56, Jan Kiszka wrote:
> >>>>> On 2012-06-21 12:50, Alexey Kardashevskiy wrote:
> >>>>>> On 21/06/12 20:38, Jan Kiszka wrote:
> >>>>>>> On 2012-06-21 12:28, Alexey Kardashevskiy wrote:
> >>>>>>>> On 21/06/12 17:39, Jan Kiszka wrote:
> >>>>>>>>> On 2012-06-21 09:18, Alexey Kardashevskiy wrote:
> >>>>>>>>>>
> >>>>>>>>>> agrhhh. sha1 of the patch changed after rebasing :)
> >>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> Added (msi|msix)_(set|get)_message() function for whoever might
> >>>>>>>>>> want to use them.
> >>>>>>>>>>
> >>>>>>>>>> Currently msi_notify()/msix_notify() write to these vectors to
> >>>>>>>>>> signal the guest about an interrupt so the correct values have to
> >>>>>>>>>> written there by the guest or QEMU.
> >>>>>>>>>>
> >>>>>>>>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
> >>>>>>>>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
> >>>>>>>>>> POWER we have to initialize MSI/MSIX message from QEMU.
> >>>>>>>>>>
> >>>>>>>>>> As only set* function are required by now, the "get" functions were added
> >>>>>>>>>> or made public for a symmetry.
> >>>>>>>>>>
> >>>>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>>>>>>> ---
> >>>>>>>>>>  hw/msi.c  |   29 +++++++++++++++++++++++++++++
> >>>>>>>>>>  hw/msi.h  |    2 ++
> >>>>>>>>>>  hw/msix.c |   11 ++++++++++-
> >>>>>>>>>>  hw/msix.h |    3 +++
> >>>>>>>>>>  4 files changed, 44 insertions(+), 1 deletion(-)
> >>>>>>>>>>
> >>>>>>>>>> diff --git a/hw/msi.c b/hw/msi.c
> >>>>>>>>>> index 5233204..9ad84a4 100644
> >>>>>>>>>> --- a/hw/msi.c
> >>>>>>>>>> +++ b/hw/msi.c
> >>>>>>>>>> @@ -105,6 +105,35 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
> >>>>>>>>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
> >>>>>>>>>>  }
> >>>>>>>>>>  
> >>>>>>>>>> +MSIMessage msi_get_message(PCIDevice *dev)
> >>>>>>>>>
> >>>>>>>>> MSIMessage msi_get_message(PCIDevice *dev, unsigned vector)
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> Who/how/why is going to calculate the vector here?
> >>>>>>>>
> >>>>>>>>>
> >>>>>>>>>> +{
> >>>>>>>>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> >>>>>>>>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> >>>>>>>>>> +    MSIMessage msg;
> >>>>>>>>>> +
> >>>>>>>>>> +    if (msi64bit) {
> >>>>>>>>>> +        msg.address = pci_get_quad(dev->config + msi_address_lo_off(dev));
> >>>>>>>>>> +    } else {
> >>>>>>>>>> +        msg.address = pci_get_long(dev->config + msi_address_lo_off(dev));
> >>>>>>>>>> +    }
> >>>>>>>>>> +    msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
> >>>>>>>>>
> >>>>>>>>> And I have this here in addition:
> >>>>>>>>>
> >>>>>>>>>     unsigned int nr_vectors = msi_nr_vectors(flags);
> >>>>>>>>>     ...
> >>>>>>>>>
> >>>>>>>>>     if (nr_vectors > 1) {
> >>>>>>>>>         msg.data &= ~(nr_vectors - 1);
> >>>>>>>>>         msg.data |= vector;
> >>>>>>>>>     }
> >>>>>>>>>
> >>>>>>>>> See PCI spec and existing code.
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> What for? I really do not get it why someone might want to read something but not real value.
> >>>>>>>> What PCI code should I look?
> >>>>>>>
> >>>>>>> I'm not sure what your use case for reading the message is. For KVM
> >>>>>>> device assignment it is preparing an alternative message delivery path
> >>>>>>> for MSI vectors. And for this we will need vector notifier support for
> >>>>>>> MSI as well. You can check the MSI-X code for corresponding use cases of
> >>>>>>> msix_get_message.
> >>>>>>
> >>>>>>> And when we already have msi_get_message, another logical use case is
> >>>>>>> msi_notify. See msix.c again.
> >>>>>>
> >>>>>> Aaaa.
> >>>>>>
> >>>>>> I have no case for reading the message. All I need is writing. And I want it public as I want to use
> >>>>>> it from hw/spapr_pci.c. You suggested to add reading, I added "get" to be _symmetric_ to "set"
> >>>>>> ("get" returns what "set" wrote). You want a different thing which I can do but it is not
> >>>>>> msi_get_message(), it is something like msi_prepare_message(MSImessage msg) or
> >>>>>> msi_set_vector(uint16_t data) or simply internal kitchen of msi_notify().
> >>>>>>
> >>>>>> Still can do what you suggested, it just does not seem right.
> >>>>>
> >>>>> It is right - when looking at it from a different angle. ;)
> >>>>>
> >>>>> I don't mind if you add msi_get_message now or leave this to me. Likely
> >>>>> the latter is better as you have no use case for msi_get_message (and
> >>>>> also msix_get_message!) outside of their modules, thus we should not
> >>>>> export those functions anyway.
> 
> 
> -- 
> Alexey
>
Alexey Kardashevskiy July 19, 2012, 2:24 p.m. UTC | #8
One comment below.


On 19/07/12 19:27, Michael S. Tsirkin wrote:
> On Thu, Jul 19, 2012 at 10:32:40AM +1000, Alexey Kardashevskiy wrote:
>> On 19/07/12 01:23, Michael S. Tsirkin wrote:
>>> On Wed, Jul 18, 2012 at 11:17:12PM +1000, Alexey Kardashevskiy wrote:
>>>> On 18/07/12 22:43, Michael S. Tsirkin wrote:
>>>>> On Thu, Jun 21, 2012 at 09:39:10PM +1000, Alexey Kardashevskiy wrote:
>>>>>> Added (msi|msix)_set_message() functions.
>>>>>>
>>>>>> Currently msi_notify()/msix_notify() write to these vectors to
>>>>>> signal the guest about an interrupt so the correct values have to
>>>>>> written there by the guest or QEMU.
>>>>>>
>>>>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
>>>>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
>>>>>> POWER we have to initialize MSI/MSIX message from QEMU.
>>>>>>
>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>
>>>>> So guests do enable MSI through config space, but do
>>>>> not fill in vectors? 
>>>>
>>>> Yes. msix_capability_init() calls arch_setup_msi_irqs() which does everything it needs to do (i.e. calls hypervisor) before msix_capability_init() writes PCI_MSIX_FLAGS_ENABLE to the PCI_MSIX_FLAGS register.
>>>>
>>>> These vectors are the PCI bus addresses, the way they are set is specific for a PCI host controller, I do not see why the current scheme is a bug.
>>>
>>> I won't work with any real PCI device, will it? Real pci devices expect
>>> vectors to be written into their memory.
>>
>>
>> Yes. And the hypervisor does this. On POWER (at least book3s - server powerpc, the whole config space kitchen is hidden behind RTAS (kind of bios). For the guest, this RTAS is implemented in hypervisor, for the host - in the system firmware. So powerpc linux does not have to have PHB drivers. Kinda cool.
>>
>> Usual powerpc server is running without the host linux at all, it is running a hypervisor called pHyp. And every guest knows that it is a guest, there is no full machine emulation, it is para-virtualization. In power-kvm, we replace that pHyp with the host linux and now QEMU plays a hypervisor role. Some day We will move the hypervisor to the host kernel completely (?) but now it is in QEMU.
> 
> OKay. So it is a POWER-specific weirdness as I suspected.
> Sure, if this is what real hardware does we pretty much have to
> emulate this.
> 
>>>>> Very strange. Are you sure it's not
>>>>> just a guest bug? How does it work for other PCI devices?
>>>>
>>>> Did not get the question. It works the same for every PCI device under POWER guest.
>>>
>>> I mean for real PCI devices.
>>>
>>>>> Can't we just fix guest drivers to program the vectors properly?
>>>>>
>>>>> Also pls address the comment below.
>>>>
>>>> Comment below.
>>>>
>>>>> Thanks!
>>>>>
>>>>>> ---
>>>>>>  hw/msi.c  |   13 +++++++++++++
>>>>>>  hw/msi.h  |    1 +
>>>>>>  hw/msix.c |    9 +++++++++
>>>>>>  hw/msix.h |    2 ++
>>>>>>  4 files changed, 25 insertions(+)
>>>>>>
>>>>>> diff --git a/hw/msi.c b/hw/msi.c
>>>>>> index 5233204..cc6102f 100644
>>>>>> --- a/hw/msi.c
>>>>>> +++ b/hw/msi.c
>>>>>> @@ -105,6 +105,19 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
>>>>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
>>>>>>  }
>>>>>>  
>>>>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg)
>>>>>> +{
>>>>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
>>>>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
>>>>>> +
>>>>>> +    if (msi64bit) {
>>>>>> +        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
>>>>>> +    } else {
>>>>>> +        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
>>>>>> +    }
>>>>>> +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
>>>>>> +}
>>>>>> +
>>>>>
>>>>> Please add documentation. Something like
>>>>>
>>>>> /*
>>>>>  * Special API for POWER to configure the vectors through
>>>>>  * a side channel. Should never be used by devices.
>>>>>  */
>>>>
>>>>
>>>> It is useful for any para-virtualized environment I believe, is not it?
>>>> For s390 as well. Of course, if it supports PCI, for example, what I am not sure it does though :)
>>>
>>> I expect the normal guest to program the address into MSI register using
>>> config accesses, same way that it enables MSI/MSIX.
>>> Why POWER does it differently I did not yet figure out but I hope
>>> this weirdness is not so widespread.
>>
>>
>> In para-virt I would expect the guest not to touch config space at all. At least it should use one interface rather than two but this is how it is.
> 
> It's not new that firmware developers consistently make inconsistent
> design decisions :)


It depends on how you look at it. Enabling MSI via the config space is also done via a special set of hypervisor calls (common and IBM-specific), so it is all hidden in one place - the system firmware - which is cool: no PHB drivers in the guest. Although MSI would not need any additional hypercall to init vectors (everything can be done via config space), there is MSI-X, which stores its vectors in a BAR, and there is no hypercall for BARs as they are simply memory mapped. This is, I think, why the firmware people (or pHyp, but it is probably the same) added IBM-specific MSI/MSI-X config hypercalls.

And I do not quite understand why the MSI-X people could not use the extended PCI config space, which is 4096 bytes - quite a lot, enough to fit 256 vectors (I have not seen a card which asked for more than 9 _per function_). If somebody really needs 2048, he may want 16384 as well (or any other crazy number), etc. So why did they put such a limit, when it is a BAR and it is huge? :) Ah, off-topic anyway.


>>>>>>  bool msi_enabled(const PCIDevice *dev)
>>>>>>  {
>>>>>>      return msi_present(dev) &&
>>>>>> diff --git a/hw/msi.h b/hw/msi.h
>>>>>> index 75747ab..6ec1f99 100644
>>>>>> --- a/hw/msi.h
>>>>>> +++ b/hw/msi.h
>>>>>> @@ -31,6 +31,7 @@ struct MSIMessage {
>>>>>>  
>>>>>>  extern bool msi_supported;
>>>>>>  
>>>>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg);
>>>>>>  bool msi_enabled(const PCIDevice *dev);
>>>>>>  int msi_init(struct PCIDevice *dev, uint8_t offset,
>>>>>>               unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
>>>>>> diff --git a/hw/msix.c b/hw/msix.c
>>>>>> index ded3c55..5f7d6d3 100644
>>>>>> --- a/hw/msix.c
>>>>>> +++ b/hw/msix.c
>>>>>> @@ -45,6 +45,15 @@ static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>>>>>>      return msg;
>>>>>>  }
>>>>>>  
>>>>>> +void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
>>>>>> +{
>>>>>> +    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
>>>>>> +
>>>>>> +    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
>>>>>> +    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
>>>>>> +    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
>>>>>> +}
>>>>>> +
>>>>>>  /* Add MSI-X capability to the config space for the device. */
>>>>>>  /* Given a bar and its size, add MSI-X table on top of it
>>>>>>   * and fill MSI-X capability in the config space.
>>>>>> diff --git a/hw/msix.h b/hw/msix.h
>>>>>> index 50aee82..26a437e 100644
>>>>>> --- a/hw/msix.h
>>>>>> +++ b/hw/msix.h
>>>>>> @@ -4,6 +4,8 @@
>>>>>>  #include "qemu-common.h"
>>>>>>  #include "pci.h"
>>>>>>  
>>>>>> +void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
>>>>>> +
>>>>>>  int msix_init(PCIDevice *pdev, unsigned short nentries,
>>>>>>                MemoryRegion *bar,
>>>>>>                unsigned bar_nr, unsigned bar_size);
>>>>>> -- 
>>>>>> 1.7.10
>>>>>>
>>>>>> ps. double '-' and git version is an end-of-patch scissor as I read somewhere, cannot recall where exactly :)
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 21/06/12 20:56, Jan Kiszka wrote:
>>>>>>> On 2012-06-21 12:50, Alexey Kardashevskiy wrote:
>>>>>>>> On 21/06/12 20:38, Jan Kiszka wrote:
>>>>>>>>> On 2012-06-21 12:28, Alexey Kardashevskiy wrote:
>>>>>>>>>> On 21/06/12 17:39, Jan Kiszka wrote:
>>>>>>>>>>> On 2012-06-21 09:18, Alexey Kardashevskiy wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> agrhhh. sha1 of the patch changed after rebasing :)
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Added (msi|msix)_(set|get)_message() function for whoever might
>>>>>>>>>>>> want to use them.
>>>>>>>>>>>>
>>>>>>>>>>>> Currently msi_notify()/msix_notify() write to these vectors to
>>>>>>>>>>>> signal the guest about an interrupt so the correct values have to
>>>>>>>>>>>> written there by the guest or QEMU.
>>>>>>>>>>>>
>>>>>>>>>>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
>>>>>>>>>>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
>>>>>>>>>>>> POWER we have to initialize MSI/MSIX message from QEMU.
>>>>>>>>>>>>
>>>>>>>>>>>> As only set* function are required by now, the "get" functions were added
>>>>>>>>>>>> or made public for a symmetry.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>>>>>>> ---
>>>>>>>>>>>>  hw/msi.c  |   29 +++++++++++++++++++++++++++++
>>>>>>>>>>>>  hw/msi.h  |    2 ++
>>>>>>>>>>>>  hw/msix.c |   11 ++++++++++-
>>>>>>>>>>>>  hw/msix.h |    3 +++
>>>>>>>>>>>>  4 files changed, 44 insertions(+), 1 deletion(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/hw/msi.c b/hw/msi.c
>>>>>>>>>>>> index 5233204..9ad84a4 100644
>>>>>>>>>>>> --- a/hw/msi.c
>>>>>>>>>>>> +++ b/hw/msi.c
>>>>>>>>>>>> @@ -105,6 +105,35 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
>>>>>>>>>>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
>>>>>>>>>>>>  }
>>>>>>>>>>>>  
>>>>>>>>>>>> +MSIMessage msi_get_message(PCIDevice *dev)
>>>>>>>>>>>
>>>>>>>>>>> MSIMessage msi_get_message(PCIDevice *dev, unsigned vector)
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Who/how/why is going to calculate the vector here?
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
>>>>>>>>>>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
>>>>>>>>>>>> +    MSIMessage msg;
>>>>>>>>>>>> +
>>>>>>>>>>>> +    if (msi64bit) {
>>>>>>>>>>>> +        msg.address = pci_get_quad(dev->config + msi_address_lo_off(dev));
>>>>>>>>>>>> +    } else {
>>>>>>>>>>>> +        msg.address = pci_get_long(dev->config + msi_address_lo_off(dev));
>>>>>>>>>>>> +    }
>>>>>>>>>>>> +    msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
>>>>>>>>>>>
>>>>>>>>>>> And I have this here in addition:
>>>>>>>>>>>
>>>>>>>>>>>     unsigned int nr_vectors = msi_nr_vectors(flags);
>>>>>>>>>>>     ...
>>>>>>>>>>>
>>>>>>>>>>>     if (nr_vectors > 1) {
>>>>>>>>>>>         msg.data &= ~(nr_vectors - 1);
>>>>>>>>>>>         msg.data |= vector;
>>>>>>>>>>>     }
>>>>>>>>>>>
>>>>>>>>>>> See PCI spec and existing code.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> What for? I really do not get it why someone might want to read something but not real value.
>>>>>>>>>> What PCI code should I look?
>>>>>>>>>
>>>>>>>>> I'm not sure what your use case for reading the message is. For KVM
>>>>>>>>> device assignment it is preparing an alternative message delivery path
>>>>>>>>> for MSI vectors. And for this we will need vector notifier support for
>>>>>>>>> MSI as well. You can check the MSI-X code for corresponding use cases of
>>>>>>>>> msix_get_message.
>>>>>>>>
>>>>>>>>> And when we already have msi_get_message, another logical use case is
>>>>>>>>> msi_notify. See msix.c again.
>>>>>>>>
>>>>>>>> Aaaa.
>>>>>>>>
>>>>>>>> I have no case for reading the message. All I need is writing. And I want it public as I want to use
>>>>>>>> it from hw/spapr_pci.c. You suggested to add reading, I added "get" to be _symmetric_ to "set"
>>>>>>>> ("get" returns what "set" wrote). You want a different thing which I can do but it is not
>>>>>>>> msi_get_message(), it is something like msi_prepare_message(MSImessage msg) or
>>>>>>>> msi_set_vector(uint16_t data) or simply internal kitchen of msi_notify().
>>>>>>>>
>>>>>>>> Still can do what you suggested, it just does not seem right.
>>>>>>>
>>>>>>> It is right - when looking at it from a different angle. ;)
>>>>>>>
>>>>>>> I don't mind if you add msi_get_message now or leave this to me. Likely
>>>>>>> the latter is better as you have no use case for msi_get_message (and
>>>>>>> also msix_get_message!) outside of their modules, thus we should not
>>>>>>> export those functions anyway.
Michael S. Tsirkin July 19, 2012, 2:43 p.m. UTC | #9
On Fri, Jul 20, 2012 at 12:24:05AM +1000, Alexey Kardashevskiy wrote:
> One comment below.
> 
> 
> On 19/07/12 19:27, Michael S. Tsirkin wrote:
> > On Thu, Jul 19, 2012 at 10:32:40AM +1000, Alexey Kardashevskiy wrote:
> >> On 19/07/12 01:23, Michael S. Tsirkin wrote:
> >>> On Wed, Jul 18, 2012 at 11:17:12PM +1000, Alexey Kardashevskiy wrote:
> >>>> On 18/07/12 22:43, Michael S. Tsirkin wrote:
> >>>>> On Thu, Jun 21, 2012 at 09:39:10PM +1000, Alexey Kardashevskiy wrote:
> >>>>>> Added (msi|msix)_set_message() functions.
> >>>>>>
> >>>>>> Currently msi_notify()/msix_notify() write to these vectors to
> >>>>>> signal the guest about an interrupt so the correct values have to
> >>>>>> written there by the guest or QEMU.
> >>>>>>
> >>>>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
> >>>>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
> >>>>>> POWER we have to initialize MSI/MSIX message from QEMU.
> >>>>>>
> >>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>>
> >>>>> So guests do enable MSI through config space, but do
> >>>>> not fill in vectors? 
> >>>>
> >>>> Yes. msix_capability_init() calls arch_setup_msi_irqs() which does everything it needs to do (i.e. calls hypervisor) before msix_capability_init() writes PCI_MSIX_FLAGS_ENABLE to the PCI_MSIX_FLAGS register.
> >>>>
> >>>> These vectors are the PCI bus addresses, the way they are set is specific for a PCI host controller, I do not see why the current scheme is a bug.
> >>>
> >>> I won't work with any real PCI device, will it? Real pci devices expect
> >>> vectors to be written into their memory.
> >>
> >>
> >> Yes. And the hypervisor does this. On POWER (at least book3s - server powerpc, the whole config space kitchen is hidden behind RTAS (kind of bios). For the guest, this RTAS is implemented in hypervisor, for the host - in the system firmware. So powerpc linux does not have to have PHB drivers. Kinda cool.
> >>
> >> Usual powerpc server is running without the host linux at all, it is running a hypervisor called pHyp. And every guest knows that it is a guest, there is no full machine emulation, it is para-virtualization. In power-kvm, we replace that pHyp with the host linux and now QEMU plays a hypervisor role. Some day We will move the hypervisor to the host kernel completely (?) but now it is in QEMU.
> > 
> > OKay. So it is a POWER-specific weirdness as I suspected.
> > Sure, if this is what real hardware does we pretty much have to
> > emulate this.
> > 
> >>>>> Very strange. Are you sure it's not
> >>>>> just a guest bug? How does it work for other PCI devices?
> >>>>
> >>>> Did not get the question. It works the same for every PCI device under POWER guest.
> >>>
> >>> I mean for real PCI devices.
> >>>
> >>>>> Can't we just fix guest drivers to program the vectors properly?
> >>>>>
> >>>>> Also pls address the comment below.
> >>>>
> >>>> Comment below.
> >>>>
> >>>>> Thanks!
> >>>>>
> >>>>>> ---
> >>>>>>  hw/msi.c  |   13 +++++++++++++
> >>>>>>  hw/msi.h  |    1 +
> >>>>>>  hw/msix.c |    9 +++++++++
> >>>>>>  hw/msix.h |    2 ++
> >>>>>>  4 files changed, 25 insertions(+)
> >>>>>>
> >>>>>> diff --git a/hw/msi.c b/hw/msi.c
> >>>>>> index 5233204..cc6102f 100644
> >>>>>> --- a/hw/msi.c
> >>>>>> +++ b/hw/msi.c
> >>>>>> @@ -105,6 +105,19 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
> >>>>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
> >>>>>>  }
> >>>>>>  
> >>>>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg)
> >>>>>> +{
> >>>>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> >>>>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> >>>>>> +
> >>>>>> +    if (msi64bit) {
> >>>>>> +        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
> >>>>>> +    } else {
> >>>>>> +        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
> >>>>>> +    }
> >>>>>> +    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
> >>>>>> +}
> >>>>>> +
> >>>>>
> >>>>> Please add documentation. Something like
> >>>>>
> >>>>> /*
> >>>>>  * Special API for POWER to configure the vectors through
> >>>>>  * a side channel. Should never be used by devices.
> >>>>>  */
> >>>>
> >>>>
> >>>> It is useful for any para-virtualized environment I believe, is not it?
> >>>> For s390 as well. Of course, if it supports PCI, for example, what I am not sure it does though :)
> >>>
> >>> I expect the normal guest to program the address into MSI register using
> >>> config accesses, same way that it enables MSI/MSIX.
> >>> Why POWER does it differently I did not yet figure out but I hope
> >>> this weirdness is not so widespread.
> >>
> >>
> >> In para-virt I would expect the guest not to touch config space at all. At least it should use one interface rather than two but this is how it is.
> > 
> > It's not new that firmware developers consistently make inconsistent
> > design decisions :)
> 
> 
> It depends on how to look at it. Enabling MSI via the config space is also done via a special set of hypervisor calls (common and IBM-specific) so it is all hidden in one place - the system firmware, what is cool - no PHB drivers in the guest. Although MSI would not need any additional hypercall to init vectors (everything can be done via config space), there is MSI-X which stores vectors in BAR and there is no hypercall for BARs as they are simply memory mapped. This is I think why the firmware people (or phyp but it is probably the same) added IBM-specific MSI/MSIX config hypercalls.

Well, what's wrong with the guest doing this through a memory-mapped interface?

> And I do not quite understand why MSIX people could not use extended PCI config space which is 4096 bytes, quite a lot, enough to fit 256 vectors (have not seen a card which asked for more than 9 _per function_). If somebody really needs 2048, he may want 16384 as well (or any other crazy number), etc, so why did they put such a limit, it is a BAR, it is huge? :) A, offtopic anyway.


Well, you have just described MSI; just don't use MSIX.

The motivation for MSIX was as follows:
PCI/PCI-X config space is not 4096 bytes, it is 256 bytes, and is very
crowded. You are thinking of PCI Express. Config accesses are also
non-posted, which means at most one can be in flight at a time. This is not
appropriate for vector programming, which needs to be done from multiple
CPUs in parallel.

Also offtopic, please try to avoid these super long lines in mail :).

> 
> 
> >>>>>>  bool msi_enabled(const PCIDevice *dev)
> >>>>>>  {
> >>>>>>      return msi_present(dev) &&
> >>>>>> diff --git a/hw/msi.h b/hw/msi.h
> >>>>>> index 75747ab..6ec1f99 100644
> >>>>>> --- a/hw/msi.h
> >>>>>> +++ b/hw/msi.h
> >>>>>> @@ -31,6 +31,7 @@ struct MSIMessage {
> >>>>>>  
> >>>>>>  extern bool msi_supported;
> >>>>>>  
> >>>>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg);
> >>>>>>  bool msi_enabled(const PCIDevice *dev);
> >>>>>>  int msi_init(struct PCIDevice *dev, uint8_t offset,
> >>>>>>               unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
> >>>>>> diff --git a/hw/msix.c b/hw/msix.c
> >>>>>> index ded3c55..5f7d6d3 100644
> >>>>>> --- a/hw/msix.c
> >>>>>> +++ b/hw/msix.c
> >>>>>> @@ -45,6 +45,15 @@ static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
> >>>>>>      return msg;
> >>>>>>  }
> >>>>>>  
> >>>>>> +void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
> >>>>>> +{
> >>>>>> +    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
> >>>>>> +
> >>>>>> +    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
> >>>>>> +    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
> >>>>>> +    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
> >>>>>> +}
> >>>>>> +
> >>>>>>  /* Add MSI-X capability to the config space for the device. */
> >>>>>>  /* Given a bar and its size, add MSI-X table on top of it
> >>>>>>   * and fill MSI-X capability in the config space.
> >>>>>> diff --git a/hw/msix.h b/hw/msix.h
> >>>>>> index 50aee82..26a437e 100644
> >>>>>> --- a/hw/msix.h
> >>>>>> +++ b/hw/msix.h
> >>>>>> @@ -4,6 +4,8 @@
> >>>>>>  #include "qemu-common.h"
> >>>>>>  #include "pci.h"
> >>>>>>  
> >>>>>> +void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
> >>>>>> +
> >>>>>>  int msix_init(PCIDevice *pdev, unsigned short nentries,
> >>>>>>                MemoryRegion *bar,
> >>>>>>                unsigned bar_nr, unsigned bar_size);
> >>>>>> -- 
> >>>>>> 1.7.10
> >>>>>>
> >>>>>> ps. double '-' and git version is an end-of-patch scissor as I read somewhere, cannot recall where exactly :)
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 21/06/12 20:56, Jan Kiszka wrote:
> >>>>>>> On 2012-06-21 12:50, Alexey Kardashevskiy wrote:
> >>>>>>>> On 21/06/12 20:38, Jan Kiszka wrote:
> >>>>>>>>> On 2012-06-21 12:28, Alexey Kardashevskiy wrote:
> >>>>>>>>>> On 21/06/12 17:39, Jan Kiszka wrote:
> >>>>>>>>>>> On 2012-06-21 09:18, Alexey Kardashevskiy wrote:
> >>>>>>>>>>>>
> >>>>>>>>>>>> agrhhh. sha1 of the patch changed after rebasing :)
> >>>>>>>>>>>>
> >>>>>>>>>>>>
> >>>>>>>>>>>>
> >>>>>>>>>>>> Added (msi|msix)_(set|get)_message() function for whoever might
> >>>>>>>>>>>> want to use them.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Currently msi_notify()/msix_notify() write to these vectors to
> >>>>>>>>>>>> signal the guest about an interrupt so the correct values have to
> >>>>>>>>>>>> written there by the guest or QEMU.
> >>>>>>>>>>>>
> >>>>>>>>>>>> For example, POWER guest never initializes MSI/MSIX vectors, instead
> >>>>>>>>>>>> it uses RTAS hypercalls. So in order to support MSIX for virtio-pci on
> >>>>>>>>>>>> POWER we have to initialize MSI/MSIX message from QEMU.
> >>>>>>>>>>>>
> >>>>>>>>>>>> As only set* function are required by now, the "get" functions were added
> >>>>>>>>>>>> or made public for a symmetry.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>>>>>>>>> ---
> >>>>>>>>>>>>  hw/msi.c  |   29 +++++++++++++++++++++++++++++
> >>>>>>>>>>>>  hw/msi.h  |    2 ++
> >>>>>>>>>>>>  hw/msix.c |   11 ++++++++++-
> >>>>>>>>>>>>  hw/msix.h |    3 +++
> >>>>>>>>>>>>  4 files changed, 44 insertions(+), 1 deletion(-)
> >>>>>>>>>>>>
> >>>>>>>>>>>> diff --git a/hw/msi.c b/hw/msi.c
> >>>>>>>>>>>> index 5233204..9ad84a4 100644
> >>>>>>>>>>>> --- a/hw/msi.c
> >>>>>>>>>>>> +++ b/hw/msi.c
> >>>>>>>>>>>> @@ -105,6 +105,35 @@ static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
> >>>>>>>>>>>>      return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
> >>>>>>>>>>>>  }
> >>>>>>>>>>>>  
> >>>>>>>>>>>> +MSIMessage msi_get_message(PCIDevice *dev)
> >>>>>>>>>>>
> >>>>>>>>>>> MSIMessage msi_get_message(PCIDevice *dev, unsigned vector)
> >>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> Who/how/why is going to calculate the vector here?
> >>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>> +{
> >>>>>>>>>>>> +    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
> >>>>>>>>>>>> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> >>>>>>>>>>>> +    MSIMessage msg;
> >>>>>>>>>>>> +
> >>>>>>>>>>>> +    if (msi64bit) {
> >>>>>>>>>>>> +        msg.address = pci_get_quad(dev->config + msi_address_lo_off(dev));
> >>>>>>>>>>>> +    } else {
> >>>>>>>>>>>> +        msg.address = pci_get_long(dev->config + msi_address_lo_off(dev));
> >>>>>>>>>>>> +    }
> >>>>>>>>>>>> +    msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
> >>>>>>>>>>>
> >>>>>>>>>>> And I have this here in addition:
> >>>>>>>>>>>
> >>>>>>>>>>>     unsigned int nr_vectors = msi_nr_vectors(flags);
> >>>>>>>>>>>     ...
> >>>>>>>>>>>
> >>>>>>>>>>>     if (nr_vectors > 1) {
> >>>>>>>>>>>         msg.data &= ~(nr_vectors - 1);
> >>>>>>>>>>>         msg.data |= vector;
> >>>>>>>>>>>     }
> >>>>>>>>>>>
> >>>>>>>>>>> See PCI spec and existing code.
> >>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> What for? I really do not get it why someone might want to read something but not real value.
> >>>>>>>>>> What PCI code should I look?
> >>>>>>>>>
> >>>>>>>>> I'm not sure what your use case for reading the message is. For KVM
> >>>>>>>>> device assignment it is preparing an alternative message delivery path
> >>>>>>>>> for MSI vectors. And for this we will need vector notifier support for
> >>>>>>>>> MSI as well. You can check the MSI-X code for corresponding use cases of
> >>>>>>>>> msix_get_message.
> >>>>>>>>
> >>>>>>>>> And when we already have msi_get_message, another logical use case is
> >>>>>>>>> msi_notify. See msix.c again.
> >>>>>>>>
> >>>>>>>> Aaaa.
> >>>>>>>>
> >>>>>>>> I have no case for reading the message. All I need is writing. And I want it public as I want to use
> >>>>>>>> it from hw/spapr_pci.c. You suggested to add reading, I added "get" to be _symmetric_ to "set"
> >>>>>>>> ("get" returns what "set" wrote). You want a different thing which I can do but it is not
> >>>>>>>> msi_get_message(), it is something like msi_prepare_message(MSImessage msg) or
> >>>>>>>> msi_set_vector(uint16_t data) or simply internal kitchen of msi_notify().
> >>>>>>>>
> >>>>>>>> Still can do what you suggested, it just does not seem right.
> >>>>>>>
> >>>>>>> It is right - when looking at it from a different angle. ;)
> >>>>>>>
> >>>>>>> I don't mind if you add msi_get_message now or leave this to me. Likely
> >>>>>>> the latter is better as you have no use case for msi_get_message (and
> >>>>>>> also msix_get_message!) outside of their modules, thus we should not
> >>>>>>> export those functions anyway.
> 
> 
> -- 
> Alexey
>
Alexey Kardashevskiy July 19, 2012, 2:50 p.m. UTC | #10
On 20/07/12 00:43, Michael S. Tsirkin wrote:
> On Fri, Jul 20, 2012 at 12:24:05AM +1000, Alexey Kardashevskiy wrote:
>> One comment below.
>> 
>> 
>> On 19/07/12 19:27, Michael S. Tsirkin wrote:
>>> On Thu, Jul 19, 2012 at 10:32:40AM +1000, Alexey Kardashevskiy
>>> wrote:
>>>> On 19/07/12 01:23, Michael S. Tsirkin wrote:
>>>>> On Wed, Jul 18, 2012 at 11:17:12PM +1000, Alexey Kardashevskiy
>>>>> wrote:
>>>>>> On 18/07/12 22:43, Michael S. Tsirkin wrote:
>>>>>>> On Thu, Jun 21, 2012 at 09:39:10PM +1000, Alexey
>>>>>>> Kardashevskiy wrote:
>>>>>>>> Added (msi|msix)_set_message() functions.
>>>>>>>> 
>>>>>>>> Currently msi_notify()/msix_notify() write to these
>>>>>>>> vectors to signal the guest about an interrupt so the
>>>>>>>> correct values have to written there by the guest or
>>>>>>>> QEMU.
>>>>>>>> 
>>>>>>>> For example, POWER guest never initializes MSI/MSIX
>>>>>>>> vectors, instead it uses RTAS hypercalls. So in order to
>>>>>>>> support MSIX for virtio-pci on POWER we have to initialize
>>>>>>>> MSI/MSIX message from QEMU.
>>>>>>>> 
>>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>> 
>>>>>>> So guests do enable MSI through config space, but do not
>>>>>>> fill in vectors?
>>>>>> 
>>>>>> Yes. msix_capability_init() calls arch_setup_msi_irqs() which
>>>>>> does everything it needs to do (i.e. calls hypervisor) before
>>>>>> msix_capability_init() writes PCI_MSIX_FLAGS_ENABLE to the
>>>>>> PCI_MSIX_FLAGS register.
>>>>>> 
>>>>>> These vectors are the PCI bus addresses, the way they are set
>>>>>> is specific for a PCI host controller, I do not see why the
>>>>>> current scheme is a bug.
>>>>> 
>>>>> I won't work with any real PCI device, will it? Real pci devices
>>>>> expect vectors to be written into their memory.
>>>> 
>>>> 
>>>> Yes. And the hypervisor does this. On POWER (at least book3s -
>>>> server powerpc, the whole config space kitchen is hidden behind
>>>> RTAS (kind of bios). For the guest, this RTAS is implemented in
>>>> hypervisor, for the host - in the system firmware. So powerpc
>>>> linux does not have to have PHB drivers. Kinda cool.
>>>> 
>>>> Usual powerpc server is running without the host linux at all, it
>>>> is running a hypervisor called pHyp. And every guest knows that it
>>>> is a guest, there is no full machine emulation, it is
>>>> para-virtualization. In power-kvm, we replace that pHyp with the
>>>> host linux and now QEMU plays a hypervisor role. Some day We will
>>>> move the hypervisor to the host kernel completely (?) but now it
>>>> is in QEMU.
>>> 
>>> OKay. So it is a POWER-specific weirdness as I suspected. Sure, if
>>> this is what real hardware does we pretty much have to emulate
>>> this.
>>> 
>>>>>>> Very strange. Are you sure it's not just a guest bug? How
>>>>>>> does it work for other PCI devices?
>>>>>> 
>>>>>> Did not get the question. It works the same for every PCI
>>>>>> device under POWER guest.
>>>>> 
>>>>> I mean for real PCI devices.
>>>>> 
>>>>>>> Can't we just fix guest drivers to program the vectors
>>>>>>> properly?
>>>>>>> 
>>>>>>> Also pls address the comment below.
>>>>>> 
>>>>>> Comment below.
>>>>>> 
>>>>>>> Thanks!
>>>>>>> 
>>>>>>>> --- hw/msi.c  |   13 +++++++++++++ hw/msi.h  |    1 + 
>>>>>>>> hw/msix.c |    9 +++++++++ hw/msix.h |    2 ++ 4 files
>>>>>>>> changed, 25 insertions(+)
>>>>>>>> 
>>>>>>>> diff --git a/hw/msi.c b/hw/msi.c index 5233204..cc6102f
>>>>>>>> 100644 --- a/hw/msi.c +++ b/hw/msi.c @@ -105,6 +105,19 @@
>>>>>>>> static inline uint8_t msi_pending_off(const PCIDevice*
>>>>>>>> dev, bool msi64bit) return dev->msi_cap + (msi64bit ?
>>>>>>>> PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32); }
>>>>>>>> 
>>>>>>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg) +{ +
>>>>>>>> uint16_t flags = pci_get_word(dev->config +
>>>>>>>> msi_flags_off(dev)); +    bool msi64bit = flags &
>>>>>>>> PCI_MSI_FLAGS_64BIT; + +    if (msi64bit) { +
>>>>>>>> pci_set_quad(dev->config + msi_address_lo_off(dev),
>>>>>>>> msg.address); +    } else { +
>>>>>>>> pci_set_long(dev->config + msi_address_lo_off(dev),
>>>>>>>> msg.address); +    } +    pci_set_word(dev->config +
>>>>>>>> msi_data_off(dev, msi64bit), msg.data); +} +
>>>>>>> 
>>>>>>> Please add documentation. Something like
>>>>>>> 
>>>>>>> /* * Special API for POWER to configure the vectors through 
>>>>>>> * a side channel. Should never be used by devices. */
>>>>>> 
>>>>>> 
>>>>>> It is useful for any para-virtualized environment I believe,
>>>>>> is not it? For s390 as well. Of course, if it supports PCI,
>>>>>> for example, what I am not sure it does though :)
>>>>> 
>>>>> I expect the normal guest to program the address into MSI
>>>>> register using config accesses, same way that it enables
>>>>> MSI/MSIX. Why POWER does it differently I did not yet figure out
>>>>> but I hope this weirdness is not so widespread.
>>>> 
>>>> 
>>>> In para-virt I would expect the guest not to touch config space at
>>>> all. At least it should use one interface rather than two but this
>>>> is how it is.
>>> 
>>> It's not new that firmware developers consistently make
>>> inconsistent design decisions :)
>> 
>> 
>> It depends on how to look at it. Enabling MSI via the config space is
>> also done via a special set of hypervisor calls (common and
>> IBM-specific) so it is all hidden in one place - the system firmware,
>> what is cool - no PHB drivers in the guest. Although MSI would not
>> need any additional hypercall to init vectors (everything can be done
>> via config space), there is MSI-X which stores vectors in BAR and
>> there is no hypercall for BARs as they are simply memory mapped. This
>> is I think why the firmware people (or phyp but it is probably the
>> same) added IBM-specific MSI/MSIX config hypercalls.
> 
> Well what's wrong with guest doing this through a memory mapped
> interface?


Wouldn't the guest then have to allocate the addresses and program the PHB
with them itself? The idea was to hide the PHB details in the system
firmware; that is the point.


>> And I do not quite understand why MSIX people could not use extended
>> PCI config space which is 4096 bytes, quite a lot, enough to fit 256
>> vectors (have not seen a card which asked for more than 9 _per
>> function_). If somebody really needs 2048, he may want 16384 as well
>> (or any other crazy number), etc, so why did they put such a limit, it
>> is a BAR, it is huge? :) A, offtopic anyway.


> Well you have just described MSI, just don't use MSIX.
> 
> The motivation for MSIX was as follows: PCI/PCI-X config space is not
> 4096 bytes, it is 256 bytes, and is very crowded. You are thinking of
> PCI express. 

MSIX is a PCIe feature, no?

> Config accesses are also nonposted which means at most one
> must be in flight. This is not appropriate for vector programming which
> needs to be done from multiple CPUs in parallel.

> Also offtopic, please try to avoid these super long lines in mail :).

Ah. This is from the time when I posted patches via thunderbird and
disabled wrapping :) Is wrapping at 75 chars ok?


>> 
>> 
>>>>>>>> bool msi_enabled(const PCIDevice *dev) { return
>>>>>>>> msi_present(dev) && diff --git a/hw/msi.h b/hw/msi.h index
>>>>>>>> 75747ab..6ec1f99 100644 --- a/hw/msi.h +++ b/hw/msi.h @@
>>>>>>>> -31,6 +31,7 @@ struct MSIMessage {
>>>>>>>> 
>>>>>>>> extern bool msi_supported;
>>>>>>>> 
>>>>>>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg); 
>>>>>>>> bool msi_enabled(const PCIDevice *dev); int
>>>>>>>> msi_init(struct PCIDevice *dev, uint8_t offset, unsigned
>>>>>>>> int nr_vectors, bool msi64bit, bool msi_per_vector_mask); 
>>>>>>>> diff --git a/hw/msix.c b/hw/msix.c index ded3c55..5f7d6d3
>>>>>>>> 100644 --- a/hw/msix.c +++ b/hw/msix.c @@ -45,6 +45,15 @@
>>>>>>>> static MSIMessage msix_get_message(PCIDevice *dev,
>>>>>>>> unsigned vector) return msg; }
>>>>>>>> 
>>>>>>>> +void msix_set_message(PCIDevice *dev, int vector, struct
>>>>>>>> MSIMessage msg) +{ +    uint8_t *table_entry =
>>>>>>>> dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE; + +
>>>>>>>> pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR,
>>>>>>>> msg.address); +    pci_set_long(table_entry +
>>>>>>>> PCI_MSIX_ENTRY_DATA, msg.data); +
>>>>>>>> table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &=
>>>>>>>> ~PCI_MSIX_ENTRY_CTRL_MASKBIT; +} + /* Add MSI-X capability
>>>>>>>> to the config space for the device. */ /* Given a bar and
>>>>>>>> its size, add MSI-X table on top of it * and fill MSI-X
>>>>>>>> capability in the config space. diff --git a/hw/msix.h
>>>>>>>> b/hw/msix.h index 50aee82..26a437e 100644 --- a/hw/msix.h 
>>>>>>>> +++ b/hw/msix.h @@ -4,6 +4,8 @@ #include "qemu-common.h" 
>>>>>>>> #include "pci.h"
>>>>>>>> 
>>>>>>>> +void msix_set_message(PCIDevice *dev, int vector,
>>>>>>>> MSIMessage msg); + int msix_init(PCIDevice *pdev, unsigned
>>>>>>>> short nentries, MemoryRegion *bar, unsigned bar_nr,
>>>>>>>> unsigned bar_size); -- 1.7.10
>>>>>>>> 
>>>>>>>> ps. double '-' and git version is an end-of-patch scissor
>>>>>>>> as I read somewhere, cannot recall where exactly :)
>>>>>>>> 
>>>>>>>> 
>>>>>>>> 
>>>>>>>> 
>>>>>>>> 
>>>>>>>> 
>>>>>>>> On 21/06/12 20:56, Jan Kiszka wrote:
>>>>>>>>> On 2012-06-21 12:50, Alexey Kardashevskiy wrote:
>>>>>>>>>> On 21/06/12 20:38, Jan Kiszka wrote:
>>>>>>>>>>> On 2012-06-21 12:28, Alexey Kardashevskiy wrote:
>>>>>>>>>>>> On 21/06/12 17:39, Jan Kiszka wrote:
>>>>>>>>>>>>> On 2012-06-21 09:18, Alexey Kardashevskiy
>>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> agrhhh. sha1 of the patch changed after
>>>>>>>>>>>>>> rebasing :)
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> Added (msi|msix)_(set|get)_message() function
>>>>>>>>>>>>>> for whoever might want to use them.
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> Currently msi_notify()/msix_notify() write to
>>>>>>>>>>>>>> these vectors to signal the guest about an
>>>>>>>>>>>>>> interrupt so the correct values have to 
>>>>>>>>>>>>>> written there by the guest or QEMU.
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> For example, POWER guest never initializes
>>>>>>>>>>>>>> MSI/MSIX vectors, instead it uses RTAS
>>>>>>>>>>>>>> hypercalls. So in order to support MSIX for
>>>>>>>>>>>>>> virtio-pci on POWER we have to initialize
>>>>>>>>>>>>>> MSI/MSIX message from QEMU.
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> As only set* function are required by now, the
>>>>>>>>>>>>>> "get" functions were added or made public for
>>>>>>>>>>>>>> a symmetry.
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> Signed-off-by: Alexey Kardashevskiy
>>>>>>>>>>>>>> <aik@ozlabs.ru> --- hw/msi.c  |   29
>>>>>>>>>>>>>> +++++++++++++++++++++++++++++ hw/msi.h  |    2
>>>>>>>>>>>>>> ++ hw/msix.c |   11 ++++++++++- hw/msix.h |
>>>>>>>>>>>>>> 3 +++ 4 files changed, 44 insertions(+), 1
>>>>>>>>>>>>>> deletion(-)
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> diff --git a/hw/msi.c b/hw/msi.c index
>>>>>>>>>>>>>> 5233204..9ad84a4 100644 --- a/hw/msi.c +++
>>>>>>>>>>>>>> b/hw/msi.c @@ -105,6 +105,35 @@ static inline
>>>>>>>>>>>>>> uint8_t msi_pending_off(const PCIDevice* dev,
>>>>>>>>>>>>>> bool msi64bit) return dev->msi_cap + (msi64bit
>>>>>>>>>>>>>> ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32); }
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> +MSIMessage msi_get_message(PCIDevice *dev)
>>>>>>>>>>>>> 
>>>>>>>>>>>>> MSIMessage msi_get_message(PCIDevice *dev,
>>>>>>>>>>>>> unsigned vector)
>>>>>>>>>>>> 
>>>>>>>>>>>> 
>>>>>>>>>>>> Who/how/why is going to calculate the vector
>>>>>>>>>>>> here?
>>>>>>>>>>>> 
>>>>>>>>>>>>> 
>>>>>>>>>>>>>> +{ +    uint16_t flags =
>>>>>>>>>>>>>> pci_get_word(dev->config +
>>>>>>>>>>>>>> msi_flags_off(dev)); +    bool msi64bit =
>>>>>>>>>>>>>> flags & PCI_MSI_FLAGS_64BIT; +    MSIMessage
>>>>>>>>>>>>>> msg; + +    if (msi64bit) { +
>>>>>>>>>>>>>> msg.address = pci_get_quad(dev->config +
>>>>>>>>>>>>>> msi_address_lo_off(dev)); +    } else { +
>>>>>>>>>>>>>> msg.address = pci_get_long(dev->config +
>>>>>>>>>>>>>> msi_address_lo_off(dev)); +    } +    msg.data
>>>>>>>>>>>>>> = pci_get_word(dev->config + msi_data_off(dev,
>>>>>>>>>>>>>> msi64bit));
>>>>>>>>>>>>> 
>>>>>>>>>>>>> And I have this here in addition:
>>>>>>>>>>>>> 
>>>>>>>>>>>>> unsigned int nr_vectors =
>>>>>>>>>>>>> msi_nr_vectors(flags); ...
>>>>>>>>>>>>> 
>>>>>>>>>>>>> if (nr_vectors > 1) { msg.data &= ~(nr_vectors -
>>>>>>>>>>>>> 1); msg.data |= vector; }
>>>>>>>>>>>>> 
>>>>>>>>>>>>> See PCI spec and existing code.
>>>>>>>>>>>> 
>>>>>>>>>>>> 
>>>>>>>>>>>> What for? I really do not get it why someone might
>>>>>>>>>>>> want to read something but not real value. What
>>>>>>>>>>>> PCI code should I look?
>>>>>>>>>>> 
>>>>>>>>>>> I'm not sure what your use case for reading the
>>>>>>>>>>> message is. For KVM device assignment it is
>>>>>>>>>>> preparing an alternative message delivery path for
>>>>>>>>>>> MSI vectors. And for this we will need vector
>>>>>>>>>>> notifier support for MSI as well. You can check the
>>>>>>>>>>> MSI-X code for corresponding use cases of 
>>>>>>>>>>> msix_get_message.
>>>>>>>>>> 
>>>>>>>>>>> And when we already have msi_get_message, another
>>>>>>>>>>> logical use case is msi_notify. See msix.c again.
>>>>>>>>>> 
>>>>>>>>>> Aaaa.
>>>>>>>>>> 
>>>>>>>>>> I have no case for reading the message. All I need is
>>>>>>>>>> writing. And I want it public as I want to use it from
>>>>>>>>>> hw/spapr_pci.c. You suggested to add reading, I added
>>>>>>>>>> "get" to be _symmetric_ to "set" ("get" returns what
>>>>>>>>>> "set" wrote). You want a different thing which I can
>>>>>>>>>> do but it is not msi_get_message(), it is something
>>>>>>>>>> like msi_prepare_message(MSImessage msg) or 
>>>>>>>>>> msi_set_vector(uint16_t data) or simply internal
>>>>>>>>>> kitchen of msi_notify().
>>>>>>>>>> 
>>>>>>>>>> Still can do what you suggested, it just does not seem
>>>>>>>>>> right.
>>>>>>>>> 
>>>>>>>>> It is right - when looking at it from a different angle.
>>>>>>>>> ;)
>>>>>>>>> 
>>>>>>>>> I don't mind if you add msi_get_message now or leave
>>>>>>>>> this to me. Likely the latter is better as you have no
>>>>>>>>> use case for msi_get_message (and also
>>>>>>>>> msix_get_message!) outside of their modules, thus we
>>>>>>>>> should not export those functions anyway.
>> 
>> 
>> -- Alexey
>>
Michael S. Tsirkin July 19, 2012, 2:56 p.m. UTC | #11
On Fri, Jul 20, 2012 at 12:50:26AM +1000, Alexey Kardashevskiy wrote:
> On 20/07/12 00:43, Michael S. Tsirkin wrote:
> > On Fri, Jul 20, 2012 at 12:24:05AM +1000, Alexey Kardashevskiy wrote:
> >> One comment below.
> >> 
> >> 
> >> On 19/07/12 19:27, Michael S. Tsirkin wrote:
> >>> On Thu, Jul 19, 2012 at 10:32:40AM +1000, Alexey Kardashevskiy
> >>> wrote:
> >>>> On 19/07/12 01:23, Michael S. Tsirkin wrote:
> >>>>> On Wed, Jul 18, 2012 at 11:17:12PM +1000, Alexey Kardashevskiy
> >>>>> wrote:
> >>>>>> On 18/07/12 22:43, Michael S. Tsirkin wrote:
> >>>>>>> On Thu, Jun 21, 2012 at 09:39:10PM +1000, Alexey
> >>>>>>> Kardashevskiy wrote:
> >>>>>>>> Added (msi|msix)_set_message() functions.
> >>>>>>>> 
> >>>>>>>> Currently msi_notify()/msix_notify() write to these
> >>>>>>>> vectors to signal the guest about an interrupt so the
> >>>>>>>> correct values have to be written there by the guest or
> >>>>>>>> QEMU.
> >>>>>>>> 
> >>>>>>>> For example, POWER guest never initializes MSI/MSIX
> >>>>>>>> vectors, instead it uses RTAS hypercalls. So in order to
> >>>>>>>> support MSIX for virtio-pci on POWER we have to initialize
> >>>>>>>> MSI/MSIX message from QEMU.
> >>>>>>>> 
> >>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>>>> 
> >>>>>>> So guests do enable MSI through config space, but do not
> >>>>>>> fill in vectors?
> >>>>>> 
> >>>>>> Yes. msix_capability_init() calls arch_setup_msi_irqs() which
> >>>>>> does everything it needs to do (i.e. calls hypervisor) before
> >>>>>> msix_capability_init() writes PCI_MSIX_FLAGS_ENABLE to the
> >>>>>> PCI_MSIX_FLAGS register.
> >>>>>> 
> >>>>>> These vectors are the PCI bus addresses, the way they are set
> >>>>>> is specific for a PCI host controller, I do not see why the
> >>>>>> current scheme is a bug.
> >>>>> 
> >>>>> It won't work with any real PCI device, will it? Real pci devices
> >>>>> expect vectors to be written into their memory.
> >>>> 
> >>>> 
> >>>> Yes. And the hypervisor does this. On POWER (at least book3s -
> >>>> server powerpc), the whole config space kitchen is hidden behind
> >>>> RTAS (kind of bios). For the guest, this RTAS is implemented in
> >>>> hypervisor, for the host - in the system firmware. So powerpc
> >>>> linux does not have to have PHB drivers. Kinda cool.
> >>>> 
> >>>> Usual powerpc server is running without the host linux at all, it
> >>>> is running a hypervisor called pHyp. And every guest knows that it
> >>>> is a guest, there is no full machine emulation, it is
> >>>> para-virtualization. In power-kvm, we replace that pHyp with the
> >>>> host linux and now QEMU plays a hypervisor role. Some day We will
> >>>> move the hypervisor to the host kernel completely (?) but now it
> >>>> is in QEMU.
> >>> 
> >>> OKay. So it is a POWER-specific weirdness as I suspected. Sure, if
> >>> this is what real hardware does we pretty much have to emulate
> >>> this.
> >>> 
> >>>>>>> Very strange. Are you sure it's not just a guest bug? How
> >>>>>>> does it work for other PCI devices?
> >>>>>> 
> >>>>>> Did not get the question. It works the same for every PCI
> >>>>>> device under POWER guest.
> >>>>> 
> >>>>> I mean for real PCI devices.
> >>>>> 
> >>>>>>> Can't we just fix guest drivers to program the vectors
> >>>>>>> properly?
> >>>>>>> 
> >>>>>>> Also pls address the comment below.
> >>>>>> 
> >>>>>> Comment below.
> >>>>>> 
> >>>>>>> Thanks!
> >>>>>>> 
> >>>>>>>> --- hw/msi.c  |   13 +++++++++++++ hw/msi.h  |    1 + 
> >>>>>>>> hw/msix.c |    9 +++++++++ hw/msix.h |    2 ++ 4 files
> >>>>>>>> changed, 25 insertions(+)
> >>>>>>>> 
> >>>>>>>> diff --git a/hw/msi.c b/hw/msi.c index 5233204..cc6102f
> >>>>>>>> 100644 --- a/hw/msi.c +++ b/hw/msi.c @@ -105,6 +105,19 @@
> >>>>>>>> static inline uint8_t msi_pending_off(const PCIDevice*
> >>>>>>>> dev, bool msi64bit) return dev->msi_cap + (msi64bit ?
> >>>>>>>> PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32); }
> >>>>>>>> 
> >>>>>>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg) +{ +
> >>>>>>>> uint16_t flags = pci_get_word(dev->config +
> >>>>>>>> msi_flags_off(dev)); +    bool msi64bit = flags &
> >>>>>>>> PCI_MSI_FLAGS_64BIT; + +    if (msi64bit) { +
> >>>>>>>> pci_set_quad(dev->config + msi_address_lo_off(dev),
> >>>>>>>> msg.address); +    } else { +
> >>>>>>>> pci_set_long(dev->config + msi_address_lo_off(dev),
> >>>>>>>> msg.address); +    } +    pci_set_word(dev->config +
> >>>>>>>> msi_data_off(dev, msi64bit), msg.data); +} +
> >>>>>>> 
> >>>>>>> Please add documentation. Something like
> >>>>>>> 
> >>>>>>> /* * Special API for POWER to configure the vectors through 
> >>>>>>> * a side channel. Should never be used by devices. */
> >>>>>> 
> >>>>>> 
> >>>>>> It is useful for any para-virtualized environment, I believe,
> >>>>>> isn't it? For s390 as well. Of course, if it supports PCI,
> >>>>>> for example, which I am not sure it does, though :)
> >>>>> 
> >>>>> I expect the normal guest to program the address into MSI
> >>>>> register using config accesses, same way that it enables
> >>>>> MSI/MSIX. Why POWER does it differently I did not yet figure out
> >>>>> but I hope this weirdness is not so widespread.
> >>>> 
> >>>> 
> >>>> In para-virt I would expect the guest not to touch config space at
> >>>> all. At least it should use one interface rather than two but this
> >>>> is how it is.
> >>> 
> >>> It's not new that firmware developers consistently make
> >>> inconsistent design decisions :)
> >> 
> >> 
> >> It depends on how to look at it. Enabling MSI via the config space is
> >> also done via a special set of hypervisor calls (common and
> >> IBM-specific) so it is all hidden in one place - the system firmware,
> >> what is cool - no PHB drivers in the guest. Although MSI would not
> >> need any additional hypercall to init vectors (everything can be done
> >> via config space), there is MSI-X which stores vectors in BAR and
> >> there is no hypercall for BARs as they are simply memory mapped. This
> >> is I think why the firmware people (or phyp but it is probably the
> >> same) added IBM-specific MSI/MSIX config hypercalls.
> > 
> > Well what's wrong with guest doing this through a memory mapped
> > interface?
> 
> 
> Should not guest allocate addresses and program PHB with them?

What are you asking about? How does MSIX work normally?
OS gets the vector (address/data) pairs in some arch specific way
and then programs them into devices. No need for firmware to touch
any devices.

> The idea was to hide PHB details in the system firmware, this is the point.

The actual result is POWER behaves differently from almost any other
architecture.

> >> And I do not quite understand why MSIX people could not use extended
> >> PCI config space which is 4096 bytes, quite a lot, enough to fit 256
> >> vectors (have not seen a card which asked for more than 9 _per
> >> function_). If somebody really needs 2048, he may want 16384 as well
> >> (or any other crazy number), etc, so why did they put such a limit, it
> >> is a BAR, it is huge? :) Ah, off-topic anyway.
> 
> 
> > Well you have just described MSI, just don't use MSIX.
> > 
> > The motivation for MSIX was as follows: PCI/PCI-X config space is not
> > 4096 bytes, it is 256 bytes, and is very crowded. You are thinking of
> > PCI express. 
> 
> MSIX is a PCIe feature, no?

No.

> > Config accesses are also nonposted which means at most one
> > must be in flight. This is not appropriate for vector programming which
> > needs to be done from multiple CPUs in parallel.
> 
> > Also offtopic, please try to avoid these super long lines in mail :).
> 
> Ah. This is from the time when I posted patches via thunderbird and
> disabled wrapping :) Is wrapping at 75 chars ok?

Sure.

> 
> >> 
> >> 
> >>>>>>>> bool msi_enabled(const PCIDevice *dev) { return
> >>>>>>>> msi_present(dev) && diff --git a/hw/msi.h b/hw/msi.h index
> >>>>>>>> 75747ab..6ec1f99 100644 --- a/hw/msi.h +++ b/hw/msi.h @@
> >>>>>>>> -31,6 +31,7 @@ struct MSIMessage {
> >>>>>>>> 
> >>>>>>>> extern bool msi_supported;
> >>>>>>>> 
> >>>>>>>> +void msi_set_message(PCIDevice *dev, MSIMessage msg); 
> >>>>>>>> bool msi_enabled(const PCIDevice *dev); int
> >>>>>>>> msi_init(struct PCIDevice *dev, uint8_t offset, unsigned
> >>>>>>>> int nr_vectors, bool msi64bit, bool msi_per_vector_mask); 
> >>>>>>>> diff --git a/hw/msix.c b/hw/msix.c index ded3c55..5f7d6d3
> >>>>>>>> 100644 --- a/hw/msix.c +++ b/hw/msix.c @@ -45,6 +45,15 @@
> >>>>>>>> static MSIMessage msix_get_message(PCIDevice *dev,
> >>>>>>>> unsigned vector) return msg; }
> >>>>>>>> 
> >>>>>>>> +void msix_set_message(PCIDevice *dev, int vector, struct
> >>>>>>>> MSIMessage msg) +{ +    uint8_t *table_entry =
> >>>>>>>> dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE; + +
> >>>>>>>> pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR,
> >>>>>>>> msg.address); +    pci_set_long(table_entry +
> >>>>>>>> PCI_MSIX_ENTRY_DATA, msg.data); +
> >>>>>>>> table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &=
> >>>>>>>> ~PCI_MSIX_ENTRY_CTRL_MASKBIT; +} + /* Add MSI-X capability
> >>>>>>>> to the config space for the device. */ /* Given a bar and
> >>>>>>>> its size, add MSI-X table on top of it * and fill MSI-X
> >>>>>>>> capability in the config space. diff --git a/hw/msix.h
> >>>>>>>> b/hw/msix.h index 50aee82..26a437e 100644 --- a/hw/msix.h 
> >>>>>>>> +++ b/hw/msix.h @@ -4,6 +4,8 @@ #include "qemu-common.h" 
> >>>>>>>> #include "pci.h"
> >>>>>>>> 
> >>>>>>>> +void msix_set_message(PCIDevice *dev, int vector,
> >>>>>>>> MSIMessage msg); + int msix_init(PCIDevice *pdev, unsigned
> >>>>>>>> short nentries, MemoryRegion *bar, unsigned bar_nr,
> >>>>>>>> unsigned bar_size); -- 1.7.10
> >>>>>>>> 
> >>>>>>>> ps. double '-' and git version is an end-of-patch scissor
> >>>>>>>> as I read somewhere, cannot recall where exactly :)
> >>>>>>>> 
> >>>>>>>> 
> >>>>>>>> 
> >>>>>>>> 
> >>>>>>>> 
> >>>>>>>> 
> >>>>>>>> On 21/06/12 20:56, Jan Kiszka wrote:
> >>>>>>>>> On 2012-06-21 12:50, Alexey Kardashevskiy wrote:
> >>>>>>>>>> On 21/06/12 20:38, Jan Kiszka wrote:
> >>>>>>>>>>> On 2012-06-21 12:28, Alexey Kardashevskiy wrote:
> >>>>>>>>>>>> On 21/06/12 17:39, Jan Kiszka wrote:
> >>>>>>>>>>>>> On 2012-06-21 09:18, Alexey Kardashevskiy
> >>>>>>>>>>>>> wrote:
> >>>>>>>>>>>>>> 
> >>>>>>>>>>>>>> agrhhh. sha1 of the patch changed after
> >>>>>>>>>>>>>> rebasing :)
> >>>>>>>>>>>>>> 
> >>>>>>>>>>>>>> 
> >>>>>>>>>>>>>> 
> >>>>>>>>>>>>>> Added (msi|msix)_(set|get)_message() functions
> >>>>>>>>>>>>>> for whoever might want to use them.
> >>>>>>>>>>>>>> 
> >>>>>>>>>>>>>> Currently msi_notify()/msix_notify() write to
> >>>>>>>>>>>>>> these vectors to signal the guest about an
> >>>>>>>>>>>>>> interrupt so the correct values have to be
> >>>>>>>>>>>>>> written there by the guest or QEMU.
> >>>>>>>>>>>>>> 
> >>>>>>>>>>>>>> For example, POWER guest never initializes
> >>>>>>>>>>>>>> MSI/MSIX vectors, instead it uses RTAS
> >>>>>>>>>>>>>> hypercalls. So in order to support MSIX for
> >>>>>>>>>>>>>> virtio-pci on POWER we have to initialize
> >>>>>>>>>>>>>> MSI/MSIX message from QEMU.
> >>>>>>>>>>>>>> 
> >>>>>>>>>>>>>> As only the set* functions are required for now, the
> >>>>>>>>>>>>>> "get" functions were added or made public for
> >>>>>>>>>>>>>> symmetry.
> >>>>>>>>>>>>>> 
> >>>>>>>>>>>>>> Signed-off-by: Alexey Kardashevskiy
> >>>>>>>>>>>>>> <aik@ozlabs.ru> --- hw/msi.c  |   29
> >>>>>>>>>>>>>> +++++++++++++++++++++++++++++ hw/msi.h  |    2
> >>>>>>>>>>>>>> ++ hw/msix.c |   11 ++++++++++- hw/msix.h |
> >>>>>>>>>>>>>> 3 +++ 4 files changed, 44 insertions(+), 1
> >>>>>>>>>>>>>> deletion(-)
> >>>>>>>>>>>>>> 
> >>>>>>>>>>>>>> diff --git a/hw/msi.c b/hw/msi.c index
> >>>>>>>>>>>>>> 5233204..9ad84a4 100644 --- a/hw/msi.c +++
> >>>>>>>>>>>>>> b/hw/msi.c @@ -105,6 +105,35 @@ static inline
> >>>>>>>>>>>>>> uint8_t msi_pending_off(const PCIDevice* dev,
> >>>>>>>>>>>>>> bool msi64bit) return dev->msi_cap + (msi64bit
> >>>>>>>>>>>>>> ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32); }
> >>>>>>>>>>>>>> 
> >>>>>>>>>>>>>> +MSIMessage msi_get_message(PCIDevice *dev)
> >>>>>>>>>>>>> 
> >>>>>>>>>>>>> MSIMessage msi_get_message(PCIDevice *dev,
> >>>>>>>>>>>>> unsigned vector)
> >>>>>>>>>>>> 
> >>>>>>>>>>>> 
> >>>>>>>>>>>> Who/how/why is going to calculate the vector
> >>>>>>>>>>>> here?
> >>>>>>>>>>>> 
> >>>>>>>>>>>>> 
> >>>>>>>>>>>>>> +{ +    uint16_t flags =
> >>>>>>>>>>>>>> pci_get_word(dev->config +
> >>>>>>>>>>>>>> msi_flags_off(dev)); +    bool msi64bit =
> >>>>>>>>>>>>>> flags & PCI_MSI_FLAGS_64BIT; +    MSIMessage
> >>>>>>>>>>>>>> msg; + +    if (msi64bit) { +
> >>>>>>>>>>>>>> msg.address = pci_get_quad(dev->config +
> >>>>>>>>>>>>>> msi_address_lo_off(dev)); +    } else { +
> >>>>>>>>>>>>>> msg.address = pci_get_long(dev->config +
> >>>>>>>>>>>>>> msi_address_lo_off(dev)); +    } +    msg.data
> >>>>>>>>>>>>>> = pci_get_word(dev->config + msi_data_off(dev,
> >>>>>>>>>>>>>> msi64bit));
> >>>>>>>>>>>>> 
> >>>>>>>>>>>>> And I have this here in addition:
> >>>>>>>>>>>>> 
> >>>>>>>>>>>>> unsigned int nr_vectors =
> >>>>>>>>>>>>> msi_nr_vectors(flags); ...
> >>>>>>>>>>>>> 
> >>>>>>>>>>>>> if (nr_vectors > 1) { msg.data &= ~(nr_vectors -
> >>>>>>>>>>>>> 1); msg.data |= vector; }
> >>>>>>>>>>>>> 
> >>>>>>>>>>>>> See PCI spec and existing code.
> >>>>>>>>>>>> 
> >>>>>>>>>>>> 
> >>>>>>>>>>>> What for? I really do not get it why someone might
> >>>>>>>>>>>> want to read something but not real value. What
> >>>>>>>>>>>> PCI code should I look?
> >>>>>>>>>>> 
> >>>>>>>>>>> I'm not sure what your use case for reading the
> >>>>>>>>>>> message is. For KVM device assignment it is
> >>>>>>>>>>> preparing an alternative message delivery path for
> >>>>>>>>>>> MSI vectors. And for this we will need vector
> >>>>>>>>>>> notifier support for MSI as well. You can check the
> >>>>>>>>>>> MSI-X code for corresponding use cases of 
> >>>>>>>>>>> msix_get_message.
> >>>>>>>>>> 
> >>>>>>>>>>> And when we already have msi_get_message, another
> >>>>>>>>>>> logical use case is msi_notify. See msix.c again.
> >>>>>>>>>> 
> >>>>>>>>>> Aaaa.
> >>>>>>>>>> 
> >>>>>>>>>> I have no case for reading the message. All I need is
> >>>>>>>>>> writing. And I want it public as I want to use it from
> >>>>>>>>>> hw/spapr_pci.c. You suggested to add reading, I added
> >>>>>>>>>> "get" to be _symmetric_ to "set" ("get" returns what
> >>>>>>>>>> "set" wrote). You want a different thing which I can
> >>>>>>>>>> do but it is not msi_get_message(), it is something
> >>>>>>>>>> like msi_prepare_message(MSImessage msg) or 
> >>>>>>>>>> msi_set_vector(uint16_t data) or simply internal
> >>>>>>>>>> kitchen of msi_notify().
> >>>>>>>>>> 
> >>>>>>>>>> Still can do what you suggested, it just does not seem
> >>>>>>>>>> right.
> >>>>>>>>> 
> >>>>>>>>> It is right - when looking at it from a different angle.
> >>>>>>>>> ;)
> >>>>>>>>> 
> >>>>>>>>> I don't mind if you add msi_get_message now or leave
> >>>>>>>>> this to me. Likely the latter is better as you have no
> >>>>>>>>> use case for msi_get_message (and also
> >>>>>>>>> msix_get_message!) outside of their modules, thus we
> >>>>>>>>> should not export those functions anyway.
> >> 
> >> 
> >> -- Alexey
> >> 
> 
> 
> -- 
> Alexey
>
diff mbox

Patch

diff --git a/hw/msi.c b/hw/msi.c
index 5233204..cc6102f 100644
--- a/hw/msi.c
+++ b/hw/msi.c
@@ -105,6 +105,19 @@  static inline uint8_t msi_pending_off(const PCIDevice* dev, bool msi64bit)
     return dev->msi_cap + (msi64bit ? PCI_MSI_PENDING_64 : PCI_MSI_PENDING_32);
 }
 
+void msi_set_message(PCIDevice *dev, MSIMessage msg)
+{
+    uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
+    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
+
+    if (msi64bit) {
+        pci_set_quad(dev->config + msi_address_lo_off(dev), msg.address);
+    } else {
+        pci_set_long(dev->config + msi_address_lo_off(dev), msg.address);
+    }
+    pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data);
+}
+
 bool msi_enabled(const PCIDevice *dev)
 {
     return msi_present(dev) &&
diff --git a/hw/msi.h b/hw/msi.h
index 75747ab..6ec1f99 100644
--- a/hw/msi.h
+++ b/hw/msi.h
@@ -31,6 +31,7 @@  struct MSIMessage {
 
 extern bool msi_supported;
 
+void msi_set_message(PCIDevice *dev, MSIMessage msg);
 bool msi_enabled(const PCIDevice *dev);
 int msi_init(struct PCIDevice *dev, uint8_t offset,
              unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask);
diff --git a/hw/msix.c b/hw/msix.c
index ded3c55..5f7d6d3 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -45,6 +45,15 @@  static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
     return msg;
 }
 
+void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
+{
+    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
+
+    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
+    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
+    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
+}
+
 /* Add MSI-X capability to the config space for the device. */
 /* Given a bar and its size, add MSI-X table on top of it
  * and fill MSI-X capability in the config space.
diff --git a/hw/msix.h b/hw/msix.h
index 50aee82..26a437e 100644
--- a/hw/msix.h
+++ b/hw/msix.h
@@ -4,6 +4,8 @@ 
 #include "qemu-common.h"
 #include "pci.h"
 
+void msix_set_message(PCIDevice *dev, int vector, MSIMessage msg);
+
 int msix_init(PCIDevice *pdev, unsigned short nentries,
               MemoryRegion *bar,
               unsigned bar_nr, unsigned bar_size);