
[v7,3/3] sPAPR: EEH support for VFIO PCI device

Message ID 1401180670-6420-4-git-send-email-gwshan@linux.vnet.ibm.com
State New

Commit Message

Gavin Shan May 27, 2014, 8:51 a.m. UTC
This patch introduces the EEH RTAS calls on the sPAPR platform and
handles them there. Each sPAPRPHBVFIOState is bound to exactly one
IOMMU group, so it can be regarded as a PE in nature. The PE address
is maintained in sPAPRPHBState and has the default value 0xffffffff
for non-VFIO PHBs. Otherwise, the PE address is derived from the
IOMMU group ID (group ID plus one, since zero is an invalid PE
address), so it can be used to distinguish sPAPRPHBVFIOState from
sPAPRPHBState.

As we're going to support EEH functionality for VFIO PCI devices,
we simply return an error if an EEH RTAS call is routed to a
non-VFIO PHB, whose PE address is 0xffffffff.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 hw/ppc/spapr.c              |   1 +
 hw/ppc/spapr_pci.c          |   1 +
 hw/ppc/spapr_pci_vfio.c     | 335 ++++++++++++++++++++++++++++++++++++++++++++
 include/hw/pci-host/spapr.h |   1 +
 include/hw/ppc/spapr.h      |   1 +
 5 files changed, 339 insertions(+)

Comments

Alexander Graf May 27, 2014, 10:22 p.m. UTC | #1
On 27.05.14 10:51, Gavin Shan wrote:
> This patch introduces the EEH RTAS calls on the sPAPR platform and
> handles them there. Each sPAPRPHBVFIOState is bound to exactly one
> IOMMU group, so it can be regarded as a PE in nature. The PE address
> is maintained in sPAPRPHBState and has the default value 0xffffffff
> for non-VFIO PHBs. Otherwise, the PE address is derived from the
> IOMMU group ID (group ID plus one, since zero is an invalid PE
> address), so it can be used to distinguish sPAPRPHBVFIOState from
> sPAPRPHBState.
>
> As we're going to support EEH functionality for VFIO PCI devices,
> we simply return an error if an EEH RTAS call is routed to a
> non-VFIO PHB, whose PE address is 0xffffffff.
>
> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> ---
>   hw/ppc/spapr.c              |   1 +
>   hw/ppc/spapr_pci.c          |   1 +
>   hw/ppc/spapr_pci_vfio.c     | 335 ++++++++++++++++++++++++++++++++++++++++++++
>   include/hw/pci-host/spapr.h |   1 +
>   include/hw/ppc/spapr.h      |   1 +
>   5 files changed, 339 insertions(+)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index eee0a7c..e394d0c 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1325,6 +1325,7 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args)
>       spapr_pci_msi_init(spapr, SPAPR_PCI_MSI_WINDOW);
>       spapr_pci_rtas_init();
>       spapr_tce_ddw_rtas_init(spapr);
> +    spapr_eeh_rtas_init(spapr);
>   
>       phb = spapr_create_phb(spapr, 0);
>   
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index a9f307a..0f2ac02 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -578,6 +578,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>           return;
>       }
>   
> +    sphb->pe_addr = 0xffffffff;
>       sphb->dtbusname = g_strdup_printf("pci@%" PRIx64, sphb->buid);
>   
>       namebuf = alloca(strlen(sphb->dtbusname) + 32);
> diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
> index decf3dd..5dbddd0 100644
> --- a/hw/ppc/spapr_pci_vfio.c
> +++ b/hw/ppc/spapr_pci_vfio.c

[...]

> +void spapr_eeh_rtas_init(sPAPREnvironment *spapr)
> +{
> +    spapr_rtas_register("ibm,set-eeh-option",
> +                        rtas_ibm_set_eeh_option);
> +    spapr_rtas_register("ibm,get-config-addr-info2",
> +                        rtas_ibm_get_config_addr_info2);
> +    spapr_rtas_register("ibm,read-slot-reset-state2",
> +                        rtas_ibm_read_slot_reset_state2);
> +    spapr_rtas_register("ibm,set-slot-reset",
> +                        rtas_ibm_set_slot_reset);
> +    spapr_rtas_register("ibm,configure-pe",
> +                        rtas_ibm_configure_pe);
> +    spapr_rtas_register("ibm,slot-error-detail",
> +                        rtas_ibm_slot_error_detail);

This goes straight from RTAS to VFIO code. NAK.


Alex
Benjamin Herrenschmidt May 27, 2014, 10:27 p.m. UTC | #2
On Wed, 2014-05-28 at 00:22 +0200, Alexander Graf wrote:
> > +void spapr_eeh_rtas_init(sPAPREnvironment *spapr)
> > +{
> > +    spapr_rtas_register("ibm,set-eeh-option",
> > +                        rtas_ibm_set_eeh_option);
> > +    spapr_rtas_register("ibm,get-config-addr-info2",
> > +                        rtas_ibm_get_config_addr_info2);
> > +    spapr_rtas_register("ibm,read-slot-reset-state2",
> > +                        rtas_ibm_read_slot_reset_state2);
> > +    spapr_rtas_register("ibm,set-slot-reset",
> > +                        rtas_ibm_set_slot_reset);
> > +    spapr_rtas_register("ibm,configure-pe",
> > +                        rtas_ibm_configure_pe);
> > +    spapr_rtas_register("ibm,slot-error-detail",
> > +                        rtas_ibm_slot_error_detail);
> 
> This goes straight from RTAS to VFIO code. NAK.

It would be much appreciated if instead of "NAK" you actually provided
at least a sentence explaining how you think this should go...

In any case, the above isn't the problem, we register rtas functions
called rtas_ibm_*, that's fine.

The problem you have is with what's inside these functions, correct ?
You want some kind of PCI Dev ops... am I understanding properly ?
Instead of having them call the VFIO ioctl's directly.

Cheers,
Ben.
Alexander Graf May 27, 2014, 10:41 p.m. UTC | #3
On 28.05.14 00:27, Benjamin Herrenschmidt wrote:
> On Wed, 2014-05-28 at 00:22 +0200, Alexander Graf wrote:
>>> +void spapr_eeh_rtas_init(sPAPREnvironment *spapr)
>>> +{
>>> +    spapr_rtas_register("ibm,set-eeh-option",
>>> +                        rtas_ibm_set_eeh_option);
>>> +    spapr_rtas_register("ibm,get-config-addr-info2",
>>> +                        rtas_ibm_get_config_addr_info2);
>>> +    spapr_rtas_register("ibm,read-slot-reset-state2",
>>> +                        rtas_ibm_read_slot_reset_state2);
>>> +    spapr_rtas_register("ibm,set-slot-reset",
>>> +                        rtas_ibm_set_slot_reset);
>>> +    spapr_rtas_register("ibm,configure-pe",
>>> +                        rtas_ibm_configure_pe);
>>> +    spapr_rtas_register("ibm,slot-error-detail",
>>> +                        rtas_ibm_slot_error_detail);
>> This goes straight from RTAS to VFIO code. NAK.
> It would be much appreciated if instead of "NAK" you actually provided
> at least a sentence explaining how you think this should go...

You mean the same way I did the last few times?

> In any case, the above isn't the problem, we register rtas functions
> called rtas_ibm_*, that's fine.

They're called rtas_ibm, not rtas_vfio. They simply live in the wrong 
file for starters. Once you start moving them out of *vfio*.c and make 
sure you don't sneak in vfio headers into spapr_rtas.c all the 
abstraction should come naturally.

> The problem you have is with what's inside these functions, correct ?
> You want some kind of PCI Dev ops... am I understanding properly ?
> Instead of having them call the VFIO ioctl's directly.

We have a nice bus hierarchy. The RTAS call tells you the PHB's ID. The 
machine knows about all of its PHBs, so you can find that one.

From the PHB we can then find a PCIDevice by identifier if we received
one in the RTAS call.

If we received a PE, we need to act on a virtual PE object (not 
available yet I think) which then would call at least into each VFIO 
container associated with that guest PE. We don't want to call into 
devices here, since we could have 2 VFIO devices in one PE and don't 
want to do a PE level operation twice.

If we want to simplify things (and I'm fine with that), we can also 
restrict a "guest PE" to only span at most a single VFIO container 
context and unlimited emulated devices. That way we don't need to worry 
about potential side effects.

I can't tell you what the "guest PE" would best be modeled at. Maybe a 
simple list/hash inside the PHB is the answer. But we need to ensure 
that the guest knows about its scope of action. And we need to make sure 
that we maintain abstraction levels to keep at least a minimum level of 
sanity for the reader.
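
For illustration, a list-based "guest PE" hanging off the PHB might look
roughly like this (a sketch only; sPAPRGuestPE and all of its fields are
made-up names, not existing QEMU structures):

   /* Hypothetical "guest PE" record, chained off the owning PHB. */
   typedef struct sPAPRGuestPE {
       uint32_t pe_addr;          /* guest-visible PE address */
       VFIOContainer *container;  /* at most one VFIO container, or NULL */
       GSList *devices;           /* emulated PCIDevice members */
       QLIST_ENTRY(sPAPRGuestPE) list;
   } sPAPRGuestPE;

   /* The PHB itself would then keep something like:
    *     QLIST_HEAD(, sPAPRGuestPE) guest_pes;
    */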


Alex
Gavin Shan May 28, 2014, 4:12 a.m. UTC | #4
On Wed, May 28, 2014 at 12:41:37AM +0200, Alexander Graf wrote:
>On 28.05.14 00:27, Benjamin Herrenschmidt wrote:
>>On Wed, 2014-05-28 at 00:22 +0200, Alexander Graf wrote:

.../...

>>In any case, the above isn't the problem, we register rtas functions
>>called rtas_ibm_*, that's fine.
>
>They're called rtas_ibm, not rtas_vfio. They simply live in the wrong
>file for starters. Once you start moving them out of *vfio*.c and
>make sure you don't sneak in vfio headers into spapr_rtas.c all the
>abstraction should come naturally.
>
>>The problem you have is with what's inside these functions, correct ?
>>You want some kind of PCI Dev ops... am I understanding properly ?
>>Instead of having them call the VFIO ioctl's directly.
>
>We have a nice bus hierarchy. The RTAS call tells you the PHB's ID.
>The machine knows about all of its PHBs, so you can find that one.
>
>From the PHB we can then find a PCIDevice by identifier if we
>received one in the RTAS call.
>
>If we received a PE, we need to act on a virtual PE object (not
>available yet I think) which then would call at least into each VFIO
>container associated with that guest PE. We don't want to call into
>devices here, since we could have 2 VFIO devices in one PE and don't
>want to do a PE level operation twice.
>
>If we want to simplify things (and I'm fine with that), we can also
>restrict a "guest PE" to only span at most a single VFIO container
>context and unlimited emulated devices. That way we don't need to
>worry about potential side effects.
>
>I can't tell you what the "guest PE" would best be modeled at. Maybe
>a simple list/hash inside the PHB is the answer. But we need to
>ensure that the guest knows about its scope of action. And we need to
>make sure that we maintain abstraction levels to keep at least a
>minimum level of sanity for the reader.
>

sPAPRPHBVFIOState has a one-to-one mapping with the container, which
in turn has a one-to-one mapping with an IOMMU group on the sPAPR
platform. So sPAPRPHBVFIOState represents the "guest PE".

Yes. RTAS calls from the guest carry two types of address information
as input: PHB BUID plus the device's PCI config address, or PHB BUID
plus the PE address. After moving the logic that translates a device's
PCI config address to a PE address from the host into QEMU, we don't
have to care much about the first case (PHB BUID plus the device's
PCI config address). That means we only have to care about the "PE
address", which is basically the IOMMU group ID (plus a fixed offset)
of the sPAPRPHBVFIOState. So the BUID can be used to locate the
sPAPRPHBVFIOState, and the "PE address" can do the same thing.

As you mentioned above, spapr_rtas.c can't include "vfio.h". I guess
I have to rework PCIDevice/PCIDeviceClass a bit as follows. Alex,
could you help confirm that this is what you're expecting to see?

- In include/hw/pci/pci.h, add the following callback to PCIDeviceClass

  /* It's basically mapping ioctl commands to another set of macros */
  #define PCI_DEV_EEH_OP_ENABLE    0
  #define PCI_DEV_EEH_OP_DISABLE   1
  #define PCI_DEV_EEH_OP_GET_STATE 2
  #define PCI_DEV_EEH_OP_PE_RESET  3
  #define PCI_DEV_EEH_OP_PE_CONFIGURE 4

  #define PCI_DEV_EEH_OPT_HRSET     0
  #define PCI_DEV_EEH_OPT_FRESET    1
          :
  int (*eeh_handler)(PCIDevice *pdev, int opcode, int option);

- In hw/ppc/spapr_pci_vfio.c, register the EEH RTAS calls;
  linux-headers/vfio.h shouldn't be included there. When an EEH RTAS
  call comes in, find the PE according to the PE address, pick any one
  of the PCI devices it includes and call into eeh_handler(). If the
  input address is a device's PCI config address, find the PCI device
  and call its eeh_handler() callback.

- eeh_handler() should point to some function implemented in hw/misc/vfio.c.
  Let's say "vfio_eeh_handler()", which in turn calls the container's
  ioctl(). I probably have to move the ioctl commands from the container
  to the VFIO PCI device fd as I did in a previous patch.

- eeh_handler() should be NULL for emulated PCI devices, and we just
  return an error for them.
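
A rough sketch of that dispatch (illustrative only; spapr_pe_any_device()
is an assumed helper and eeh_handler is the proposed callback above, not
an existing QEMU API):

  static int spapr_eeh_forward(sPAPRPHBState *sphb, uint32_t addr,
                               bool is_pe_addr, int opcode, int option)
  {
      PCIDevice *pdev;
      PCIDeviceClass *pdc;

      if (is_pe_addr) {
          /* Any device in the PE will do; the handler acts on the PE */
          pdev = spapr_pe_any_device(sphb);
      } else {
          pdev = pci_find_device(sphb->parent_obj.bus,
                                 (addr >> 16) & 0xFF, (addr >> 8) & 0xFF);
      }
      if (!pdev) {
          return -1;
      }

      pdc = PCI_DEVICE_GET_CLASS(pdev);
      if (!pdc->eeh_handler) {
          return -1;  /* emulated device without EEH support */
      }

      return pdc->eeh_handler(pdev, opcode, option);
  }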

Thanks,
Gavin
Alexander Graf May 28, 2014, 11:24 a.m. UTC | #5
On 28.05.14 06:12, Gavin Shan wrote:
> On Wed, May 28, 2014 at 12:41:37AM +0200, Alexander Graf wrote:
>> On 28.05.14 00:27, Benjamin Herrenschmidt wrote:
>>> On Wed, 2014-05-28 at 00:22 +0200, Alexander Graf wrote:
> .../...
>
>>> In any case, the above isn't the problem, we register rtas functions
>>> called rtas_ibm_*, that's fine.
>> They're called rtas_ibm, not rtas_vfio. They simply live in the wrong
>> file for starters. Once you start moving them out of *vfio*.c and
>> make sure you don't sneak in vfio headers into spapr_rtas.c all the
>> abstraction should come naturally.
>>
>>> The problem you have is with what's inside these functions, correct ?
>>> You want some kind of PCI Dev ops... am I understanding properly ?
>>> Instead of having them call the VFIO ioctl's directly.
>> We have a nice bus hierarchy. The RTAS call tells you the PHB's ID.
>> The machine knows about all of its PHBs, so you can find that one.
>>
>> From the PHB we can then find a PCIDevice by identifier if we
>> received one in the RTAS call.
>>
>> If we received a PE, we need to act on a virtual PE object (not
>> available yet I think) which then would call at least into each VFIO
>> container associated with that guest PE. We don't want to call into
>> devices here, since we could have 2 VFIO devices in one PE and don't
>> want to do a PE level operation twice.
>>
>> If we want to simplify things (and I'm fine with that), we can also
>> restrict a "guest PE" to only span at most a single VFIO container
>> context and unlimited emulated devices. That way we don't need to
>> worry about potential side effects.
>>
>> I can't tell you what the "guest PE" would best be modeled at. Maybe
>> a simple list/hash inside the PHB is the answer. But we need to
>> ensure that the guest knows about its scope of action. And we need to
>> make sure that we maintain abstraction levels to keep at least a
>> minimum level of sanity for the reader.
>>
> sPAPRPHBVFIOState has a one-to-one mapping with the container, which
> in turn has a one-to-one mapping with an IOMMU group on the sPAPR
> platform. So sPAPRPHBVFIOState represents the "guest PE".

So each PHB only spans a single PE?

>
> Yes. RTAS calls from the guest carry two types of address information
> as input: PHB BUID plus the device's PCI config address, or PHB BUID
> plus the PE address. After moving the logic that translates a device's
> PCI config address to a PE address from the host into QEMU, we don't
> have to care much about the first case (PHB BUID plus the device's
> PCI config address). That means we only have to care about the "PE
> address", which is basically the IOMMU group ID (plus a fixed offset)
> of the sPAPRPHBVFIOState. So the BUID can be used to locate the
> sPAPRPHBVFIOState, and the "PE address" can do the same thing.

... then we can just ignore the "PE address", no?

>
> As you mentioned above, spapr_rtas.c can't include "vfio.h". I guess
> I have to rework PCIDevice/PCIDeviceClass a bit as follows. Alex,
> could you help confirm that this is what you're expecting to see?

It depends. If you need a PCI device level callback, then yes. If you 
don't need a PCI level callback, just have one in the PHB. I'm not quite 
sure how you can find the "VFIO container" that belongs to that PHB 
though, but I'm sure you'll figure that out :).

>
> - In include/hw/pci/pci.h, add the following callback to PCIDeviceClass
>
>    /* It's basically mapping ioctl commands to another set of macros */
>    #define PCI_DEV_EEH_OP_ENABLE    0
>    #define PCI_DEV_EEH_OP_DISABLE   1
>    #define PCI_DEV_EEH_OP_GET_STATE 2
>    #define PCI_DEV_EEH_OP_PE_RESET  3
>    #define PCI_DEV_EEH_OP_PE_CONFIGURE 4
>
>    #define PCI_DEV_EEH_OPT_HRSET     0
>    #define PCI_DEV_EEH_OPT_FRESET    1
>            :
>    int (*eeh_handler)(PCIDevice *pdev, int opcode, int option);

If we need a PCI device level callback, yes.

>
> - In hw/ppc/spapr_pci_vfio.c, register the EEH RTAS calls;
>    linux-headers/vfio.h shouldn't be included there. When an EEH RTAS
>    call comes in, find the PE according to the PE address, pick any one
>    of the PCI devices it includes and call into eeh_handler(). If the
>    input address is a device's PCI config address, find the PCI device
>    and call its eeh_handler() callback.

Uh. The RTAS flow should go through a few different files that basically 
represent the different layers on a real machine.

The spapr_rtas.c file should register the RTAS calls, as that one 
emulates the RTAS code that would usually run in guest code. That code 
determines and calls into the PHB.

The PHB is implemented in hw/ppc/spapr_pci.c, so that's where the EEH 
calls on the PHB level belong. If the target of a call is a PCI device, 
the PHB looks up the device, its class and calls into an EEH call inside 
the PCI device. If the target is a PE, the PHB needs to call out on the 
TCE entity representing the PE. If that is backed by a VFIO IOMMU, it 
should get called somehow.
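
Sketched in code, that layering might look like this (hedged;
spapr_phb_eeh_set_option() stands in for whatever the PHB-level entry
point ends up being called, and only one RTAS call is shown):

  /* hw/ppc/spapr_rtas.c: emulates the RTAS layer, knows nothing of VFIO */
  static void rtas_ibm_set_eeh_option(PowerPCCPU *cpu, sPAPREnvironment *spapr,
                                      uint32_t token, uint32_t nargs,
                                      target_ulong args, uint32_t nret,
                                      target_ulong rets)
  {
      uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
      sPAPRPHBState *sphb = spapr_find_phb(spapr, buid);

      if (!sphb || nargs != 4 || nret != 1) {
          rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
          return;
      }

      /* Delegate to the PHB layer (hw/ppc/spapr_pci.c and below) */
      rtas_st(rets, 0, spapr_phb_eeh_set_option(sphb, rtas_ld(args, 0),
                                                rtas_ld(args, 3)));
  }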


Alex

> - eeh_handler() should point to some function implemented in hw/misc/vfio.c.
>    Let's say "vfio_eeh_handler()", which in turn calls the container's
>    ioctl(). I probably have to move the ioctl commands from the container
>    to the VFIO PCI device fd as I did in a previous patch.
>
> - eeh_handler() should be NULL for emulated PCI devices, and we just
>    return an error for them.
>
> Thanks,
> Gavin
>
Gavin Shan May 28, 2014, 12:33 p.m. UTC | #6
On Wed, May 28, 2014 at 01:24:15PM +0200, Alexander Graf wrote:
>
>On 28.05.14 06:12, Gavin Shan wrote:
>>On Wed, May 28, 2014 at 12:41:37AM +0200, Alexander Graf wrote:
>>>On 28.05.14 00:27, Benjamin Herrenschmidt wrote:
>>>>On Wed, 2014-05-28 at 00:22 +0200, Alexander Graf wrote:
>>.../...
>>
>>>>In any case, the above isn't the problem, we register rtas functions
>>>>called rtas_ibm_*, that's fine.
>>>They're called rtas_ibm, not rtas_vfio. They simply live in the wrong
>>>file for starters. Once you start moving them out of *vfio*.c and
>>>make sure you don't sneak in vfio headers into spapr_rtas.c all the
>>>abstraction should come naturally.
>>>
>>>>The problem you have is with what's inside these functions, correct ?
>>>>You want some kind of PCI Dev ops... am I understanding properly ?
>>>>Instead of having them call the VFIO ioctl's directly.
>>>We have a nice bus hierarchy. The RTAS call tells you the PHB's ID.
>>>The machine knows about all of its PHBs, so you can find that one.
>>>
>>>From the PHB we can then find a PCIDevice by identifier if we
>>>received one in the RTAS call.
>>>
>>>If we received a PE, we need to act on a virtual PE object (not
>>>available yet I think) which then would call at least into each VFIO
>>>container associated with that guest PE. We don't want to call into
>>>devices here, since we could have 2 VFIO devices in one PE and don't
>>>want to do a PE level operation twice.
>>>
>>>If we want to simplify things (and I'm fine with that), we can also
>>>restrict a "guest PE" to only span at most a single VFIO container
>>>context and unlimited emulated devices. That way we don't need to
>>>worry about potential side effects.
>>>
>>>I can't tell you what the "guest PE" would best be modeled at. Maybe
>>>a simple list/hash inside the PHB is the answer. But we need to
>>>ensure that the guest knows about its scope of action. And we need to
>>>make sure that we maintain abstraction levels to keep at least a
>>>minimum level of sanity for the reader.
>>>
>>sPAPRPHBVFIOState has a one-to-one mapping with the container, which
>>in turn has a one-to-one mapping with an IOMMU group on the sPAPR
>>platform. So sPAPRPHBVFIOState represents the "guest PE".
>
>So each PHB only spans a single PE?
>

Yep.

>>
>>Yes. RTAS calls from the guest carry two types of address information
>>as input: PHB BUID plus the device's PCI config address, or PHB BUID
>>plus the PE address. After moving the logic that translates a device's
>>PCI config address to a PE address from the host into QEMU, we don't
>>have to care much about the first case (PHB BUID plus the device's
>>PCI config address). That means we only have to care about the "PE
>>address", which is basically the IOMMU group ID (plus a fixed offset)
>>of the sPAPRPHBVFIOState. So the BUID can be used to locate the
>>sPAPRPHBVFIOState, and the "PE address" can do the same thing.
>
>... then we can just ignore the "PE address", no?
>

Yes, we can ignore that. But I plan to use it for more sanity checks,
which isn't harmful.

>>
>>As you mentioned above, spapr_rtas.c can't include "vfio.h". I guess
>>I have to rework PCIDevice/PCIDeviceClass a bit as follows. Alex,
>>could you help confirm that this is what you're expecting to see?
>
>It depends. If you need a PCI device level callback, then yes. If you
>don't need a PCI level callback, just have one in the PHB. I'm not
>quite sure how you can find the "VFIO container" that belongs to that
>PHB though, but I'm sure you'll figure that out :).
>

Yeah, I'll have a PHB-level callback after having a discussion with
Alexey, who gave a good idea for the abstraction as follows. Please
let me know if I'm in the wrong direction again, because I don't have
QEMU experience at all:

	RTAS call -> hw/ppc/spapr_pci.c where RTAS calls get registered
                  -> sPAPRPHBClass::rtas_eeh_handler() in hw/ppc/spapr_pci_vfio.c
                  -> vfio_container_ioctl() in hw/misc/vfio.c
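
In code terms that flow could be wired up like this (a sketch; the
rtas_eeh_handler member follows the diagram above, and the
vfio_container_ioctl() signature is assumed rather than final):

  /* include/hw/pci-host/spapr.h: new class-level hook */
  typedef struct sPAPRPHBClass {
      PCIHostBridgeClass parent_class;
      /* existing members elided */
      int (*rtas_eeh_handler)(sPAPRPHBState *sphb, int req, int opt);
  } sPAPRPHBClass;

  /* hw/ppc/spapr_pci_vfio.c: the VFIO-backed implementation */
  static int spapr_phb_vfio_eeh_handler(sPAPRPHBState *sphb, int req, int opt)
  {
      sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);

      /* vfio_container_ioctl() would wrap the container fd's ioctl() */
      return vfio_container_ioctl(&svphb->phb.iommu_as,
                                  svphb->iommugroupid, req, opt);
  }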


If you think it's the right way to go, I'm going to change the code
accordingly and send it to you for review after Alexey's quick scan.
That would save you a bit of time in the future, though it has already
taken you much time.

>>
>>- In include/hw/pci/pci.h, add the following callback to PCIDeviceClass
>>
>>   /* It's basically mapping ioctl commands to another set of macros */
>>   #define PCI_DEV_EEH_OP_ENABLE    0
>>   #define PCI_DEV_EEH_OP_DISABLE   1
>>   #define PCI_DEV_EEH_OP_GET_STATE 2
>>   #define PCI_DEV_EEH_OP_PE_RESET  3
>>   #define PCI_DEV_EEH_OP_PE_CONFIGURE 4
>>
>>   #define PCI_DEV_EEH_OPT_HRSET     0
>>   #define PCI_DEV_EEH_OPT_FRESET    1
>>           :
>>   int (*eeh_handler)(PCIDevice *pdev, int opcode, int option);
>
>If we need a PCI device level callback, yes.
>
>>
>>- In hw/ppc/spapr_pci_vfio.c, register the EEH RTAS calls;
>>   linux-headers/vfio.h shouldn't be included there. When an EEH RTAS
>>   call comes in, find the PE according to the PE address, pick any one
>>   of the PCI devices it includes and call into eeh_handler(). If the
>>   input address is a device's PCI config address, find the PCI device
>>   and call its eeh_handler() callback.
>
>Uh. The RTAS flow should go through a few different files that
>basically represent the different layers on a real machine.
>
>The spapr_rtas.c file should register the RTAS calls, as that one
>emulates the RTAS code that would usually run in guest code. That
>code determines and calls into the PHB.
>
>The PHB is implemented in hw/ppc/spapr_pci.c, so that's where the EEH
>calls on the PHB level belong. If the target of a call is a PCI
>device, the PHB looks up the device, its class and calls into an EEH
>call inside the PCI device. If the target is a PE, the PHB needs to
>call out on the TCE entity representing the PE. If that is backed by
>a VFIO IOMMU, it should get called somehow.
>

Ah, it's almost the same as the idea Alexey shared above. The
difference is that I'm going to have a PHB-level callback, as
sPAPRPHBVFIOState could be regarded as the "PE" :-)

Thanks,
Gavin
Alexander Graf May 28, 2014, 1:16 p.m. UTC | #7
On 28.05.14 14:33, Gavin Shan wrote:
> On Wed, May 28, 2014 at 01:24:15PM +0200, Alexander Graf wrote:
>> On 28.05.14 06:12, Gavin Shan wrote:
>>> On Wed, May 28, 2014 at 12:41:37AM +0200, Alexander Graf wrote:
>>>> On 28.05.14 00:27, Benjamin Herrenschmidt wrote:
>>>>> On Wed, 2014-05-28 at 00:22 +0200, Alexander Graf wrote:
>>> .../...
>>>
>>>>> In any case, the above isn't the problem, we register rtas functions
>>>>> called rtas_ibm_*, that's fine.
>>>> They're called rtas_ibm, not rtas_vfio. They simply live in the wrong
>>>> file for starters. Once you start moving them out of *vfio*.c and
>>>> make sure you don't sneak in vfio headers into spapr_rtas.c all the
>>>> abstraction should come naturally.
>>>>
>>>>> The problem you have is with what's inside these functions, correct ?
>>>>> You want some kind of PCI Dev ops... am I understanding properly ?
>>>>> Instead of having them call the VFIO ioctl's directly.
>>>> We have a nice bus hierarchy. The RTAS call tells you the PHB's ID.
>>>> The machine knows about all of its PHBs, so you can find that one.
>>>>
>>>> From the PHB we can then find a PCIDevice by identifier if we
>>>> received one in the RTAS call.
>>>>
>>>> If we received a PE, we need to act on a virtual PE object (not
>>>> available yet I think) which then would call at least into each VFIO
>>>> container associated with that guest PE. We don't want to call into
>>>> devices here, since we could have 2 VFIO devices in one PE and don't
>>>> want to do a PE level operation twice.
>>>>
>>>> If we want to simplify things (and I'm fine with that), we can also
>>>> restrict a "guest PE" to only span at most a single VFIO container
>>>> context and unlimited emulated devices. That way we don't need to
>>>> worry about potential side effects.
>>>>
>>>> I can't tell you what the "guest PE" would best be modeled at. Maybe
>>>> a simple list/hash inside the PHB is the answer. But we need to
>>>> ensure that the guest knows about its scope of action. And we need to
>>>> make sure that we maintain abstraction levels to keep at least a
>>>> minimum level of sanity for the reader.
>>>>
>>> sPAPRPHBVFIOState has a one-to-one mapping with the container, which
>>> in turn has a one-to-one mapping with an IOMMU group on the sPAPR
>>> platform. So sPAPRPHBVFIOState represents the "guest PE".
>> So each PHB only spans a single PE?
>>
> Yep.
>
>>> Yes. RTAS calls from the guest carry two types of address information
>>> as input: PHB BUID plus the device's PCI config address, or PHB BUID
>>> plus the PE address. After moving the logic that translates a device's
>>> PCI config address to a PE address from the host into QEMU, we don't
>>> have to care much about the first case (PHB BUID plus the device's
>>> PCI config address). That means we only have to care about the "PE
>>> address", which is basically the IOMMU group ID (plus a fixed offset)
>>> of the sPAPRPHBVFIOState. So the BUID can be used to locate the
>>> sPAPRPHBVFIOState, and the "PE address" can do the same thing.
>> ... then we can just ignore the "PE address", no?
>>
> Yes, we can ignore that. But I plan to use it for more sanity checks,
> which isn't harmful.
>
>>> As you mentioned above, spapr_rtas.c can't include "vfio.h". I guess
>>> I have to rework PCIDevice/PCIDeviceClass a bit as follows. Alex,
>>> could you help confirm that this is what you're expecting to see?
>> It depends. If you need a PCI device level callback, then yes. If you
>> don't need a PCI level callback, just have one in the PHB. I'm not
>> quite sure how you can find the "VFIO container" that belongs to that
>> PHB though, but I'm sure you'll figure that out :).
>>
> Yeah, I'll have a PHB-level callback after having a discussion with
> Alexey, who gave a good idea for the abstraction as follows. Please
> let me know if I'm in the wrong direction again, because I don't have
> QEMU experience at all:
>
> 	RTAS call -> hw/ppc/spapr_pci.c where RTAS calls get registered
>                    -> sPAPRPHBClass::rtas_eeh_handler() in hw/ppc/spapr_pci_vfio.c
>                    -> vfio_container_ioctl() in hw/misc/vfio.c

I think that would make sense, yes :).


Alex
Benjamin Herrenschmidt May 28, 2014, 9:54 p.m. UTC | #8
On Wed, 2014-05-28 at 13:24 +0200, Alexander Graf wrote:
> >    int (*eeh_handler)(PCIDevice *pdev, int opcode, int option);
> 
> If we need a PCI device level callback, yes.

I think a PCI device level callback is preferable regardless of whether
the current VFIO creates a PHB for a group/domain/PE or not. Emulated
devices might do things differently, etc.

Cheers,
Ben.
Alexander Graf May 28, 2014, 10:58 p.m. UTC | #9
On 28.05.14 23:54, Benjamin Herrenschmidt wrote:
> On Wed, 2014-05-28 at 13:24 +0200, Alexander Graf wrote:
>>>     int (*eeh_handler)(PCIDevice *pdev, int opcode, int option);
>> If we need a PCI device level callback, yes.
> I think a PCI device level callback is preferable regardless of whether
> the current VFIO creates a PHB for a group/domain/PE or not. Emulated
> devices might do things differently, etc.

So from a layering perspective, the way I grasped it, EEH basically 
operates on a mystical PE level. We basically have

   PHB =n=> PE =m=> Devices

One PE can contain VFIO devices, which all share the same "container", 
as well as emulated devices.

What I think we need to do for emulated EEH is implement it at the PE 
level. What EEH really does is unlink a device's access to memory and 
disallow config space access.

Both of these operations really occur at the PE / PHB level. So we could 
for example have a bit in the PE that says "you're broken". When that 
bit is set, config space reads return -1 and writes become nops. When we 
set that bit, we also disable all IOMMU translations, so devices behind 
that PE become DMA-incapable.

For the VFIO container, we have to notify it when something like this 
happens. So when we set the "you're broken" bit in the virtual PE, we 
have to notify the container to also break any real PE that VFIO devices 
behind that virtual PE are on. The same goes for recovery operations and 
so on.
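
For illustration, the "you're broken" bit could gate config space like
this (a sketch; pe_is_frozen() is a made-up predicate, while
pci_default_read_config()/pci_default_write_config() are the normal
QEMU paths):

  static uint32_t pe_config_read(PCIDevice *pdev, uint32_t addr, int len)
  {
      if (pe_is_frozen(pdev)) {
          return ~0U;  /* frozen PE: config reads return all-ones (-1) */
      }
      return pci_default_read_config(pdev, addr, len);
  }

  static void pe_config_write(PCIDevice *pdev, uint32_t addr,
                              uint32_t val, int len)
  {
      if (pe_is_frozen(pdev)) {
          return;      /* frozen PE: config writes become nops */
      }
      pci_default_write_config(pdev, addr, val, len);
  }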


Today, we don't really model multiple PEs, but assume that there's one 
PE per emulated PHB. So the model becomes easier in the sense that we 
can ignore the PHB -> PE abstraction and just maintain the code and 
state that the PE entity would have inside the PHB device.

Later, if we model it nicely now, it should be trivial to rip apart the 
PHB and PE entities and make them separate.

I hope that makes sense so far, and I hope we're all on the same page as 
to directions here.


Alex

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index eee0a7c..e394d0c 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1325,6 +1325,7 @@  static void ppc_spapr_init(QEMUMachineInitArgs *args)
     spapr_pci_msi_init(spapr, SPAPR_PCI_MSI_WINDOW);
     spapr_pci_rtas_init();
     spapr_tce_ddw_rtas_init(spapr);
+    spapr_eeh_rtas_init(spapr);
 
     phb = spapr_create_phb(spapr, 0);
 
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index a9f307a..0f2ac02 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -578,6 +578,7 @@  static void spapr_phb_realize(DeviceState *dev, Error **errp)
         return;
     }
 
+    sphb->pe_addr = 0xffffffff;
     sphb->dtbusname = g_strdup_printf("pci@%" PRIx64, sphb->buid);
 
     namebuf = alloca(strlen(sphb->dtbusname) + 32);
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index decf3dd..5dbddd0 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -23,6 +23,8 @@ 
  */
 #include <sys/types.h>
 #include <dirent.h>
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
 
 #include "hw/hw.h"
 #include "hw/ppc/spapr.h"
@@ -52,6 +54,12 @@  static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp)
         return;
     }
 
+    /*
+     * The PE address is no longer related to the PCI config address.
+     * However, 0 is an invalid PE address and can't be used.
+     */
+    sphb->pe_addr = svphb->iommugroupid + 1;
+
     ret = vfio_container_spapr_get_info(&svphb->phb.iommu_as,
                                         svphb->iommugroupid,
                                         &info);
@@ -354,3 +362,330 @@  void spapr_tce_ddw_rtas_init(sPAPREnvironment *spapr)
         spapr_rtas_register("ibm,reset-pe-dma-window",
                             rtas_ibm_reset_pe_dma_window);
 }
+
+static bool spapr_vfio_pci_addr_check(sPAPRPHBState *sphb,
+                                      uint32_t addr, bool is_pe_addr)
+{
+    PCIDevice *pdev = NULL;
+
+    if (!sphb || !sphb->parent_obj.bus) {
+        return false;
+    }
+
+    if (!is_pe_addr) {
+        pdev = pci_find_device(sphb->parent_obj.bus,
+                               (addr >> 16) & 0xFF, (addr >> 8) & 0xFF);
+        if (!pdev) {
+            return false;
+        }
+    } else {
+        if (sphb->pe_addr != addr) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static void rtas_ibm_set_eeh_option(PowerPCCPU *cpu,
+                                    sPAPREnvironment *spapr,
+                                    uint32_t token, uint32_t nargs,
+                                    target_ulong args, uint32_t nret,
+                                    target_ulong rets)
+{
+    struct vfio_eeh_pe_set_option option;
+    uint32_t addr;
+    uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    sPAPRPHBState *sphb = spapr_find_phb(spapr, buid);
+    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
+    int32_t fd, ret;
+    bool valid = false;
+
+    if (!sphb || sphb->pe_addr == 0xffffffff) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    if ((nargs != 4) || (nret != 1)) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    option.argsz = sizeof(option);
+    option.option = rtas_ld(args, 3);
+    addr = rtas_ld(args, 0);
+    switch (option.option) {
+    case VFIO_EEH_PE_SET_OPT_ENABLE:
+        valid = spapr_vfio_pci_addr_check(sphb, addr, false);
+        break;
+    case VFIO_EEH_PE_SET_OPT_DISABLE:
+    case VFIO_EEH_PE_SET_OPT_IO:
+    case VFIO_EEH_PE_SET_OPT_DMA:
+        valid = spapr_vfio_pci_addr_check(sphb, addr, true);
+        break;
+    default:
+        valid = false;
+    }
+
+    if (!valid) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    fd = vfio_get_container_fd_by_group_id(svphb->iommugroupid);
+    if (fd < 0) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    ret = ioctl(fd, VFIO_EEH_PE_SET_OPTION, &option);
+    if (ret == 0) {
+        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    } else {
+        rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
+    }
+}
+
+static void rtas_ibm_get_config_addr_info2(PowerPCCPU *cpu,
+                                           sPAPREnvironment *spapr,
+                                           uint32_t token, uint32_t nargs,
+                                           target_ulong args, uint32_t nret,
+                                           target_ulong rets)
+{
+    uint32_t addr, option;
+    uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    sPAPRPHBState *sphb = spapr_find_phb(spapr, buid);
+    bool valid = false;
+
+    if (!sphb || sphb->pe_addr == 0xffffffff) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    if ((nargs != 4) || (nret != 2)) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    addr = rtas_ld(args, 0);
+    option = rtas_ld(args, 3);
+    switch (option) {
+    case VFIO_EEH_PE_GET_ADDRESS:
+        valid = spapr_vfio_pci_addr_check(sphb, addr, false);
+        if (valid) {
+            rtas_st(rets, 1, sphb->pe_addr);
+        }
+
+        break;
+    case VFIO_EEH_PE_GET_MODE:
+        valid = spapr_vfio_pci_addr_check(sphb, addr, false);
+
+        /* We always have shared PE */
+        if (valid) {
+            rtas_st(rets, 1, VFIO_EEH_PE_MODE_SHARED);
+        }
+
+        break;
+    default:
+        valid = false;
+    }
+
+    if (valid) {
+        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    } else {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+    }
+}
+
+static void rtas_ibm_read_slot_reset_state2(PowerPCCPU *cpu,
+                                            sPAPREnvironment *spapr,
+                                            uint32_t token, uint32_t nargs,
+                                            target_ulong args, uint32_t nret,
+                                            target_ulong rets)
+{
+    struct vfio_eeh_pe_get_state get_state;
+    uint32_t addr;
+    uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    sPAPRPHBState *sphb = spapr_find_phb(spapr, buid);
+    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
+    int32_t fd, ret;
+    bool valid = false;
+
+    if (!sphb || sphb->pe_addr == 0xffffffff) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    if ((nargs != 3) || (nret != 4 && nret != 5)) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    get_state.argsz = sizeof(get_state);
+    addr = rtas_ld(args, 0);
+    valid = spapr_vfio_pci_addr_check(sphb, addr, true);
+    if (!valid) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    fd = vfio_get_container_fd_by_group_id(svphb->iommugroupid);
+    if (fd < 0) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    ret = ioctl(fd, VFIO_EEH_PE_GET_STATE, &get_state);
+    if (ret == 0) {
+        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+        rtas_st(rets, 1, get_state.state);
+        rtas_st(rets, 2, 1);
+        rtas_st(rets, 3, 1000);
+        if (nret >= 5) {
+            rtas_st(rets, 4, 0);
+        }
+    } else {
+        rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
+    }
+}
+
+static void rtas_ibm_set_slot_reset(PowerPCCPU *cpu,
+                                    sPAPREnvironment *spapr,
+                                    uint32_t token, uint32_t nargs,
+                                    target_ulong args, uint32_t nret,
+                                    target_ulong rets)
+{
+    struct vfio_eeh_pe_reset pe_reset;
+    uint32_t addr;
+    uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    sPAPRPHBState *sphb = spapr_find_phb(spapr, buid);
+    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
+    int32_t fd, ret;
+    bool valid = false;
+
+    if (!sphb || sphb->pe_addr == 0xffffffff) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    if ((nargs != 4) || (nret != 1)) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    pe_reset.argsz = sizeof(pe_reset);
+    pe_reset.option = rtas_ld(args, 3);
+    addr = rtas_ld(args, 0);
+    switch (pe_reset.option) {
+    case VFIO_EEH_PE_RESET_DEACTIVATE:
+    case VFIO_EEH_PE_RESET_HOT:
+    case VFIO_EEH_PE_RESET_FUNDAMENTAL:
+        valid = spapr_vfio_pci_addr_check(sphb, addr, true);
+        break;
+    default:
+        valid = false;
+    }
+
+    if (!valid) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    fd = vfio_get_container_fd_by_group_id(svphb->iommugroupid);
+    if (fd < 0) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    ret = ioctl(fd, VFIO_EEH_PE_RESET, &pe_reset);
+    if (ret == 0) {
+        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    } else {
+        rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
+    }
+}
+
+static void rtas_ibm_configure_pe(PowerPCCPU *cpu,
+                                  sPAPREnvironment *spapr,
+                                  uint32_t token, uint32_t nargs,
+                                  target_ulong args, uint32_t nret,
+                                  target_ulong rets)
+{
+    uint32_t addr;
+    uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    struct vfio_eeh_pe_configure configure;
+    sPAPRPHBState *sphb = spapr_find_phb(spapr, buid);
+    sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
+    int32_t fd, ret;
+    bool valid = false;
+
+    if (!sphb || sphb->pe_addr == 0xffffffff) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    if ((nargs != 3) || (nret != 1)) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    configure.argsz = sizeof(configure);
+    addr = rtas_ld(args, 0);
+    valid = spapr_vfio_pci_addr_check(sphb, addr, true);
+    if (!valid) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    fd = vfio_get_container_fd_by_group_id(svphb->iommugroupid);
+    if (fd < 0) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    ret = ioctl(fd, VFIO_EEH_PE_CONFIGURE, &configure);
+    if (ret == 0) {
+        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    } else {
+        rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
+    }
+}
+
+static void rtas_ibm_slot_error_detail(PowerPCCPU *cpu,
+                                       sPAPREnvironment *spapr,
+                                       uint32_t token, uint32_t nargs,
+                                       target_ulong args, uint32_t nret,
+                                       target_ulong rets)
+{
+    int option;
+
+    /* Full support will come later; for now just validate the arguments */
+    if ((nargs != 8) || (nret != 1)) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    option = rtas_ld(args, 7);
+    if (option != 1 && option != 2) {
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+}
+
+void spapr_eeh_rtas_init(sPAPREnvironment *spapr)
+{
+    spapr_rtas_register("ibm,set-eeh-option",
+                        rtas_ibm_set_eeh_option);
+    spapr_rtas_register("ibm,get-config-addr-info2",
+                        rtas_ibm_get_config_addr_info2);
+    spapr_rtas_register("ibm,read-slot-reset-state2",
+                        rtas_ibm_read_slot_reset_state2);
+    spapr_rtas_register("ibm,set-slot-reset",
+                        rtas_ibm_set_slot_reset);
+    spapr_rtas_register("ibm,configure-pe",
+                        rtas_ibm_configure_pe);
+    spapr_rtas_register("ibm,slot-error-detail",
+                        rtas_ibm_slot_error_detail);
+}
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index a695f12..d6d165c 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -58,6 +58,7 @@  typedef struct sPAPRPHBState {
 
     int32_t index;
     uint64_t buid;
+    uint32_t pe_addr;
     char *dtbusname;
 
     MemoryRegion memspace, iospace;
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index f1d307f..7cf352b 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -406,6 +406,7 @@  struct sPAPRTCETable {
 
 sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn);
 void spapr_tce_ddw_rtas_init(sPAPREnvironment *spapr);
+void spapr_eeh_rtas_init(sPAPREnvironment *spapr);
 void spapr_events_init(sPAPREnvironment *spapr);
 void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq);
 sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn,