diff mbox series

[2/2] hw/pcie: disable IO port fwd by default for pcie-root-port

Message ID 20170906142658.58298-3-marcel@redhat.com
State New
Headers show
Series hw/pcie: disable IO port fwd by default for pcie-root-port. | expand

Commit Message

Marcel Apfelbaum Sept. 6, 2017, 2:26 p.m. UTC
For most cases the devices attached to PCIe Root Ports
do not need IO ports range, add an 'enable-io-fwd' property
making it false by default, but keeping it true for older machines.

Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
---
 hw/pci-bridge/gen_pcie_root_port.c | 38 ++++++++++++++++++++++++++++++++++++++
 include/hw/compat.h                |  6 +++++-
 2 files changed, 43 insertions(+), 1 deletion(-)

Comments

Eduardo Habkost Sept. 6, 2017, 2:49 p.m. UTC | #1
On Wed, Sep 06, 2017 at 05:26:58PM +0300, Marcel Apfelbaum wrote:
> For most cases the devices attached to PCIe Root Ports
> do not need IO ports range, add an 'enable-io-fwd' property
> making it false by default, but keeping it true for older machines.
> 
> Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
[...]
> @@ -78,6 +111,7 @@ static const VMStateDescription vmstate_rp_dev = {
>  
>  static Property gen_rp_props[] = {
>      DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort, migrate_msix, true),
> +    DEFINE_PROP_BOOL("enable-io-fwd", GenPCIERootPort, enable_io_fwd, false),

There's no "x-" prefix, so I guess we really need to let the user
or management software to set enable-io-fwd=on explicitly on some
cases?
Marcel Apfelbaum Sept. 8, 2017, 11:39 a.m. UTC | #2
On 06/09/2017 17:49, Eduardo Habkost wrote:
> On Wed, Sep 06, 2017 at 05:26:58PM +0300, Marcel Apfelbaum wrote:
>> For most cases the devices attached to PCIe Root Ports
>> do not need IO ports range, add an 'enable-io-fwd' property
>> making it false by default, but keeping it true for older machines.
>>
>> Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
> [...]
>> @@ -78,6 +111,7 @@ static const VMStateDescription vmstate_rp_dev = {
>>   
>>   static Property gen_rp_props[] = {
>>       DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort, migrate_msix, true),
>> +    DEFINE_PROP_BOOL("enable-io-fwd", GenPCIERootPort, enable_io_fwd, false),
> 

Hi Eduardo,

> There's no "x-" prefix, so I guess we really need to let the user
> or management software to set enable-io-fwd=on explicitly on some
> cases?
> 

Definitely. Specifically 2 cases:
  - attach PCIe-PCI bridge to start a legacy PCI hierarchy.
  - assign a device requiring IO ports range.

Thanks,
Marcel
Laszlo Ersek Sept. 19, 2017, 10:15 p.m. UTC | #3
Hi Marcel,

On 09/06/17 16:26, Marcel Apfelbaum wrote:
> For most cases the devices attached to PCIe Root Ports
> do not need IO ports range, add an 'enable-io-fwd' property
> making it false by default, but keeping it true for older machines.
> 
> Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
> ---
>  hw/pci-bridge/gen_pcie_root_port.c | 38 ++++++++++++++++++++++++++++++++++++++
>  include/hw/compat.h                |  6 +++++-
>  2 files changed, 43 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/pci-bridge/gen_pcie_root_port.c b/hw/pci-bridge/gen_pcie_root_port.c
> index cb694d6da5..cbdeb73e2c 100644
> --- a/hw/pci-bridge/gen_pcie_root_port.c
> +++ b/hw/pci-bridge/gen_pcie_root_port.c
> @@ -20,12 +20,26 @@
>  #define GEN_PCIE_ROOT_PORT_AER_OFFSET           0x100
>  #define GEN_PCIE_ROOT_PORT_MSIX_NR_VECTOR       1
>  
> +#define GEN_PCIE_ROOT_PORT(obj) \
> +     OBJECT_CHECK(GenPCIERootPort, (obj), TYPE_GEN_PCIE_ROOT_PORT)
> +#define GEN_PCIE_ROOT_PORT_CLASS(klass) \
> +     OBJECT_CLASS_CHECK(GenPCIERootPortClass, (klass), TYPE_GEN_PCIE_ROOT_PORT)
> +#define GEN_PCIE_ROOT_PORT_GET_CLASS(obj) \
> +     OBJECT_GET_CLASS(GenPCIERootPortClass, (obj), TYPE_GEN_PCIE_ROOT_PORT)
> +
> +typedef struct GenPCIERootPortClass {
> +    PCIERootPortClass parent_class;
> +
> +    DeviceRealize parent_realize;
> +} GenPCIERootPortClass;
> +
>  typedef struct GenPCIERootPort {
>      /*< private >*/
>      PCIESlot parent_obj;
>      /*< public >*/
>  
>      bool migrate_msix;
> +    bool enable_io_fwd;
>  } GenPCIERootPort;
>  
>  static uint8_t gen_rp_aer_vector(const PCIDevice *d)
> @@ -60,6 +74,25 @@ static bool gen_rp_test_migrate_msix(void *opaque, int version_id)
>      return rp->migrate_msix;
>  }
>  
> +static void gen_rp_realize(DeviceState *d, Error **errp)
> +{
> +    GenPCIERootPortClass *grpc = GEN_PCIE_ROOT_PORT_GET_CLASS(d);
> +    GenPCIERootPort *grp = GEN_PCIE_ROOT_PORT(d);
> +    PCIDevice *pci_dev = PCI_DEVICE(d);
> +
> +    grpc->parent_realize(DEVICE(d), errp);
> +    if (*errp) {
> +        return;
> +    }
> +
> +    if (!grp->enable_io_fwd) {
> +        pci_word_test_and_clear_mask(pci_dev->wmask + PCI_COMMAND,
> +                                     PCI_COMMAND_IO);
> +        pci_dev->wmask[PCI_IO_BASE] = 0;
> +        pci_dev->wmask[PCI_IO_LIMIT] = 0;
> +    }
> +}
> +

This function exists now, but with different implementation, from
Aleksandr's commit 226263fb5cda ("hw/pci: add QEMU-specific PCI
capability to the Generic PCI Express Root Port", 2017-08-18).

It seems that we now have "grp->io_reserve" (a numeric quantity, not a
boolean), from property "io-reserve". The default value is -1.

According to the documentation added in c1800a162765 ("docs: update
documentation considering PCIE-PCI bridge", 2017-08-18), the value -1
seems to imply, "If any reservation field is -1 then this kind of
reservation is not needed and must be ignored by firmware".

I think we'll need to refine the definition. Once OVMF starts processing
this capability, the behavior should be compatible with earlier
behavior. Assume that a user sets only "mem-reserve" to something
different from -1, and thus the capability appears. When OVMF sees the
capability, it should be able to tell apart:

- no particular hint about IO space, so continue doing whatever has been
done all this time (default IO space reservation),
- do not request IO space reservation at all,
- a given (positive) size of IO space should be reserved.

So I think:

(a) the above read-only mask setting should be done based on

  (grp->io_reserve == 0)

and the "enable-io-fwd" property should be unnecessary,

(b) the "io-reserve" property should be set to 0 for 2.11 machine types
and onward, and to -1 for 2.10 and earlier (for compatibility),

(c) the documentation in "docs/pcie_pci_bridge.txt" should be updated to
say:
* (-1) --> default firmware behavior (unspecified)
*   0  --> do not reserve
*  >0  --> specific reservation requested

(d) pci_bridge_qemu_reserve_cap_init() should be updated accordingly
(i.e., a conflict exists if both mem_pref_32_reserve and
mem_pref_64_reserve are *positive*),


Second, when determining the reservations in OVMF, I would like to look
only at the capability fields, and not do a read-write-read-write
quadruplet to the IO base/limit registers. Do you agree?

Thanks!
Laszlo


>  static const VMStateDescription vmstate_rp_dev = {
>      .name = "pcie-root-port",
>      .version_id = 1,
> @@ -78,6 +111,7 @@ static const VMStateDescription vmstate_rp_dev = {
>  
>  static Property gen_rp_props[] = {
>      DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort, migrate_msix, true),
> +    DEFINE_PROP_BOOL("enable-io-fwd", GenPCIERootPort, enable_io_fwd, false),
>      DEFINE_PROP_END_OF_LIST()
>  };
>  
> @@ -86,6 +120,7 @@ static void gen_rp_dev_class_init(ObjectClass *klass, void *data)
>      DeviceClass *dc = DEVICE_CLASS(klass);
>      PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
>      PCIERootPortClass *rpc = PCIE_ROOT_PORT_CLASS(klass);
> +    GenPCIERootPortClass *grpc = GEN_PCIE_ROOT_PORT_CLASS(klass);
>  
>      k->vendor_id = PCI_VENDOR_ID_REDHAT;
>      k->device_id = PCI_DEVICE_ID_REDHAT_PCIE_RP;
> @@ -96,6 +131,8 @@ static void gen_rp_dev_class_init(ObjectClass *klass, void *data)
>      rpc->interrupts_init = gen_rp_interrupts_init;
>      rpc->interrupts_uninit = gen_rp_interrupts_uninit;
>      rpc->aer_offset = GEN_PCIE_ROOT_PORT_AER_OFFSET;
> +    grpc->parent_realize = dc->realize;
> +    dc->realize = gen_rp_realize;
>  }
>  
>  static const TypeInfo gen_rp_dev_info = {
> @@ -103,6 +140,7 @@ static const TypeInfo gen_rp_dev_info = {
>      .parent        = TYPE_PCIE_ROOT_PORT,
>      .instance_size = sizeof(GenPCIERootPort),
>      .class_init    = gen_rp_dev_class_init,
> +    .class_size    = sizeof(GenPCIERootPortClass),
>  };
>  
>   static void gen_rp_register_types(void)
> diff --git a/include/hw/compat.h b/include/hw/compat.h
> index 3e101f8f67..843bf4a3a5 100644
> --- a/include/hw/compat.h
> +++ b/include/hw/compat.h
> @@ -2,7 +2,11 @@
>  #define HW_COMPAT_H
>  
>  #define HW_COMPAT_2_10 \
> -    /* empty */
> +    {\
> +        .driver   = "pcie-root-port",\
> +        .property = "enable-io-fwd",\
> +        .value    = "true",\
> +    },
>  
>  #define HW_COMPAT_2_9 \
>      {\
>
Marcel Apfelbaum Sept. 20, 2017, 7:42 a.m. UTC | #4
Hi Laszlo,

On 20/09/2017 1:15, Laszlo Ersek wrote:
> Hi Marcel,
> 
> On 09/06/17 16:26, Marcel Apfelbaum wrote:
>> For most cases the devices attached to PCIe Root Ports
>> do not need IO ports range, add an 'enable-io-fwd' property
>> making it false by default, but keeping it true for older machines.
>>
>> Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
>> ---
>>   hw/pci-bridge/gen_pcie_root_port.c | 38 ++++++++++++++++++++++++++++++++++++++
>>   include/hw/compat.h                |  6 +++++-
>>   2 files changed, 43 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/pci-bridge/gen_pcie_root_port.c b/hw/pci-bridge/gen_pcie_root_port.c
>> index cb694d6da5..cbdeb73e2c 100644
>> --- a/hw/pci-bridge/gen_pcie_root_port.c
>> +++ b/hw/pci-bridge/gen_pcie_root_port.c
>> @@ -20,12 +20,26 @@
>>   #define GEN_PCIE_ROOT_PORT_AER_OFFSET           0x100
>>   #define GEN_PCIE_ROOT_PORT_MSIX_NR_VECTOR       1
>>   
>> +#define GEN_PCIE_ROOT_PORT(obj) \
>> +     OBJECT_CHECK(GenPCIERootPort, (obj), TYPE_GEN_PCIE_ROOT_PORT)
>> +#define GEN_PCIE_ROOT_PORT_CLASS(klass) \
>> +     OBJECT_CLASS_CHECK(GenPCIERootPortClass, (klass), TYPE_GEN_PCIE_ROOT_PORT)
>> +#define GEN_PCIE_ROOT_PORT_GET_CLASS(obj) \
>> +     OBJECT_GET_CLASS(GenPCIERootPortClass, (obj), TYPE_GEN_PCIE_ROOT_PORT)
>> +
>> +typedef struct GenPCIERootPortClass {
>> +    PCIERootPortClass parent_class;
>> +
>> +    DeviceRealize parent_realize;
>> +} GenPCIERootPortClass;
>> +
>>   typedef struct GenPCIERootPort {
>>       /*< private >*/
>>       PCIESlot parent_obj;
>>       /*< public >*/
>>   
>>       bool migrate_msix;
>> +    bool enable_io_fwd;
>>   } GenPCIERootPort;
>>   
>>   static uint8_t gen_rp_aer_vector(const PCIDevice *d)
>> @@ -60,6 +74,25 @@ static bool gen_rp_test_migrate_msix(void *opaque, int version_id)
>>       return rp->migrate_msix;
>>   }
>>   
>> +static void gen_rp_realize(DeviceState *d, Error **errp)
>> +{
>> +    GenPCIERootPortClass *grpc = GEN_PCIE_ROOT_PORT_GET_CLASS(d);
>> +    GenPCIERootPort *grp = GEN_PCIE_ROOT_PORT(d);
>> +    PCIDevice *pci_dev = PCI_DEVICE(d);
>> +
>> +    grpc->parent_realize(DEVICE(d), errp);
>> +    if (*errp) {
>> +        return;
>> +    }
>> +
>> +    if (!grp->enable_io_fwd) {
>> +        pci_word_test_and_clear_mask(pci_dev->wmask + PCI_COMMAND,
>> +                                     PCI_COMMAND_IO);
>> +        pci_dev->wmask[PCI_IO_BASE] = 0;
>> +        pci_dev->wmask[PCI_IO_LIMIT] = 0;
>> +    }
>> +}
>> +
> 

Thanks for the review!

> This function exists now, but with different implementation, from
> Aleksandr's commit 226263fb5cda ("hw/pci: add QEMU-specific PCI
> capability to the Generic PCI Express Root Port", 2017-08-18).
> 

Right, the patches touch the same areas.

> It seems that we now have "grp->io_reserve" (a numeric quantity, not a
> boolean), from property "io-reserve". The default value is -1.
> 
> According to the documentation added in c1800a162765 ("docs: update
> documentation considering PCIE-PCI bridge", 2017-08-18), the value -1
> seems to imply, "If any reservation field is -1 then this kind of
> reservation is not needed and must be ignored by firmware".
> 

This was decided after long long upstream discussions with
different maintainers.

> I think we'll need to refine the definition. Once OVMF starts processing
> this capability, the behavior should be compatible with earlier
> behavior. Assume that a user sets only "mem-reserve" to something
> different from -1, and thus the capability appears. When OVMF sees the
> capability, it should be able to tell apart:
> 
> - no particular hint about IO space, so continue doing whatever has been
> done all this time (default IO space reservation),
> - do not request IO space reservation at all, > - a given (positive) size of IO space should be reserved.
> 
> So I think:
> 
> (a) the above read-only mask setting should be done based on
> 
>    (grp->io_reserve == 0)
> 
> and the "enable-io-fwd" property should be unnecessary,
> 

I really want this to be the case,  but I need to check the
implementation first. The only concern is the semantics.

(grp->io_reserve == 0) hints the firmware to take
max(hint, <default-value>), since we don't want to reserve less
than the IO range needed by existing devices behind the bridge.

The implementation issue might be that Guest Firmware would
take the alignment as default value for an empty root port.
(maybe take it as a bug and "solve" it?)

> (b) the "io-reserve" property should be set to 0 for 2.11 machine types
> and onward, and to -1 for 2.10 and earlier (for compatibility),
> 

Michael asked us to wait a little before changing the default,
you can ask him for more info :)

> (c) the documentation in "docs/pcie_pci_bridge.txt" should be updated to
> say:
> * (-1) --> default firmware behavior (unspecified)
> *   0  --> do not reserve
> *  >0  --> specific reservation requested
> 

Agreed, once I re-check SeaBIOS to confirm behavior.

> (d) pci_bridge_qemu_reserve_cap_init() should be updated accordingly
> (i.e., a conflict exists if both mem_pref_32_reserve and
> mem_pref_64_reserve are *positive*),
> 

I thought we have this check already.

> 
> Second, when determining the reservations in OVMF, I would like to look
> only at the capability fields, and not do a read-write-read-write
> quadruplet to the IO base/limit registers. Do you agree?
> 

Well, as stated before, the semantics for the hint is
max(hint, <sum of all IO/MEM ranges for devices behind the bridge>).
If you can follow it, that would be OK.


Getting back to this patch, setting io-reserve to 0
would require also:

    +      pci_word_test_and_clear_mask(pci_dev->wmask + PCI_COMMAND,
    +                                     PCI_COMMAND_IO);
    +      pci_dev->wmask[PCI_IO_BASE] = 0;
    +      pci_dev->wmask[PCI_IO_LIMIT] = 0;

otherwise the Guest OS would not behave, just be aware of it.


Thanks,
Marcel

> Thanks!
> Laszlo
> 
> 
>>   static const VMStateDescription vmstate_rp_dev = {
>>       .name = "pcie-root-port",
>>       .version_id = 1,
>> @@ -78,6 +111,7 @@ static const VMStateDescription vmstate_rp_dev = {
>>   
>>   static Property gen_rp_props[] = {
>>       DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort, migrate_msix, true),
>> +    DEFINE_PROP_BOOL("enable-io-fwd", GenPCIERootPort, enable_io_fwd, false),
>>       DEFINE_PROP_END_OF_LIST()
>>   };
>>   
>> @@ -86,6 +120,7 @@ static void gen_rp_dev_class_init(ObjectClass *klass, void *data)
>>       DeviceClass *dc = DEVICE_CLASS(klass);
>>       PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
>>       PCIERootPortClass *rpc = PCIE_ROOT_PORT_CLASS(klass);
>> +    GenPCIERootPortClass *grpc = GEN_PCIE_ROOT_PORT_CLASS(klass);
>>   
>>       k->vendor_id = PCI_VENDOR_ID_REDHAT;
>>       k->device_id = PCI_DEVICE_ID_REDHAT_PCIE_RP;
>> @@ -96,6 +131,8 @@ static void gen_rp_dev_class_init(ObjectClass *klass, void *data)
>>       rpc->interrupts_init = gen_rp_interrupts_init;
>>       rpc->interrupts_uninit = gen_rp_interrupts_uninit;
>>       rpc->aer_offset = GEN_PCIE_ROOT_PORT_AER_OFFSET;
>> +    grpc->parent_realize = dc->realize;
>> +    dc->realize = gen_rp_realize;
>>   }
>>   
>>   static const TypeInfo gen_rp_dev_info = {
>> @@ -103,6 +140,7 @@ static const TypeInfo gen_rp_dev_info = {
>>       .parent        = TYPE_PCIE_ROOT_PORT,
>>       .instance_size = sizeof(GenPCIERootPort),
>>       .class_init    = gen_rp_dev_class_init,
>> +    .class_size    = sizeof(GenPCIERootPortClass),
>>   };
>>   
>>    static void gen_rp_register_types(void)
>> diff --git a/include/hw/compat.h b/include/hw/compat.h
>> index 3e101f8f67..843bf4a3a5 100644
>> --- a/include/hw/compat.h
>> +++ b/include/hw/compat.h
>> @@ -2,7 +2,11 @@
>>   #define HW_COMPAT_H
>>   
>>   #define HW_COMPAT_2_10 \
>> -    /* empty */
>> +    {\
>> +        .driver   = "pcie-root-port",\
>> +        .property = "enable-io-fwd",\
>> +        .value    = "true",\
>> +    },
>>   
>>   #define HW_COMPAT_2_9 \
>>       {\
>>
>
Laszlo Ersek Sept. 20, 2017, 11:01 a.m. UTC | #5
On 09/20/17 09:42, Marcel Apfelbaum wrote:
> On 20/09/2017 1:15, Laszlo Ersek wrote:

>> It seems that we now have "grp->io_reserve" (a numeric quantity, not a
>> boolean), from property "io-reserve". The default value is -1.
>>
>> According to the documentation added in c1800a162765 ("docs: update
>> documentation considering PCIE-PCI bridge", 2017-08-18), the value -1
>> seems to imply, "If any reservation field is -1 then this kind of
>> reservation is not needed and must be ignored by firmware".
>>
> 
> This was decided after long long upstream discussions with
> different maintainers.

I know :) I participated in the review, and the documentation patch
noted above carries my R-b.

I didn't mean that the -1 default was wrong, only that we might have to
polish the definition / semantics a bit.

>> I think we'll need to refine the definition. Once OVMF starts processing
>> this capability, the behavior should be compatible with earlier
>> behavior. Assume that a user sets only "mem-reserve" to something
>> different from -1, and thus the capability appears. When OVMF sees the
>> capability, it should be able to tell apart:
>>
>> - no particular hint about IO space, so continue doing whatever has been
>> done all this time (default IO space reservation),
>> - do not request IO space reservation at all, > - a given (positive)
>> size of IO space should be reserved.
>>
>> So I think:
>>
>> (a) the above read-only mask setting should be done based on
>>
>>    (grp->io_reserve == 0)
>>
>> and the "enable-io-fwd" property should be unnecessary,
>>
> 
> I really want this to be the case,  but I need to check the
> implementation first. The only concern is the semantics.
> 
> (grp->io_reserve == 0) hints the firmware to take
> max(hint, <default-value>), since we don't want to reserve less
> than the IO range needed by existing devices behind the bridge.

I agree.

In OVMF, I would translate (io_reserve == 0) into *not* returning any
particular IO reservation hint to the generic PciBusDxe driver, for the
PCI-E port in question. In turn PciBusDxe follows the above "max"
semantics, as far as I remember; in other words, setting io_reserve to
zero wouldn't break existing IO requirements.

Here's a table:

- io_reserve == (-1) --> OVMF returns an IO reservation of 512 bytes
  (which in practice gets rounded up to 4KB). If more IO is needed by
  devices behind the bridge, then that larger value is used. This is the
  current behavior and we should keep it.

- io_reserve == 0 --> OVMF returns no IO reservation at all. If any IO
  is needed by devices behind the bridge, then that value is used.
  Otherwise, no IO is allocated.

- io_reserve > 0 --> OVMF returns this value to PciBusDxe. PciBusDxe
  might round it up. If more IO is needed by devices behind the bridge,
  then that larger value is used.

> The implementation issue might be that Guest Firmware would
> take the alignment as default value for an empty root port.
> (maybe take it as a bug and "solve" it?)

Hm, I don't get this. What do you mean?

> 
>> (b) the "io-reserve" property should be set to 0 for 2.11 machine types
>> and onward, and to -1 for 2.10 and earlier (for compatibility),
>>
> 
> Michael asked us to wait a little before changing the default,
> you can ask him for more info :)

Sure, I'm totally fine with delaying the change to the default value.

> 
>> (c) the documentation in "docs/pcie_pci_bridge.txt" should be updated to
>> say:
>> * (-1) --> default firmware behavior (unspecified)
>> *   0  --> do not reserve
>> *  >0  --> specific reservation requested
>>
> 
> Agreed, once I re-check SeaBIOS to confirm behavior.

BTW, if the OVMF -- more precisely, PciBusDxe -- and SeaBIOS behaviors
turn out to differ significantly in the handling of the values (although
I don't expect it, see above -- last time I looked, the interpretations
were similar), I think it's a possibility (although not optimal) to say
that the interpretation of these values is firmware specific.

We won't know for certain until we write the code and test both firmwares.

> 
>> (d) pci_bridge_qemu_reserve_cap_init() should be updated accordingly
>> (i.e., a conflict exists if both mem_pref_32_reserve and
>> mem_pref_64_reserve are *positive*),
>>
> 
> I thought we have this check already.

We have a check like this, but it doesn't use the exact condition that
I'm suggesting above (emphasis on *positive*). We now have

    if (mem_pref_32_reserve != (uint32_t)-1 &&
        mem_pref_64_reserve != (uint64_t)-1) {
        error_setg(errp,

but after introducing the zero value, this is going to be too strict.
Consider all the possible combinations:

* (-1; -1): default fw behavior, check is OK

* (-1;  0): fw sticks with the default for the first component, picks
            "no reservation" for the second component, check is OK

* (-1; >0): fw can treat this identically to ( 0; >0) -- see below. This
            is justified because, while the first component says "use
            the default", the entire situation of having such a
            capability is new, so the firmware can introduce new ways
            for handling it. The check passes, which is good.

* ( 0;  0): check reports error, but the firmware can handle this case
            fine (no reservation for either component)

* ( 0; >0): check reports error, but the fw can handle this (no
            reservation for the first component, specific reservation
            for the second)

* (>0; >0): conflict, check is OK

(I'm skipping the description of the inverted orderings, such as (0;-1),
they behave similarly.)

So we need to accept 0 as "valid" for either component:

    if (mem_pref_32_reserve > 0 &&
        mem_pref_32_reserve < (uint32_t)-1 &&
        mem_pref_64_reserve > 0 &&
        mem_pref_64_reserve < (uint64_t)-1) {
        error_setg(errp,

> 
>>
>> Second, when determining the reservations in OVMF, I would like to look
>> only at the capability fields, and not do a read-write-read-write
>> quadruplet to the IO base/limit registers. Do you agree?
>>
> 
> Well, as stated before, the semantics for the hint is
> max(hint, <sum of all IO/MEM ranges for devices behind the bridge>).
> If you can follow it, that would be OK.

In OVMF there are two separate questions:
- how to report the requested reservations,
- how to act upon the reported values.

The first question pertains to "platform code", i.e., what I'm going to
implement under OvmfPkg. The second question pertains to "universal
code" (under "MdeModulePkg/Bus/Pci/PciBusDxe"), which is a lot harder to
modify, because it is shipped on a wide range of physical platforms too.

Again, my understanding is that PciBusDxe implements the "max" semantics
that you describe (pertaining to the 2nd question).

My interest is in figuring out the 1st question now, that is, where I
should take the information from that goes into the "reservation
description" that PciBusDxe understands. My preference would be to look
only at the PCIE port in question (i.e. no other PCI devices at all),
and only at said capability in the config space of the PCIE port.

> Getting back to this patch, setting io-reserve to 0
> would require also:
> 
>    +      pci_word_test_and_clear_mask(pci_dev->wmask + PCI_COMMAND,
>    +                                     PCI_COMMAND_IO);
>    +      pci_dev->wmask[PCI_IO_BASE] = 0;
>    +      pci_dev->wmask[PCI_IO_LIMIT] = 0;
> 
> otherwise the Guest OS would not behave, just be aware of it.

Absolutely, no doubt about this.

Thanks,
Laszlo


>>>   static const VMStateDescription vmstate_rp_dev = {
>>>       .name = "pcie-root-port",
>>>       .version_id = 1,
>>> @@ -78,6 +111,7 @@ static const VMStateDescription vmstate_rp_dev = {
>>>     static Property gen_rp_props[] = {
>>>       DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort,
>>> migrate_msix, true),
>>> +    DEFINE_PROP_BOOL("enable-io-fwd", GenPCIERootPort,
>>> enable_io_fwd, false),
>>>       DEFINE_PROP_END_OF_LIST()
>>>   };
>>>   @@ -86,6 +120,7 @@ static void gen_rp_dev_class_init(ObjectClass
>>> *klass, void *data)
>>>       DeviceClass *dc = DEVICE_CLASS(klass);
>>>       PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
>>>       PCIERootPortClass *rpc = PCIE_ROOT_PORT_CLASS(klass);
>>> +    GenPCIERootPortClass *grpc = GEN_PCIE_ROOT_PORT_CLASS(klass);
>>>         k->vendor_id = PCI_VENDOR_ID_REDHAT;
>>>       k->device_id = PCI_DEVICE_ID_REDHAT_PCIE_RP;
>>> @@ -96,6 +131,8 @@ static void gen_rp_dev_class_init(ObjectClass
>>> *klass, void *data)
>>>       rpc->interrupts_init = gen_rp_interrupts_init;
>>>       rpc->interrupts_uninit = gen_rp_interrupts_uninit;
>>>       rpc->aer_offset = GEN_PCIE_ROOT_PORT_AER_OFFSET;
>>> +    grpc->parent_realize = dc->realize;
>>> +    dc->realize = gen_rp_realize;
>>>   }
>>>     static const TypeInfo gen_rp_dev_info = {
>>> @@ -103,6 +140,7 @@ static const TypeInfo gen_rp_dev_info = {
>>>       .parent        = TYPE_PCIE_ROOT_PORT,
>>>       .instance_size = sizeof(GenPCIERootPort),
>>>       .class_init    = gen_rp_dev_class_init,
>>> +    .class_size    = sizeof(GenPCIERootPortClass),
>>>   };
>>>      static void gen_rp_register_types(void)
>>> diff --git a/include/hw/compat.h b/include/hw/compat.h
>>> index 3e101f8f67..843bf4a3a5 100644
>>> --- a/include/hw/compat.h
>>> +++ b/include/hw/compat.h
>>> @@ -2,7 +2,11 @@
>>>   #define HW_COMPAT_H
>>>     #define HW_COMPAT_2_10 \
>>> -    /* empty */
>>> +    {\
>>> +        .driver   = "pcie-root-port",\
>>> +        .property = "enable-io-fwd",\
>>> +        .value    = "true",\
>>> +    },
>>>     #define HW_COMPAT_2_9 \
>>>       {\
>>>
>>
>
Marcel Apfelbaum Sept. 20, 2017, 11:16 a.m. UTC | #6
On 20/09/2017 14:01, Laszlo Ersek wrote:
> On 09/20/17 09:42, Marcel Apfelbaum wrote:
>> On 20/09/2017 1:15, Laszlo Ersek wrote:
> 
>>> It seems that we now have "grp->io_reserve" (a numeric quantity, not a
>>> boolean), from property "io-reserve". The default value is -1.
>>>
>>> According to the documentation added in c1800a162765 ("docs: update
>>> documentation considering PCIE-PCI bridge", 2017-08-18), the value -1
>>> seems to imply, "If any reservation field is -1 then this kind of
>>> reservation is not needed and must be ignored by firmware".
>>>
>>
>> This was decided after long long upstream discussions with
>> different maintainers.
> 
> I know :) I participated in the review, and the documentation patch
> noted above carries my R-b.
> 
> I didn't mean that the -1 default was wrong, only that we might have to
> polish the definition / semantics a bit.
> 
>>> I think we'll need to refine the definition. Once OVMF starts processing
>>> this capability, the behavior should be compatible with earlier
>>> behavior. Assume that a user sets only "mem-reserve" to something
>>> different from -1, and thus the capability appears. When OVMF sees the
>>> capability, it should be able to tell apart:
>>>
>>> - no particular hint about IO space, so continue doing whatever has been
>>> done all this time (default IO space reservation),
>>> - do not request IO space reservation at all, > - a given (positive)
>>> size of IO space should be reserved.
>>>
>>> So I think:
>>>
>>> (a) the above read-only mask setting should be done based on
>>>
>>>     (grp->io_reserve == 0)
>>>
>>> and the "enable-io-fwd" property should be unnecessary,
>>>
>>
>> I really want this to be the case,  but I need to check the
>> implementation first. The only concern is the semantics.
>>
>> (grp->io_reserve == 0) hints the firmware to take
>> max(hint, <default-value>), since we don't want to reserve less
>> than the IO range needed by existing devices behind the bridge.
> 
> I agree.
> 
> In OVMF, I would translate (io_reserve == 0) into *not* returning any
> particular IO reservation hint to the generic PciBusDxe driver, for the
> PCI-E port in question. In turn PciBusDxe follows the above "max"
> semantics, as far as I remember; in other words, setting io_reserve to
> zero wouldn't break existing IO requirements.
> 
> Here's a table:
> 
> - io_reserve == (-1) --> OVMF returns an IO reservation of 512 bytes
>    (which in practice gets rounded up to 4KB). If more IO is needed by
>    devices behind the bridge, then that larger value is used. This is the
>    current behavior and we should keep it.
> 
> - io_reserve == 0 --> OVMF returns no IO reservation at all. If any IO
>    is needed by devices behind the bridge, then that value is used.
>    Otherwise, no IO is allocated.
> 
> - io_reserve > 0 --> OVMF returns this value to PciBusDxe. PciBusDxe
>    might round it up. If more IO is needed by devices behind the bridge,
>    then that larger value is used.
> 
>> The implementation issue might be that Guest Firmware would
>> take the alignment as default value for an empty root port.
>> (maybe take it as a bug and "solve" it?)
> 
> Hm, I don't get this. What do you mean?
> 

Empty PCI Express Root Port with io_reserve = 0 should end with
no IO ports range, right?

But there is some code in SeaBIOS that "aligns" the IO range to 4K,
so we may end up with a 4K range even if io_reserve=0.


>>
>>> (b) the "io-reserve" property should be set to 0 for 2.11 machine types
>>> and onward, and to -1 for 2.10 and earlier (for compatibility),
>>>
>>
>> Michael asked us to wait a little before changing the default,
>> you can ask him for more info :)
> 
> Sure, I'm totally fine with delaying the change to the default value.
> 
>>
>>> (c) the documentation in "docs/pcie_pci_bridge.txt" should be updated to
>>> say:
>>> * (-1) --> default firmware behavior (unspecified)
>>> *   0  --> do not reserve
>>> *  >0  --> specific reservation requested
>>>
>>
>> Agreed, once I re-check SeaBIOS to confirm behavior.
> 
> BTW, if the OVMF -- more precisely, PciBusDxe -- and SeaBIOS behaviors
> turn out to differ significantly in the handling of the values (although
> I don't expect it, see above -- last time I looked, the interpretations
> were similar), I think it's a possibility (although not optimal) to say
> that the interpretation of these values is firmware specific.
> 
> We won't know for certain until we write the code and test both firmwares.
> 
>>
>>> (d) pci_bridge_qemu_reserve_cap_init() should be updated accordingly
>>> (i.e., a conflict exists if both mem_pref_32_reserve and
>>> mem_pref_64_reserve are *positive*),
>>>
>>
>> I thought we have this check already.
> 
> We have a check like this, but it doesn't use the exact condition that
> I'm suggesting above (emphasis on *positive*). We now have
> 
>      if (mem_pref_32_reserve != (uint32_t)-1 &&
>          mem_pref_64_reserve != (uint64_t)-1) {
>          error_setg(errp,
> 
> but after introducing the zero value, this is going to be too strict.
> Consider all the possible combinations:
> 
> * (-1; -1): default fw behavior, check is OK
> 
> * (-1;  0): fw sticks with the default for the first component, picks
>              "no reservation" for the second component, check is OK
> 
> * (-1; >0): fw can treat this identically to ( 0; >0) -- see below. This
>              is justified because, while the first component says "use
>              the default", the entire situation of having such a
>              capability is new, so the firmware can introduce new ways
>              for handling it. The check passes, which is good.
> 
> * ( 0;  0): check reports error, but the firmware can handle this case
>              fine (no reservation for either component)
> 
> * ( 0; >0): check reports error, but the fw can handle this (no
>              reservation for the first component, specific reservation
>              for the second)
> 
> * (>0; >0): conflict, check is OK
> 
> (I'm skipping the description of the inverted orderings, such as (0;-1),
> they behave similarly.)
> 
> So we need to accept 0 as "valid" for either component:
> 
>      if (mem_pref_32_reserve > 0 &&
>          mem_pref_32_reserve < (uint32_t)-1 &&
>          mem_pref_64_reserve > 0 &&
>          mem_pref_64_reserve < (uint64_t)-1) {
>          error_setg(errp,
> 

Got it, you are right. How did we end up with something
complicated again :( ?
(complicated = needs documentation and not straight forward)

>>
>>>
>>> Second, when determining the reservations in OVMF, I would like to look
>>> only at the capability fields, and not do a read-write-read-write
>>> quadruplet to the IO base/limit registers. Do you agree?
>>>
>>
>> Well, as stated before, the semantics for the hint is
>> max(hint, <sum of all IO/MEM ranges for devices behind the bridge>).
>> If you can follow it, that would be OK.
> 
> In OVMF there are two separate questions:
> - how to report the requested reservations,
> - how to act upon the reported values.
> 
> The first question pertains to "platform code", i.e., what I'm going to
> implement under OvmfPkg. The second question pertains to "universal
> code" (under "MdeModulePkg/Bus/Pci/PciBusDxe"), which is a lot harder to
> modify, because it is shipped on a wide range of physical platforms too.
> 
> Again, my understanding is that PciBusDxe implements the "max" semantics
> that you describe (pertaining to the 2nd question).
> 
> My interest is in figuring out the 1st question now, that is, where I
> should take the information from that goes into the "reservation
> description" that PciBusDxe understands. My preference would be to look
> only at the PCIE port in question (i.e. no other PCI devices at all),
> and only at said capability in the config space of the PCIE port.
> 

Sounds OK.

Thanks,
Marcel

>> Getting back to this patch, setting io-reserve to 0
>> would require also:
>>
>>     +      pci_word_test_and_clear_mask(pci_dev->wmask + PCI_COMMAND,
>>     +                                     PCI_COMMAND_IO);
>>     +      pci_dev->wmask[PCI_IO_BASE] = 0;
>>     +      pci_dev->wmask[PCI_IO_LIMIT] = 0;
>>
>> otherwise the Guest OS would not behave, just be aware of it.
> 
> Absolutely, no doubt about this.
> 
> Thanks,
> Laszlo
> 
> 
>>>>    static const VMStateDescription vmstate_rp_dev = {
>>>>        .name = "pcie-root-port",
>>>>        .version_id = 1,
>>>> @@ -78,6 +111,7 @@ static const VMStateDescription vmstate_rp_dev = {
>>>>      static Property gen_rp_props[] = {
>>>>        DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort,
>>>> migrate_msix, true),
>>>> +    DEFINE_PROP_BOOL("enable-io-fwd", GenPCIERootPort,
>>>> enable_io_fwd, false),
>>>>        DEFINE_PROP_END_OF_LIST()
>>>>    };
>>>>    @@ -86,6 +120,7 @@ static void gen_rp_dev_class_init(ObjectClass
>>>> *klass, void *data)
>>>>        DeviceClass *dc = DEVICE_CLASS(klass);
>>>>        PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
>>>>        PCIERootPortClass *rpc = PCIE_ROOT_PORT_CLASS(klass);
>>>> +    GenPCIERootPortClass *grpc = GEN_PCIE_ROOT_PORT_CLASS(klass);
>>>>          k->vendor_id = PCI_VENDOR_ID_REDHAT;
>>>>        k->device_id = PCI_DEVICE_ID_REDHAT_PCIE_RP;
>>>> @@ -96,6 +131,8 @@ static void gen_rp_dev_class_init(ObjectClass
>>>> *klass, void *data)
>>>>        rpc->interrupts_init = gen_rp_interrupts_init;
>>>>        rpc->interrupts_uninit = gen_rp_interrupts_uninit;
>>>>        rpc->aer_offset = GEN_PCIE_ROOT_PORT_AER_OFFSET;
>>>> +    grpc->parent_realize = dc->realize;
>>>> +    dc->realize = gen_rp_realize;
>>>>    }
>>>>      static const TypeInfo gen_rp_dev_info = {
>>>> @@ -103,6 +140,7 @@ static const TypeInfo gen_rp_dev_info = {
>>>>        .parent        = TYPE_PCIE_ROOT_PORT,
>>>>        .instance_size = sizeof(GenPCIERootPort),
>>>>        .class_init    = gen_rp_dev_class_init,
>>>> +    .class_size    = sizeof(GenPCIERootPortClass),
>>>>    };
>>>>       static void gen_rp_register_types(void)
>>>> diff --git a/include/hw/compat.h b/include/hw/compat.h
>>>> index 3e101f8f67..843bf4a3a5 100644
>>>> --- a/include/hw/compat.h
>>>> +++ b/include/hw/compat.h
>>>> @@ -2,7 +2,11 @@
>>>>    #define HW_COMPAT_H
>>>>      #define HW_COMPAT_2_10 \
>>>> -    /* empty */
>>>> +    {\
>>>> +        .driver   = "pcie-root-port",\
>>>> +        .property = "enable-io-fwd",\
>>>> +        .value    = "true",\
>>>> +    },
>>>>      #define HW_COMPAT_2_9 \
>>>>        {\
>>>>
>>>
>>
>
Laszlo Ersek Sept. 20, 2017, 11:35 a.m. UTC | #7
On 09/20/17 13:16, Marcel Apfelbaum wrote:
> On 20/09/2017 14:01, Laszlo Ersek wrote:
>> On 09/20/17 09:42, Marcel Apfelbaum wrote:

>>> The implementation issue might be that Guest Firmware would
>>> take the alignment as default value for an empty root port.
>>> (maybe take it as a bug and "solve" it?)
>>
>> Hm, I don't get this. What do you mean?
>>
> 
> Empty PCI Express Root Port with io_reserve = 0 should end with
> no IO ports range, right?

Yes.

> But there is some code in SeaBIOS that "aligns" the IO range to 4K,
> so we may end up with a 4K range even if io_reserve=0.

Ah.

OK, if this makes a difference, please report your findings :)

>>>> (d) pci_bridge_qemu_reserve_cap_init() should be updated accordingly
>>>> (i.e., a conflict exists if both mem_pref_32_reserve and
>>>> mem_pref_64_reserve are *positive*),
>>>>
>>>
>>> I thought we have this check already.
>>
>> We have a check like this, but it doesn't use the exact condition that
>> I'm suggesting above (emphasis on *positive*). We now have
>>
>>      if (mem_pref_32_reserve != (uint32_t)-1 &&
>>          mem_pref_64_reserve != (uint64_t)-1) {
>>          error_setg(errp,
>>
>> but after introducing the zero value, this is going to be too strict.
>> Consider all the possible combinations:
>>
>> * (-1; -1): default fw behavior, check is OK
>>
>> * (-1;  0): fw sticks with the default for the first component, picks
>>              "no reservation" for the second component, check is OK
>>
>> * (-1; >0): fw can treat this identically to ( 0; >0) -- see below. This
>>              is justified because, while the first component says "use
>>              the default", the entire situation of having such a
>>              capability is new, so the firmware can introduce new ways
>>              for handling it. The check passes, which is good.
>>
>> * ( 0;  0): check reports error, but the firmware can handle this case
>>              fine (no reservation for either component)
>>
>> * ( 0; >0): check reports error, but the fw can handle this (no
>>              reservation for the first component, specific reservation
>>              for the second)
>>
>> * (>0; >0): conflict, check is OK
>>
>> (I'm skipping the description of the inverted orderings, such as (0;-1),
>> they behave similarly.)
>>
>> So we need to accept 0 as "valid" for either component:
>>
>>      if (mem_pref_32_reserve > 0 &&
>>          mem_pref_32_reserve < (uint32_t)-1 &&
>>          mem_pref_64_reserve > 0 &&
>>          mem_pref_64_reserve < (uint64_t)-1) {
>>          error_setg(errp,
>>
> 
> Got it, you are right. How did we end up with something
> complicated again :( ?
> (complicated = needs documentation and not straight forward)

Haha, are you serious? :)

*Nothing* is simple in QEMU; nothing at all. :) In general, software is
complex because of feature requests, and QEMU caters to a million
different (and conflicting) use cases, so you get complexity.

(I know this is trivial, but it sure felt good to write down! :) )

>>>> Second, when determining the reservations in OVMF, I would like to look
>>>> only at the capability fields, and not do a read-write-read-write
>>>> quadruplet to the IO base/limit registers. Do you agree?
>>>>
>>>
>>> Well, as stated before, the semantics for the hint is
>>> max(hint, <sum of all IO/MEM ranges for devices behind the bridge>).
>>> If you can follow it, that would be OK.
>>
>> In OVMF there are two separate questions:
>> - how to report the requested reservations,
>> - how to act upon the reported values.
>>
>> The first question pertains to "platform code", i.e., what I'm going to
>> implement under OvmfPkg. The second question pertains to "universal
>> code" (under "MdeModulePkg/Bus/Pci/PciBusDxe"), which is a lot harder to
>> modify, because it is shipped on a wide range of physical platforms too.
>>
>> Again, my understanding is that PciBusDxe implements the "max" semantics
>> that you describe (pertaining to the 2nd question).
>>
>> My interest is in figuring out the 1st question now, that is, where I
>> should take the information from that goes into the "reservation
>> description" that PciBusDxe understands. My preference would be to look
>> only at the PCIE port in question (i.e. no other PCI devices at all),
>> and only at said capability in the config space of the PCIE port.
>>
> 
> Sounds OK.

Thank you! Looks like I can start writing code for OVMF when I find the
time.

Thanks!
Laszlo
Marcel Apfelbaum Sept. 27, 2017, 10:06 a.m. UTC | #8
Hi Laszlo,

On 20/09/2017 14:01, Laszlo Ersek wrote:
>   We now have
> 
>      if (mem_pref_32_reserve != (uint32_t)-1 &&
>          mem_pref_64_reserve != (uint64_t)-1) {
>          error_setg(errp,
> 
> but after introducing the zero value, this is going to be too strict.
> Consider all the possible combinations:
> 
> * (-1; -1): default fw behavior, check is OK
> 
> * (-1;  0): fw sticks with the default for the first component, picks
>              "no reservation" for the second component, check is OK
> 
> * (-1; >0): fw can treat this identically to ( 0; >0) -- see below. This
>              is justified because, while the first component says "use
>              the default", the entire situation of having such a
>              capability is new, so the firmware can introduce new ways
>              for handling it. The check passes, which is good.
> 
> * ( 0;  0): check reports error, but the firmware can handle this case
>              fine (no reservation for either component)
> 
> * ( 0; >0): check reports error, but the fw can handle this (no
>              reservation for the first component, specific reservation
>              for the second)
> 
> * (>0; >0): conflict, check is OK
> 
> (I'm skipping the description of the inverted orderings, such as (0;-1),
> they behave similarly.)
> 
> So we need to accept 0 as "valid" for either component:
> 
>      if (mem_pref_32_reserve > 0 &&
>          mem_pref_32_reserve < (uint32_t)-1 &&
>          mem_pref_64_reserve > 0 &&
>          mem_pref_64_reserve < (uint64_t)-1) {
>          error_setg(errp,
> 

SeaBIOS  will not allow (0,0) values for pref32 and pref64 fields.
More than that, 0 values are not being taken in consideration.

We may say is a bug and fix it. But taking a step back, we need the
hints to reserve *more* mem range, I am not sure 0 values are 
interesting for mem reservation. Also is also only a hint,
the firmware should decide.

Do we have a concrete scenario where would we want to use (0, non-zero)
or (non-zero, 0) ? (ask for 32/64 prefetch)

Our current semantic is: "we give a hint to the Guest Firmware,
we don't decide. However, giving hints to both pref32 and pref64
is wrong".


Not related: Using 0 value to disable IO works just fine!
No need for disable-IO-fwd property :)

Thanks,
Marcel
Laszlo Ersek Sept. 28, 2017, 7:48 a.m. UTC | #9
On 09/27/17 12:06, Marcel Apfelbaum wrote:
> Hi Laszlo,
> 
> On 20/09/2017 14:01, Laszlo Ersek wrote:
>>   We now have
>>
>>      if (mem_pref_32_reserve != (uint32_t)-1 &&
>>          mem_pref_64_reserve != (uint64_t)-1) {
>>          error_setg(errp,
>>
>> but after introducing the zero value, this is going to be too strict.
>> Consider all the possible combinations:
>>
>> * (-1; -1): default fw behavior, check is OK
>>
>> * (-1;  0): fw sticks with the default for the first component, picks
>>              "no reservation" for the second component, check is OK
>>
>> * (-1; >0): fw can treat this identically to ( 0; >0) -- see below. This
>>              is justified because, while the first component says "use
>>              the default", the entire situation of having such a
>>              capability is new, so the firmware can introduce new ways
>>              for handling it. The check passes, which is good.
>>
>> * ( 0;  0): check reports error, but the firmware can handle this case
>>              fine (no reservation for either component)
>>
>> * ( 0; >0): check reports error, but the fw can handle this (no
>>              reservation for the first component, specific reservation
>>              for the second)
>>
>> * (>0; >0): conflict, check is OK
>>
>> (I'm skipping the description of the inverted orderings, such as (0;-1),
>> they behave similarly.)
>>
>> So we need to accept 0 as "valid" for either component:
>>
>>      if (mem_pref_32_reserve > 0 &&
>>          mem_pref_32_reserve < (uint32_t)-1 &&
>>          mem_pref_64_reserve > 0 &&
>>          mem_pref_64_reserve < (uint64_t)-1) {
>>          error_setg(errp,
>>
> 
> SeaBIOS  will not allow (0,0) values for pref32 and pref64 fields.
> More than that, 0 values are not being taken in consideration.
> 
> We may say is a bug and fix it. But taking a step back, we need the
> hints to reserve *more* mem range, I am not sure 0 values are
> interesting for mem reservation. Also is also only a hint,
> the firmware should decide.
> 
> Do we have a concrete scenario where would we want to use (0, non-zero)
> or (non-zero, 0) ? (ask for 32/64 prefetch)
> 
> Our current semantic is: "we give a hint to the Guest Firmware,
> we don't decide. However, giving hints to both pref32 and pref64
> is wrong".

After having written the OVMF code for this, my understanding is
clearer, and I agree that, for *prefetchable*, distinguishing -1 from 0
is not necessary (-1 means "firmware default (unspecified)", and 0 means
"do not reserve"; and for prefetchable, the two things mean the same in
OVMF). So the current QEMU code should be fine.

> Not related: Using 0 value to disable IO works just fine!
> No need for disable-IO-fwd property :)

Great!

Thanks!
Laszlo
diff mbox series

Patch

diff --git a/hw/pci-bridge/gen_pcie_root_port.c b/hw/pci-bridge/gen_pcie_root_port.c
index cb694d6da5..cbdeb73e2c 100644
--- a/hw/pci-bridge/gen_pcie_root_port.c
+++ b/hw/pci-bridge/gen_pcie_root_port.c
@@ -20,12 +20,26 @@ 
 #define GEN_PCIE_ROOT_PORT_AER_OFFSET           0x100
 #define GEN_PCIE_ROOT_PORT_MSIX_NR_VECTOR       1
 
+#define GEN_PCIE_ROOT_PORT(obj) \
+     OBJECT_CHECK(GenPCIERootPort, (obj), TYPE_GEN_PCIE_ROOT_PORT)
+#define GEN_PCIE_ROOT_PORT_CLASS(klass) \
+     OBJECT_CLASS_CHECK(GenPCIERootPortClass, (klass), TYPE_GEN_PCIE_ROOT_PORT)
+#define GEN_PCIE_ROOT_PORT_GET_CLASS(obj) \
+     OBJECT_GET_CLASS(GenPCIERootPortClass, (obj), TYPE_GEN_PCIE_ROOT_PORT)
+
+typedef struct GenPCIERootPortClass {
+    PCIERootPortClass parent_class;
+
+    DeviceRealize parent_realize;
+} GenPCIERootPortClass;
+
 typedef struct GenPCIERootPort {
     /*< private >*/
     PCIESlot parent_obj;
     /*< public >*/
 
     bool migrate_msix;
+    bool enable_io_fwd;
 } GenPCIERootPort;
 
 static uint8_t gen_rp_aer_vector(const PCIDevice *d)
@@ -60,6 +74,25 @@  static bool gen_rp_test_migrate_msix(void *opaque, int version_id)
     return rp->migrate_msix;
 }
 
+static void gen_rp_realize(DeviceState *d, Error **errp)
+{
+    GenPCIERootPortClass *grpc = GEN_PCIE_ROOT_PORT_GET_CLASS(d);
+    GenPCIERootPort *grp = GEN_PCIE_ROOT_PORT(d);
+    PCIDevice *pci_dev = PCI_DEVICE(d);
+
+    grpc->parent_realize(DEVICE(d), errp);
+    if (*errp) {
+        return;
+    }
+
+    if (!grp->enable_io_fwd) {
+        pci_word_test_and_clear_mask(pci_dev->wmask + PCI_COMMAND,
+                                     PCI_COMMAND_IO);
+        pci_dev->wmask[PCI_IO_BASE] = 0;
+        pci_dev->wmask[PCI_IO_LIMIT] = 0;
+    }
+}
+
 static const VMStateDescription vmstate_rp_dev = {
     .name = "pcie-root-port",
     .version_id = 1,
@@ -78,6 +111,7 @@  static const VMStateDescription vmstate_rp_dev = {
 
 static Property gen_rp_props[] = {
     DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort, migrate_msix, true),
+    DEFINE_PROP_BOOL("enable-io-fwd", GenPCIERootPort, enable_io_fwd, false),
     DEFINE_PROP_END_OF_LIST()
 };
 
@@ -86,6 +120,7 @@  static void gen_rp_dev_class_init(ObjectClass *klass, void *data)
     DeviceClass *dc = DEVICE_CLASS(klass);
     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
     PCIERootPortClass *rpc = PCIE_ROOT_PORT_CLASS(klass);
+    GenPCIERootPortClass *grpc = GEN_PCIE_ROOT_PORT_CLASS(klass);
 
     k->vendor_id = PCI_VENDOR_ID_REDHAT;
     k->device_id = PCI_DEVICE_ID_REDHAT_PCIE_RP;
@@ -96,6 +131,8 @@  static void gen_rp_dev_class_init(ObjectClass *klass, void *data)
     rpc->interrupts_init = gen_rp_interrupts_init;
     rpc->interrupts_uninit = gen_rp_interrupts_uninit;
     rpc->aer_offset = GEN_PCIE_ROOT_PORT_AER_OFFSET;
+    grpc->parent_realize = dc->realize;
+    dc->realize = gen_rp_realize;
 }
 
 static const TypeInfo gen_rp_dev_info = {
@@ -103,6 +140,7 @@  static const TypeInfo gen_rp_dev_info = {
     .parent        = TYPE_PCIE_ROOT_PORT,
     .instance_size = sizeof(GenPCIERootPort),
     .class_init    = gen_rp_dev_class_init,
+    .class_size    = sizeof(GenPCIERootPortClass),
 };
 
  static void gen_rp_register_types(void)
diff --git a/include/hw/compat.h b/include/hw/compat.h
index 3e101f8f67..843bf4a3a5 100644
--- a/include/hw/compat.h
+++ b/include/hw/compat.h
@@ -2,7 +2,11 @@ 
 #define HW_COMPAT_H
 
 #define HW_COMPAT_2_10 \
-    /* empty */
+    {\
+        .driver   = "pcie-root-port",\
+        .property = "enable-io-fwd",\
+        .value    = "true",\
+    },
 
 #define HW_COMPAT_2_9 \
     {\