diff mbox

[5/6,wip] tseg, part1, not (yet) tested

Message ID 1429521560-2743-5-git-send-email-kraxel@redhat.com
State New
Headers show

Commit Message

Gerd Hoffmann April 20, 2015, 9:19 a.m. UTC
route access to tseg into nowhere when enabled,
for both cpus and busmaster dma.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 hw/pci-host/q35.c         | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/pci-host/q35.h |  1 +
 2 files changed, 58 insertions(+)

Comments

Paolo Bonzini April 20, 2015, 11:45 a.m. UTC | #1
On 20/04/2015 11:19, Gerd Hoffmann wrote:
> +    memory_region_del_subregion(mch->system_memory, &mch->tseg_blackhole);
> +    memory_region_set_enabled(&mch->tseg_blackhole, tseg_size);

Please use "tseg_size > 0" here.

Paolo

> +    memory_region_set_size(&mch->tseg_blackhole, tseg_size);
> +    memory_region_add_subregion_overlap(mch->system_memory,
> +                                        mch->below_4g_mem_size - tseg_size,
> +                                        &mch->tseg_blackhole, 1);
Laszlo Ersek April 21, 2015, 2:18 p.m. UTC | #2
In general, commit messages for the series would be appreciated by the
uninitiated :)

Then,

On 04/20/15 11:19, Gerd Hoffmann wrote:
> route access to tseg into nowhere when enabled,
> for both cpus and busmaster dma.
> 
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>  hw/pci-host/q35.c         | 57 +++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/pci-host/q35.h |  1 +
>  2 files changed, 58 insertions(+)
> 
> diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
> index f0d840c..412ff0a 100644
> --- a/hw/pci-host/q35.c
> +++ b/hw/pci-host/q35.c
> @@ -198,6 +198,28 @@ static const TypeInfo q35_host_info = {
>   * MCH D0:F0
>   */
>  
> +static uint64_t tseg_blackhole_read(void *ptr, hwaddr reg, unsigned size)
> +{
> +    return 0xffffffff;
> +}
> +
> +static void tseg_blackhole_write(void *opaque, hwaddr addr, uint64_t val,
> +                                 unsigned width)
> +{
> +    /* nothing */
> +}
> +
> +static const MemoryRegionOps tseg_blackhole_ops = {
> +    .read = tseg_blackhole_read,
> +    .write = tseg_blackhole_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .valid.min_access_size = 1,
> +    .valid.max_access_size = 4,
> +    .impl.min_access_size = 4,
> +    .impl.max_access_size = 4,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +};

(a) you've specified .endianness twice (with inconsistent initializers
at that, although the value returned by the accessor happens to be
unaffected).

(b) Why are 8-byte accesses forbidden? ... Hm, actually, the above
doesn't forbid it, it just causes qemu to split up the access. Should be
fine.

> +
>  /* PCIe MMCFG */
>  static void mch_update_pciexbar(MCHPCIState *mch)
>  {
> @@ -267,6 +289,7 @@ static void mch_update_smram(MCHPCIState *mch)
>  {
>      PCIDevice *pd = PCI_DEVICE(mch);
>      bool h_smrame = (pd->config[MCH_HOST_BRIDGE_ESMRAMC] & MCH_HOST_BRIDGE_ESMRAMC_H_SMRAME);
> +    uint32_t tseg_size;
>  
>      /* implement SMRAM.D_LCK */
>      if (pd->config[MCH_HOST_BRIDGE_SMRAM] & MCH_HOST_BRIDGE_SMRAM_D_LCK) {
> @@ -296,6 +319,32 @@ static void mch_update_smram(MCHPCIState *mch)
>          memory_region_set_enabled(&mch->high_smram, false);
>      }
>  
> +    if (pd->config[MCH_HOST_BRIDGE_ESMRAMC] & MCH_HOST_BRIDGE_ESMRAMC_T_EN) {
> +        switch (pd->config[MCH_HOST_BRIDGE_ESMRAMC] &
> +                MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_MASK) {
> +        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_1MB:
> +            tseg_size = 1024 * 1024;
> +            break;
> +        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_2MB:
> +            tseg_size = 1024 * 1024 * 2;
> +            break;
> +        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_8MB:
> +            tseg_size = 1024 * 1024 * 8;
> +            break;
> +        default:
> +            tseg_size = 0;
> +            break;
> +        }
> +    } else {
> +        tseg_size = 0;
> +    }

- so, is the guest supposed to select the tseg size by writing this
register?

- I assume this register is not reprogrammable once SMRAM is locked --
is that correct?

- can the guest somehow use this facility to detect, dynamically, the
presence of this feature? (For now I'm thinking about a static build
flag for OVMF that would enable SMM support, but I'm 99% sure Jordan
will object and ask for a dynamic feature detection.)

> +    memory_region_del_subregion(mch->system_memory, &mch->tseg_blackhole);

Hmm. I think this API is for something else. It is not for hole punching.

... Ah, okay, you *are* removing the previously configured blackhole
region (ie. you're not hole punching), I see. But, does this work if
tseg_blackhole is still all zeroes? (Ie. this is the first invocation.)

> +    memory_region_set_enabled(&mch->tseg_blackhole, tseg_size);
> +    memory_region_set_size(&mch->tseg_blackhole, tseg_size);
> +    memory_region_add_subregion_overlap(mch->system_memory,
> +                                        mch->below_4g_mem_size - tseg_size,
> +                                        &mch->tseg_blackhole, 1);

So, does tseg overlay the last few MB of the RAM below 4GB? (Ie. right
before the PCI hole starts?)

> +
>      memory_region_transaction_commit();
>  }
>  
> @@ -443,6 +492,14 @@ static void mch_realize(PCIDevice *d, Error **errp)
>      object_property_add_alias(qdev_get_machine(), "smram",
>                                OBJECT(&mch->smram), ".", NULL);
>  
> +    memory_region_init_io(&mch->tseg_blackhole, OBJECT(mch),
> +                          &tseg_blackhole_ops, NULL,
> +                          "tseg-blackhole", 0);

Okay, this answers one of the above, tseg_blackhole is never uninitialized.

> +    memory_region_set_enabled(&mch->tseg_blackhole, false);
> +    memory_region_add_subregion_overlap(mch->system_memory,
> +                                        mch->below_4g_mem_size,
> +                                        &mch->tseg_blackhole, 1);

The address (2nd argument) is inconsistent with the one in
mch_update_smram() -- this function call places the tseg black hole at
the beginning of the 32-bit PCI hole, not just below it.

(OTOH it doesn't really matter, because the region is disabled. Still,
it would be easier to understand.)

... I'm thinking loud about what this means for the OVMF memory space
map... The central function we use in PlatformPei is
GetSystemMemorySizeBelow4gb(), which reads the CMOS (0x34/0x35). That
gives us "below_4g_mem_size" in OVMF.

Currently that value is used for three purposes:
(a) place the permanent PEI RAM so that it ends at the top of
"below_4g_mem_size"; PublishPeiMemory()
(b) create one of the memory HOBs, QemuInitializeRam(),
(c) create the MMIO HOB that describes the 32-bit PCI hole,
    MemMapInitialization()

If the last 1 / 2 / 8 MB of the low RAM can't be used as general purpose
RAM, then (a) is affected (it should simply be sunk by the tseg size of
choice); (b) is affected similarly (just decrease the top of the memory
HOB); (c) is not affected (the start of the low PCI hole remains
unchanged). Okay...

> +
>      init_pam(DEVICE(mch), mch->ram_memory, mch->system_memory,
>               mch->pci_address_space, &mch->pam_regions[0],
>               PAM_BIOS_BASE, PAM_BIOS_SIZE);
> diff --git a/include/hw/pci-host/q35.h b/include/hw/pci-host/q35.h
> index 61bfe7e..ba64c70 100644
> --- a/include/hw/pci-host/q35.h
> +++ b/include/hw/pci-host/q35.h
> @@ -55,6 +55,7 @@ typedef struct MCHPCIState {
>      PAMMemoryRegion pam_regions[13];
>      MemoryRegion smram_region, open_high_smram;
>      MemoryRegion smram, low_smram, high_smram;
> +    MemoryRegion tseg_blackhole;
>      PcPciInfo pci_info;
>      ram_addr_t below_4g_mem_size;
>      ram_addr_t above_4g_mem_size;
> 

Thanks!
Laszlo
Gerd Hoffmann April 21, 2015, 3:04 p.m. UTC | #3
> > +static const MemoryRegionOps tseg_blackhole_ops = {
> > +    .read = tseg_blackhole_read,
> > +    .write = tseg_blackhole_write,
> > +    .endianness = DEVICE_NATIVE_ENDIAN,
> > +    .valid.min_access_size = 1,
> > +    .valid.max_access_size = 4,
> > +    .impl.min_access_size = 4,
> > +    .impl.max_access_size = 4,
> > +    .endianness = DEVICE_LITTLE_ENDIAN,
> > +};
> 
> (a) you've specified .endianness twice (with inconsistent initializers
> at that, although the value returned by the accessor happens to be
> unaffected).

Oops, cut+paste bug, will fix (/me wonders why gcc didn't throw an error
on that one).

> (b) Why are 8-byte accesses forbidden? ...

just a matter of setting .valid.max_access_size = 8, can do that of
course.

> > +
> >  /* PCIe MMCFG */
> >  static void mch_update_pciexbar(MCHPCIState *mch)
> >  {
> > @@ -267,6 +289,7 @@ static void mch_update_smram(MCHPCIState *mch)
> >  {
> >      PCIDevice *pd = PCI_DEVICE(mch);
> >      bool h_smrame = (pd->config[MCH_HOST_BRIDGE_ESMRAMC] & MCH_HOST_BRIDGE_ESMRAMC_H_SMRAME);
> > +    uint32_t tseg_size;
> >  
> >      /* implement SMRAM.D_LCK */
> >      if (pd->config[MCH_HOST_BRIDGE_SMRAM] & MCH_HOST_BRIDGE_SMRAM_D_LCK) {
> > @@ -296,6 +319,32 @@ static void mch_update_smram(MCHPCIState *mch)
> >          memory_region_set_enabled(&mch->high_smram, false);
> >      }
> >  
> > +    if (pd->config[MCH_HOST_BRIDGE_ESMRAMC] & MCH_HOST_BRIDGE_ESMRAMC_T_EN) {
> > +        switch (pd->config[MCH_HOST_BRIDGE_ESMRAMC] &
> > +                MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_MASK) {
> > +        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_1MB:
> > +            tseg_size = 1024 * 1024;
> > +            break;
> > +        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_2MB:
> > +            tseg_size = 1024 * 1024 * 2;
> > +            break;
> > +        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_8MB:
> > +            tseg_size = 1024 * 1024 * 8;
> > +            break;
> > +        default:
> > +            tseg_size = 0;
> > +            break;
> > +        }
> > +    } else {
> > +        tseg_size = 0;
> > +    }
> 
> - so, is the guest supposed to select the tseg size by writing this
> register?

Correct.

> - I assume this register is not reprogrammable once SMRAM is locked --
> is that correct?

Correct.

> - can the guest somehow use this facility to detect, dynamically, the
> presence of this feature? (For now I'm thinking about a static build
> flag for OVMF that would enable SMM support, but I'm 99% sure Jordan
> will object and ask for a dynamic feature detection.)

Hmm.  I think if we merge all the smm / smram / tseg patches in one go
this should work.  Without patches reading the ESMRAMC register returns
zero.  With the patches the three cache-disable bits are hardcoded to
'1'.  This should work for detecting tseg support.

You also have to test for smm support.  The current protocol for this is
that seabios checks whenever smm is already initialized (see
*_apmc_smm_setup() in seabios/src/fw/smm.c) and if so it skips smm
initialization.  Right now qemu sets the bit on reset when running on
kvm, so seabios doesn't try to use smm.  On tcg the bit is clear after
reset and seabios actually uses smm mode.

> > +    memory_region_set_enabled(&mch->tseg_blackhole, tseg_size);
> > +    memory_region_set_size(&mch->tseg_blackhole, tseg_size);
> > +    memory_region_add_subregion_overlap(mch->system_memory,
> > +                                        mch->below_4g_mem_size - tseg_size,
> > +                                        &mch->tseg_blackhole, 1);
> 
> So, does tseg overlay the last few MB of the RAM below 4GB? (Ie. right
> before the PCI hole starts?)

tseg is just normal ram (yes, located at the end of memory), but (once
tseg is enabled) only cpus in smm mode are allowed to access it.
Likewise busmaster dma access is rejected, so non-smm code can't use the
sata controller to access this indirectly.

> > +    memory_region_init_io(&mch->tseg_blackhole, OBJECT(mch),
> > +                          &tseg_blackhole_ops, NULL,
> > +                          "tseg-blackhole", 0);
> 
> Okay, this answers one of the above, tseg_blackhole is never uninitialized.
> 
> > +    memory_region_set_enabled(&mch->tseg_blackhole, false);
> > +    memory_region_add_subregion_overlap(mch->system_memory,
> > +                                        mch->below_4g_mem_size,
> > +                                        &mch->tseg_blackhole, 1);
> 
> The address (2nd argument) is inconsistent with the one in
> mch_update_smram() -- this function call places the tseg black hole at
> the beginning of the 32-bit PCI hole, not just below it.

It's a zero-length hole here, therefore it actually is the same.  But I
didn't bother to use "mch->below_4g_mem_size - 0" as start address to
make that more clear ;)

> (a) place the permanent PEI RAM so that it ends at the top of
> "below_4g_mem_size"; PublishPeiMemory()
> (b) create one of the memory HOBs, QemuInitializeRam(),
> (c) create the MMIO HOB that describes the 32-bit PCI hole,
>     MemMapInitialization()
> 
> If the last 1 / 2 / 8 MB of the low RAM can't be used as general purpose
> RAM, then (a) is affected (it should simply be sunk by the tseg size of
> choice); (b) is affected similarly (just decrease the top of the memory
> HOB);

Yes.

cheers,
  Gerd
Paolo Bonzini April 21, 2015, 3:08 p.m. UTC | #4
On 21/04/2015 17:04, Gerd Hoffmann wrote:
> Hmm.  I think if we merge all the smm / smram / tseg patches in one go
> this should work.  Without patches reading the ESMRAMC register returns
> zero.  With the patches the three cache-disable bits are hardcoded to
> '1'.  This should work for detecting tseg support.
> 
> You also have to test for smm support.  The current protocol for this is
> that seabios checks whenever smm is already initialized (see
> *_apmc_smm_setup() in seabios/src/fw/smm.c) and if so it skips smm
> initialization.  Right now qemu sets the bit on reset when running on
> kvm, so seabios doesn't try to use smm.  On tcg the bit is clear after
> reset and seabios actually uses smm mode.

BTW, I plan to add "-machine smm=yes/no/auto".

Gerd, are you going to implement SMI_EN locking as well?

Paolo
Gerd Hoffmann April 21, 2015, 3:16 p.m. UTC | #5
On Di, 2015-04-21 at 17:08 +0200, Paolo Bonzini wrote:
> 
> On 21/04/2015 17:04, Gerd Hoffmann wrote:
> > Hmm.  I think if we merge all the smm / smram / tseg patches in one go
> > this should work.  Without patches reading the ESMRAMC register returns
> > zero.  With the patches the three cache-disable bits are hardcoded to
> > '1'.  This should work for detecting tseg support.
> > 
> > You also have to test for smm support.  The current protocol for this is
> > that seabios checks whenever smm is already initialized (see
> > *_apmc_smm_setup() in seabios/src/fw/smm.c) and if so it skips smm
> > initialization.  Right now qemu sets the bit on reset when running on
> > kvm, so seabios doesn't try to use smm.  On tcg the bit is clear after
> > reset and seabios actually uses smm mode.
> 
> BTW, I plan to add "-machine smm=yes/no/auto".
> 
> Gerd, are you going to implement SMI_EN locking as well?

Sure, can do that while being at it.

cheers,
  Gerd
Laszlo Ersek April 21, 2015, 6:46 p.m. UTC | #6
On 04/21/15 17:04, Gerd Hoffmann wrote:
>>> +static const MemoryRegionOps tseg_blackhole_ops = {
>>> +    .read = tseg_blackhole_read,
>>> +    .write = tseg_blackhole_write,
>>> +    .endianness = DEVICE_NATIVE_ENDIAN,
>>> +    .valid.min_access_size = 1,
>>> +    .valid.max_access_size = 4,
>>> +    .impl.min_access_size = 4,
>>> +    .impl.max_access_size = 4,
>>> +    .endianness = DEVICE_LITTLE_ENDIAN,
>>> +};
>>
>> (a) you've specified .endianness twice (with inconsistent initializers
>> at that, although the value returned by the accessor happens to be
>> unaffected).
> 
> Oops, cut+paste bug, will fix (/me wonders why gcc didn't throw an error
> on that one).

I think because it's valid C99. The "current object" concept is just
moved by the designation, and you can initialize the "current object".

>> - can the guest somehow use this facility to detect, dynamically, the
>> presence of this feature? (For now I'm thinking about a static build
>> flag for OVMF that would enable SMM support, but I'm 99% sure Jordan
>> will object and ask for a dynamic feature detection.)
> 
> Hmm.  I think if we merge all the smm / smram / tseg patches in one go
> this should work.  Without patches reading the ESMRAMC register returns
> zero.  With the patches the three cache-disable bits are hardcoded to
> '1'.  This should work for detecting tseg support.
> 
> You also have to test for smm support.  The current protocol for this is
> that seabios checks whenever smm is already initialized (see
> *_apmc_smm_setup() in seabios/src/fw/smm.c) and if so it skips smm
> initialization.  Right now qemu sets the bit on reset when running on
> kvm, so seabios doesn't try to use smm.  On tcg the bit is clear after
> reset and seabios actually uses smm mode.

Thanks. I'll stash the rest of this thread for later.

A "unified" patchset (not necessarily posted, just pushed somewhere)
would be helpful down the road -- IIRC this one was claimed unapplicable
to current master. (Which is also why I didn't try the "info mtree"
command, see elsewhere in the thread -- I didn't try to build the series.)

Thanks!
Laszlo
Gerd Hoffmann April 22, 2015, 6:07 a.m. UTC | #7
Hi,

> A "unified" patchset (not necessarily posted, just pushed somewhere)
> would be helpful down the road -- IIRC this one was claimed unapplicable
> to current master. (Which is also why I didn't try the "info mtree"
> command, see elsewhere in the thread -- I didn't try to build the series.)

https://www.kraxel.org/cgit/qemu/log/?h=rebase/smm-wip

HTH,
  Gerd
Gerd Hoffmann April 22, 2015, 8:09 a.m. UTC | #8
Hi,

> tseg is just normal ram (yes, located at the end of memory), but (once
> tseg is enabled) only cpus in smm mode are allowed to access it.
> Likewise busmaster dma access is rejected, so non-smm code can't use the
> sata controller to access this indirectly.

Update:  Seems tseg can be anywhere, there is a "tseg memory base"
register @ 0xac in pci config space.

Placing it at the end of memory is just what the bios is supposed to do
by default.  And it makes sense to place it there.

<quote>
This register contains the base address of TSEG DRAM memory. BIOS
determines the base of TSEG memory by subtracting the TSEG size (PCI
Device 0, offset 9Eh, bits 2:1) from graphics GTT stolen base (PCI
Device 0, offset A8h, bits 31:20).

Once D_LCK has been set, these bits becomes read only.
</quote>

"GTT stolen base" equals "top of below-4g memory" for us because we
emulate the chipset variant without graphics in qemu.

cheers,
  Gerd
Laszlo Ersek April 22, 2015, 8:52 a.m. UTC | #9
On 04/22/15 10:09, Gerd Hoffmann wrote:
>   Hi,
> 
>> tseg is just normal ram (yes, located at the end of memory), but (once
>> tseg is enabled) only cpus in smm mode are allowed to access it.
>> Likewise busmaster dma access is rejected, so non-smm code can't use the
>> sata controller to access this indirectly.
> 
> Update:  Seems tseg can be anywhere, there is a "tseg memory base"
> register @ 0xac in pci config space.
> 
> Placing it at the end of memory is just what the bios is supposed to do
> by default.  And it makes sense to place it there.
> 
> <quote>
> This register contains the base address of TSEG DRAM memory. BIOS
> determines the base of TSEG memory by subtracting the TSEG size (PCI
> Device 0, offset 9Eh, bits 2:1) from graphics GTT stolen base (PCI
> Device 0, offset A8h, bits 31:20).
> 
> Once D_LCK has been set, these bits becomes read only.
> </quote>
> 
> "GTT stolen base" equals "top of below-4g memory" for us because we
> emulate the chipset variant without graphics in qemu.

Thanks, that sounds good. So, as far as I understand, no changes to what
we've discussed thus far.

But, I have another question -- am I allowed to use "top of below-4g
memory" directly, as discussed earlier, or should I use the above PCI
registers? The tseg size will actually come from me (because I'll select
it), but the top I could take from "top of below-4g memory" (preferably,
see earlier), or reading the 0xA8 register.

Unless, of course 0xA8 won't be implemented at all, *because* "we
emulate the chipset variant without graphics in qemu." In other words,
if the 0xA8 register is dependent on the integrated graphics, then I
don't have a question. :)

Thanks!
Laszlo
Gerd Hoffmann April 22, 2015, 9:33 a.m. UTC | #10
Hi,

> Thanks, that sounds good. So, as far as I understand, no changes to what
> we've discussed thus far.
> 
> But, I have another question -- am I allowed to use "top of below-4g
> memory" directly, as discussed earlier, or should I use the above PCI
> registers? The tseg size will actually come from me (because I'll select
> it), but the top I could take from "top of below-4g memory" (preferably,
> see earlier), or reading the 0xA8 register.

I've figured there is one more register, with the funky name "Top of Low
Usable DRAM".  Which should hold our "top of below-4g memory" value.  I
guess I should read the whole mch spec from start to end once ...

But as I read the specs it is the job of the firmware to set all these
registers.  On physical hardware you can go into the firmware setup and
configure all sorts of stuff there, like the pci io window size,
graphics memory size etc.  And probably the firmware then goes setup all
those chipset registers for low memory size, gfx regions size, tseg etc.
according to the setup configuration.

I'd recommend to likewise just write the correct values into those
registers.  Right now it has no effect (other than the guest os seeing
realistic values when reading those registers).  But if the tseg memory
base register is implemented some day it surely is helpful if ovmf sets
it correctly ;)

cheers,
  Gerd
Laszlo Ersek April 22, 2015, 9:41 p.m. UTC | #11
On 04/21/15 17:04, Gerd Hoffmann wrote:
>>> +static const MemoryRegionOps tseg_blackhole_ops = {
>>> +    .read = tseg_blackhole_read,
>>> +    .write = tseg_blackhole_write,
>>> +    .endianness = DEVICE_NATIVE_ENDIAN,
>>> +    .valid.min_access_size = 1,
>>> +    .valid.max_access_size = 4,
>>> +    .impl.min_access_size = 4,
>>> +    .impl.max_access_size = 4,
>>> +    .endianness = DEVICE_LITTLE_ENDIAN,
>>> +};
>>
>> (a) you've specified .endianness twice (with inconsistent initializers
>> at that, although the value returned by the accessor happens to be
>> unaffected).
> 
> Oops, cut+paste bug, will fix (/me wonders why gcc didn't throw an error
> on that one).
> 
>> (b) Why are 8-byte accesses forbidden? ...
> 
> just a matter of setting .valid.max_access_size = 8, can do that of
> course.
> 
>>> +
>>>  /* PCIe MMCFG */
>>>  static void mch_update_pciexbar(MCHPCIState *mch)
>>>  {
>>> @@ -267,6 +289,7 @@ static void mch_update_smram(MCHPCIState *mch)
>>>  {
>>>      PCIDevice *pd = PCI_DEVICE(mch);
>>>      bool h_smrame = (pd->config[MCH_HOST_BRIDGE_ESMRAMC] & MCH_HOST_BRIDGE_ESMRAMC_H_SMRAME);
>>> +    uint32_t tseg_size;
>>>  
>>>      /* implement SMRAM.D_LCK */
>>>      if (pd->config[MCH_HOST_BRIDGE_SMRAM] & MCH_HOST_BRIDGE_SMRAM_D_LCK) {
>>> @@ -296,6 +319,32 @@ static void mch_update_smram(MCHPCIState *mch)
>>>          memory_region_set_enabled(&mch->high_smram, false);
>>>      }
>>>  
>>> +    if (pd->config[MCH_HOST_BRIDGE_ESMRAMC] & MCH_HOST_BRIDGE_ESMRAMC_T_EN) {
>>> +        switch (pd->config[MCH_HOST_BRIDGE_ESMRAMC] &
>>> +                MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_MASK) {
>>> +        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_1MB:
>>> +            tseg_size = 1024 * 1024;
>>> +            break;
>>> +        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_2MB:
>>> +            tseg_size = 1024 * 1024 * 2;
>>> +            break;
>>> +        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_8MB:
>>> +            tseg_size = 1024 * 1024 * 8;
>>> +            break;
>>> +        default:
>>> +            tseg_size = 0;
>>> +            break;
>>> +        }
>>> +    } else {
>>> +        tseg_size = 0;
>>> +    }
>>
>> - so, is the guest supposed to select the tseg size by writing this
>> register?
> 
> Correct.
> 
>> - I assume this register is not reprogrammable once SMRAM is locked --
>> is that correct?
> 
> Correct.
> 
>> - can the guest somehow use this facility to detect, dynamically, the
>> presence of this feature? (For now I'm thinking about a static build
>> flag for OVMF that would enable SMM support, but I'm 99% sure Jordan
>> will object and ask for a dynamic feature detection.)
> 
> Hmm.  I think if we merge all the smm / smram / tseg patches in one go
> this should work.  Without patches reading the ESMRAMC register returns
> zero.  With the patches the three cache-disable bits are hardcoded to
> '1'.  This should work for detecting tseg support.
> 
> You also have to test for smm support.  The current protocol for this is
> that seabios checks whenever smm is already initialized (see
> *_apmc_smm_setup() in seabios/src/fw/smm.c) and if so it skips smm
> initialization.  Right now qemu sets the bit on reset when running on
> kvm, so seabios doesn't try to use smm.  On tcg the bit is clear after
> reset and seabios actually uses smm mode.

I started looking into this. Basically the condition for SMM/SMRAM
support is:

  Q35 && SMRAM support && SMM support           (1)

The first subcondition can be checked via the host bridge devid (and
OVMF already does, for other purposes).

The second one relies on the ESMRAMC, which is in PCI config space.

The third one is messy. It relies on SMI_EN, which is an ioport at
PMBA+0x30. It requires a configured PMBA.

The problem for OVMF is the following: this condition is too complex /
too intrusive to evaluate in order to see whether Q35+SMM+SMRAM are
available.

For that reason, I'd like to ask if it would be possible to introduce a
new fw_cfg file that would simply communicate the end result of the
above expression.

Long version:

If the above condition evaluates to TRUE, we assume that the user wants
"security" -- ie. it wants to prevent a "malicious" runtime guest OS
form messing with OVMF's Secure Boot feature. Good.

However, at the moment, OVMF has a large number of holes that a
malicious runtime guest OS can exploit. The S3 LockBox and the varstore
pflash chip are most frequently mentioned, but there are more; in
particular I'm referring to the other special memory ranges OVMF uses.

(Please refer to the

  A comprehensive memory map of OVMF

section of the OVMF whitepaper
<http://people.redhat.com/~lersek/ovmf-whitepaper-c770f8c.txt> for the
following.  It describes in detail what special memory ranges OVMF has,
and the life cycle of each.)

These ranges are adequately protected against a *benign* guest runtime
OS; the UEFI memory map makes sure that such a runtime OS doesn't
accidentally overwrite stuff that OVMF will either *need* for S3 resume
in pristine form from the first cold boot, or will *overwrite* during S3
resume as scratch space (thereby potentially destroying OS data).

But a malicious guest OS can nonetheless overwrite things in these
special areas that OVMF will run (or use) during S3, before it locks
down SMRAM again.

Some of those holes / inappropriately protected memory ranges are tied
to the SEC phase, which is the earliest and least capable UEFI phase.
For example, refer to:

+--------------------------+ 9216 KB       PcdOvmfDxeMemFvBase
|                          |
|PEIFV from FVMAIN_COMPACT | size: 896 KB  PcdOvmfPeiMemFvSize
|  decompressed firmware   |
| volume with PEI modules  |
|                          |
+--------------------------+ 8320 KB       PcdOvmfPeiMemFvBase

"(6) PEIFV -- decompressed firmware volume with PEI modules".

OVMF is an unconventional UEFI firmware in the sense that it runs *only*
the SEC phase from (unmodifiable, read-only) flash. The PEI and DXE
firmware volumes are decompressed from flash to RAM by SEC, and then PEI
and DXE modules run from RAM.

(It is more conventional to run PEI modules from the flash as well, not
just SEC. The way OVMF does it allows it to save space in the pflash,
because PEI modules are thus allowed to exist only in compressed (as
opposed to directly executable) form in the pflash.)

In order to save some cycles, OVMF's SEC decompresses the PEI and DXE
firmware volumes (in one go) to RAM after a real reset only, and then
"protects" the decompressed PEI firmware volume in RAM via the UEFI
memory map (see above). Then, at S3 resume, SEC simply reuses the
already decompressed (and "protected") PEI firmware volume when handing
off to PEI.

Obviously, if a guest OS pokes some foreign code into this decompressed
PEI firmware volume before S3 suspend, then OVMF will run that foreign
code at S3 resume, with the SMRAM open.

Therefore OVMF has to forego such optimizations even as early as in SEC
when condition (1) evaluates to false -- it must decompress the firmware
volumes from pristine pflash over the possibly breached RAM even during
S3 resume. And for that SEC needs to evaluate (1).

This is where the complexity / intrusiveness of (1) becomes a problem.
Programming the PMBA in SEC just so I can read SMI_EN (for the sake of
evaluation (1)) is something I'd like to avoid. We already have a nice
fw_cfg client library that is usable in SEC; that would be my choice for
learning about condition (1).

I'll audit the rest of the special memory regions too, of course. In any
case, exposing this QEMU capability / configuration (see also Paolo's
-machine smm=on/off idea) via fw_cfg seems both appropriate for fw_cfg
(after all it does configure the firmware) and the least intrusive for OVMF.

What do you think?

Thanks!
Laszlo


>>> +    memory_region_set_enabled(&mch->tseg_blackhole, tseg_size);
>>> +    memory_region_set_size(&mch->tseg_blackhole, tseg_size);
>>> +    memory_region_add_subregion_overlap(mch->system_memory,
>>> +                                        mch->below_4g_mem_size - tseg_size,
>>> +                                        &mch->tseg_blackhole, 1);
>>
>> So, does tseg overlay the last few MB of the RAM below 4GB? (Ie. right
>> before the PCI hole starts?)
> 
> tseg is just normal ram (yes, located at the end of memory), but (once
> tseg is enabled) only cpus in smm mode are allowed to access it.
> Likewise busmaster dma access is rejected, so non-smm code can't use the
> sata controller to access this indirectly.
> 
>>> +    memory_region_init_io(&mch->tseg_blackhole, OBJECT(mch),
>>> +                          &tseg_blackhole_ops, NULL,
>>> +                          "tseg-blackhole", 0);
>>
>> Okay, this answers one of the above, tseg_blackhole is never uninitialized.
>>
>>> +    memory_region_set_enabled(&mch->tseg_blackhole, false);
>>> +    memory_region_add_subregion_overlap(mch->system_memory,
>>> +                                        mch->below_4g_mem_size,
>>> +                                        &mch->tseg_blackhole, 1);
>>
>> The address (2nd argument) is inconsistent with the one in
>> mch_update_smram() -- this function call places the tseg black hole at
>> the beginning of the 32-bit PCI hole, not just below it.
> 
> It's a zero-length hole here, therefore it actually is the same.  But I
> didn't bother to use "mch->below_4g_mem_size - 0" as start address to
> make that more clear ;)
> 
>> (a) place the permanent PEI RAM so that it ends at the top of
>> "below_4g_mem_size"; PublishPeiMemory()
>> (b) create one of the memory HOBs, QemuInitializeRam(),
>> (c) create the MMIO HOB that describes the 32-bit PCI hole,
>>     MemMapInitialization()
>>
>> If the last 1 / 2 / 8 MB of the low RAM can't be used as general purpose
>> RAM, then (a) is affected (it should simply be sunk by the tseg size of
>> choice); (b) is affected similarly (just decrease the top of the memory
>> HOB);
> 
> Yes.
> 
> cheers,
>   Gerd
> 
>
Laszlo Ersek April 22, 2015, 9:51 p.m. UTC | #12
another point:

On 04/22/15 23:41, Laszlo Ersek wrote:
> On 04/21/15 17:04, Gerd Hoffmann wrote:

>>> - can the guest somehow use this facility to detect, dynamically, the
>>> presence of this feature? (For now I'm thinking about a static build
>>> flag for OVMF that would enable SMM support, but I'm 99% sure Jordan
>>> will object and ask for a dynamic feature detection.)
>>
>> Hmm.  I think if we merge all the smm / smram / tseg patches in one go
>> this should work.  Without patches reading the ESMRAMC register returns
>> zero.  With the patches the three cache-disable bits are hardcoded to
>> '1'.  This should work for detecting tseg support.
>>
>> You also have to test for smm support.  The current protocol for this is
>> that seabios checks whenever smm is already initialized (see
>> *_apmc_smm_setup() in seabios/src/fw/smm.c) and if so it skips smm
>> initialization.  Right now qemu sets the bit on reset when running on
>> kvm, so seabios doesn't try to use smm.  On tcg the bit is clear after
>> reset and seabios actually uses smm mode.
> 
> I started looking into this. Basically the condition for SMM/SMRAM
> support is:
> 
>   Q35 && SMRAM support && SMM support           (1)
> 
> The first subcondition can be checked via the host bridge devid (and
> OVMF already does, for other purposes).
> 
> The second one relies on the ESMRAMC, which is in PCI config space.
> 
> The third one is messy. It relies on SMI_EN, which is an ioport at
> PMBA+0x30. It requires a configured PMBA.
> 
> The problem for OVMF is the following: this condition is too complex /
> too intrusive to evaluate in order to see whether Q35+SMM+SMRAM are
> available.
> 
> For that reason, I'd like to ask if it would be possible to introduce a
> new fw_cfg file that would simply communicate the end result of the
> above expression.

There's another problem with basing this decision in OVMF on
SMI_EN.APMC_EN: it is not an idempotent check. At some point the
firmware itself has to set SMI_EN.APMC_EN.

This is probably no issue for SeaBIOS, but in OVMF there's a bunch of
more-or-less independently dispatched modules that depend on condition
(1), and if I set SMI_EN.APMC_EN in some of those modules myself, then
code that ends up running later cannot reuse SMI_EN.APMC_EN for
determining (1). So I'd have to store the original result in some PCD,
but that itself doesn't guarantee ordering either, so it just becomes a
huge mess.

fw_cfg on the other hand is always available, and this fw_cfg file would
never change.

Thanks!
Laszlo
Gerd Hoffmann April 23, 2015, 7:02 a.m. UTC | #13
Hi,

> > The third one is messy. It relies on SMI_EN, which is an ioport at
> > PMBA+0x30. It requires a configured PMBA.

Isn't that initialized early anyway, because the pm timer lives there?
It's not a regular pci bar exactly to allow early init, so the full pci
bus scan + bar allocation done later will not mess up things.

[ just a question, could very well be that even when initialized early
  it isn't early enough because you need to know the memory layout
  before uncompressing the firmware modules ]

> There's another problem with basing this decision in OVMF on
> SMI_EN.APMC_EN: it is not an idempotent check. At some point the
> firmware itself has to set SMI_EN.APMC_EN.

Yep, you'll need some variable saying whenever smm is there or not and
check that.  Because of this.  Or (in case we are going for the fw_cfg
file) because you don't want dig into fw_cfg each time you need to know
this.

cheers,
  Gerd
Laszlo Ersek April 23, 2015, 7:41 a.m. UTC | #14
On 04/23/15 09:02, Gerd Hoffmann wrote:
>   Hi,
> 
>>> The third one is messy. It relies on SMI_EN, which is an ioport at
>>> PMBA+0x30. It requires a configured PMBA.
> 
> Isn't that initialized early anyway, because the pm timer lives there?

TimerLib (which is based in OVMF's case on the PM timer) is not needed /
used before PEI in the default case. It is used in SEC only when -D
SOURCE_DEBUG_ENABLE is passed for the build (which is "never" in practice).

> It's not a regular pci bar exactly to allow early init, so the full pci
> bus scan + bar allocation done later will not mess up things.
> 
> [ just a question, could very well be that even when initialized early
>   it isn't early enough because you need to know the memory layout
>   before uncompressing the firmware modules ]

I'm trying to avoid that; the decompression happens to a low fixed range.

>> There's another problem with basing this decision in OVMF on
>> SMI_EN.APMC_EN: it is not an idempotent check. At some point the
>> firmware itself has to set SMI_EN.APMC_EN.
> 
> Yep, you'll need some variable saying whenever smm is there or not and
> check that.  Because of this.  Or (in case we are going for the fw_cfg
> file) because you don't want dig into fw_cfg each time you need to know
> this.

The annoying limitation with SEC is that it cannot use normal C static
variables, nor PCDs. It is okay if SEC is a bit wasteful on fw_cfg (and
even in SEC I might be able to cache the result in a local variable and
pass it around). In PEI I can already use static variables (because OVMF
is unorthodox and runs PEI modules from RAM, not flash), plus I can set
PCDs (because in PEI the PCDs live in a dedicated HOB, and that HOB is
allocated from RAM (independently of OVMF)).

I'll ask Jordan too about the dynamic feature detection, because there's
another big hurdle with it (selecting a LockBox library instance at
runtime, dependent on the presence of SMM). Deciding about SMM support
at build time would make things hugely easier (because that's how
firmware for physical platforms is built as well).

Thanks
Laszlo
Laszlo Ersek April 23, 2015, 8:33 a.m. UTC | #15
On 04/23/15 09:41, Laszlo Ersek wrote:
> On 04/23/15 09:02, Gerd Hoffmann wrote:
>>   Hi,
>>
>>>> The third one is messy. It relies on SMI_EN, which is an ioport at
>>>> PMBA+0x30. It requires a configured PMBA.
>>
>> Isn't that initialized early anyway, because the pm timer lives there?
> 
> TimerLib (which is based in OVMF's case on the PM timer) is not needed /
> used before PEI in the default case. It is used in SEC only when -D
> SOURCE_DEBUG_ENABLE is passed for the build (which is "never" in practice).
> 
>> It's not a regular pci bar exactly to allow early init, so the full pci
>> bus scan + bar allocation done later will not mess up things.
>>
>> [ just a question, could very well be that even when initialized early
>>   it isn't early enough because you need to know the memory layout
>>   before uncompressing the firmware modules ]
> 
> I'm trying to avoid that; the decompression happens to a low fixed range.
> 
>>> There's another problem with basing this decision in OVMF on
>>> SMI_EN.APMC_EN: it is not an idempotent check. At some point the
>>> firmware itself has to set SMI_EN.APMC_EN.
>>
>> Yep, you'll need some variable saying whenever smm is there or not and
>> check that.  Because of this.  Or (in case we are going for the fw_cfg
>> file) because you don't want dig into fw_cfg each time you need to know
>> this.
> 
> The annoying limitation with SEC is that it cannot use normal C static
> variables, nor PCDs. It is okay if SEC is a bit wasteful on fw_cfg (and
> even in SEC I might be able to cache the result in a local variable and
> pass it around). In PEI I can already use static variables (because OVMF
> is unorthodox and runs PEI modules from RAM, not flash), plus I can set
> PCDs (because in PEI the PCDs live in a dedicated HOB, and that HOB is
> allocated from RAM (independently of OVMF)).
> 
> I'll ask Jordan too about the dynamic feature detection, because there's
> another big hurdle with it (selecting a LockBox library instance at
> runtime, dependent on the presence of SMM). Deciding about SMM support
> at build time would make things hugely easier (because that's how
> firmware for physical platforms is built as well).

Okay, looks like Jordan agrees with using a static build option.

This makes things much easier, and I think I won't need a new fw_cfg file.

I will certainly do a sanity check (based on the condition we discussed
eariler in this thread, using the PCI config and iospace registers).

It's going to be "somewhere" in the firmware (still thinking about the
best place), but this way I won't have to handle everything (fall back
to non-SMM behavior) if the sanity check fails; in that case I can just
stop.

Public IRC transcript follows.

Thanks!
Laszlo

* Loaded log from Wed Apr 22 21:45:19 2015

* Now talking on #edk2
* Topic for #edk2 is: EDK II / OVMF / UEFI development discussions |
  http://www.tianocore.org/edk2
* Topic for #edk2 set by
  jljusten!~jljusten@static-50-43-41-168.bvtn.or.frontiernet.net at Thu
  Feb  5 03:36:47 2015

<lersek>   jljusten, are you still online?
<lersek>   (I guess you are, from your recent email)
<jljusten> lersek: yeah. it's getting a bit late though. :)
<lersek>   jljusten, okay, just briefly then
<lersek>   I can send an email too if you prefer that
<lersek>   the question is dynamic detection of SMM/SMRAM
           support
<lersek>   it is quite messy
<lersek>   and has a large number of ties into things
<lersek>   several modules / module types are affected
<lersek>   but the worst question is how we'd select a
           LockBox library instance dynamically
<lersek>   dependent on the presence of the SMM/SMRAM
           capability in QEMU
<lersek>   one idea I had
<lersek>   is quite ugly
<lersek>   but could work
<jljusten> ah, yikes
<lersek>   (1) introduce a new PPI and a new protocol
<lersek>   with some GUID we generate
<lersek>   this PPI and protocol would abstract the LockBox
           library interface
<lersek>   the PPI for PEIMs
<lersek>   the protocol for DXE modules
<lersek>   (2) we'd build two such drivers
<lersek>   for each of the phases
<lersek>   one pair would back the PPI / protocol with the
           SMM-based (unprivileged) lockbox library instance
<lersek>   another pair would back the PPI / protocol with
           our current (insecure) lockbox library instance
<lersek>   (3) we'd build both pairs of drivers into OVMF
<lersek>   and their entry points would check for SMM/SMRAM
<lersek>   only one pair would remain active and install the
           PPI / protocol
<lersek>   (4) we'd introduce another LockBox library
           instance
<lersek>   that would depend on the PPI / protocol
<lersek>   on the depex level
<lersek>   (PPI depexes can also be stated for PEIMs)
<jljusten> wow. any chance this could be a build time
           option? :)
<lersek>   then all modules using this library instance
           would be delayed until after the corect driver
<lersek>   exactly!
<lersek>   so you can see where this is going
<lersek>   a huge mess
<lersek>   I actually *want* to make this a build time option
<lersek>   but I was worried you'd ask me to do it
           dynamically
<lersek>   basing it on -D SMM_ENABLE, statically
<lersek>   would make things hugely easier
<lersek>   not just for the LockBox lib instance
<lersek>   but a number of other things as well
<lersek>   so if you agree to make SMM/SMRAM support a
           static build time config option
<lersek>   that would be awesome
<jljusten> Whenever I think of SMM, I think, yeah that'd be
           cool to have a sample platform in EDK II, but it never
           sounds like something we'd want to enable by default.
<jljusten> Of course, that could change over time...
<lersek>   right
<lersek>   so let's agree on the -D SMM_ENABLE for now
<lersek>   thank you very much
<lersek>   another question quickly while we're at it
<lersek>   how would you prefer checks for this in the C
           code?
<lersek>   feature PCD?
<lersek>   controlled by the build flag?
<lersek>   I think that's usually the best method for stuff
           like this
<lersek>   it's better than conditional inclusion (ie.
           #ifdef etc)
<lersek>   because all code gets verified by the compiler
<lersek>   I'll also add some sanity checks of course
<jljusten> Yeah, I think so.
<lersek>   so that a -D SMM_ENABLE build rejects running
           early, if qemu doesn't actually provide the feature
<lersek>   jljusten, thank you
<jljusten> Any chance to get video running before then?
           Probably not critical. If they are using the SMM build, they
           should know they need a newer QEMU...
<lersek>   jljusten, hm, I think video is initialized quite
           late; it depends on a UEFI driver and is connected in BDS --
           presence of SMM/SMRAM communicates that the user wants
           things to be secure, so it affects even SEC and PEI.
<lersek>   If those phases expect SMM/SMRAM and it's not
           there, the best would be just to stop right there (after
           logging an error message)
<lersek>   anyway we can perhaps refine this in one of the
           versions of the patchset-to-be
<lersek>   the important thing is that I can now start out
           with a static config flag, knowing that I won't have to
           rewrite *that*. :)
<lersek>   thanks!
<jljusten> I think SMM is not initialized until DXE, but
           yeah, I somehow doubt video would work in that case.
<lersek>   jljusten, indeed, SMM *drivers* only come into
           the picture in DXE
<lersek>   but until then the SMRAM area (TSEG on Q35/MCH)
           needs to be removed from the system memory HOB we install in
           PlatformPei
<lersek>   so that no memory allocations are served from
           there
<lersek>   the TSEG area is the last 1, 2, or 8 MB just
           below the end of the low RAM (ie. top of RAM under 4GB)
<lersek>   we need to set that aside while in PEI
<lersek>   even the permanent PEI ram cannot be installed
           the way it is now
<lersek>   (last 64 MB under top of RAM below 4GB)
<lersek>   so this affects a number of things
<lersek>   I'll also have to audit all of the special memory
           ranges that OVMF uses (see the "comprehensive memory map"
           section in the whitepaper)
<lersek>   because those ranges are sensitive (SMRAM is open
           while SEC and PEI run) and we must not allow the OS to poke
           data directly into some of them
<lersek>   so for example
<lersek>   we can't let SEC just reuse the pre-decompressed
           PEIFV on S3 resume
<lersek>   if SMM/SMRAM is selected
<lersek>   because a malicious OS may have overwritten the
           PEIFV
<lersek>   and then if it suspends, then the user resumes
<lersek>   the firmware will run the OS-injected code as
           part of PEI, with the SMRAM open
<lersek>   etc
<lersek>   it's going to be hugely intrusive
<jljusten> Right. I think most real platforms run code from
           flash in that case.
<lersek>   exactly
<lersek>   so that's one area where our smartness (ie.
           running PEI from RAM, and only SEC from flash) is hurting us
           wrt. SMM a little bit
<jljusten> You ought to be able to run from the SMRAM region
           though.
<lersek>   considering the above
<lersek>   the first idea I've come up with
<lersek>   is to just decompress the firmware volumes again
<lersek>   on S3 resume
<lersek>   overwriting whatever a potentially malicious
           runtime guest OS may have poked into there
<lersek>   but
<lersek>   of course
<lersek>   given the way we compress things, for the sake of
           good compression ratio
<lersek>   that means we'd decompress not just PEIFV but
           DXEFV too
<lersek>   and for that we'll need to set aside much more RAM
<lersek>   even from a non-malicious OS
<lersek>   because decompressing DXEFV too during S3 resume
           must not overwrite OS data
<lersek>   so this would be an argument for reverting to a
           more traditional layout
<lersek>   where PEI is uncompressed and runs from flash
<lersek>   and DXEFV is separately decompressed
<lersek>   in any case,
<lersek>   let me just run with this "decompress them both
           on S3" initial idea
<lersek>   we can discuss things better when there are
           patches for illustration
<lersek>   it doesn't have to be perfect in v1
<jljusten> We could trust code in SMRAM right?
<lersek>   see the original S3 series :)
<lersek>   that's one idea as well
<lersek>   I did think of it
<lersek>   we could place PEI in SMRAM
<lersek>   there are at least two problems with it though
<lersek>   (1) it would require making SEC too smart
<lersek>   configuring the SMRAM control registers and stuff
<lersek>   but that's the smaller issue
<lersek>   the big issue is the S3 PEIM itself
<lersek>   (2) that one has builtin knowledge about SMRAM
<lersek>   and it explicitly uses the SMM ACCESS2 PPI
<lersek>   to open and close smram
<lersek>   obviously
<lersek>   if it is running from SMRAM
<lersek>   it doesn't help if it closes down SMRAM
           underneath itself. :)
<lersek>   anyway what we know now should be enough for
           prototyping
<lersek>   without committing to directions that would be
           certainly wrong in the future
<lersek>   I'll send out a transcript of this discussion in
           the qemu-devel thread
<lersek>   I'll CC you too, and then from the message ID
           you'll be able to find the entire thread on gmane
Gerd Hoffmann April 23, 2015, 8:34 a.m. UTC | #16
Hi,

> I'll ask Jordan too about the dynamic feature detection, because there's
> another big hurdle with it (selecting a LockBox library instance at
> runtime, dependent on the presence of SMM). Deciding about SMM support
> at build time would make things hugely easier (because that's how
> firmware for physical platforms is built as well).

piix/q35 falls into the same category I guess.  We are just lucky that
(a) alot of the chipset-specific initialization is not needed on qemu
and (b) we can hide the remaining differences in ovmf-specific modules.

cheers,
  Gerd
Laszlo Ersek April 23, 2015, 8:42 a.m. UTC | #17
On 04/23/15 10:34, Gerd Hoffmann wrote:
>   Hi,
> 
>> I'll ask Jordan too about the dynamic feature detection, because there's
>> another big hurdle with it (selecting a LockBox library instance at
>> runtime, dependent on the presence of SMM). Deciding about SMM support
>> at build time would make things hugely easier (because that's how
>> firmware for physical platforms is built as well).
> 
> piix/q35 falls into the same category I guess.  We are just lucky that
> (a) alot of the chipset-specific initialization is not needed on qemu
> and (b) we can hide the remaining differences in ovmf-specific modules.

That's precisely it.

Thanks
Laszlo
Paolo Bonzini April 23, 2015, 10:27 a.m. UTC | #18
On 23/04/2015 09:02, Gerd Hoffmann wrote:
>>> > > The third one is messy. It relies on SMI_EN, which is an ioport at
>>> > > PMBA+0x30. It requires a configured PMBA.
> Isn't that initialized early anyway, because the pm timer lives there?
> It's not a regular pci bar exactly to allow early init, so the full pci
> bus scan + bar allocation done later will not mess up things.
> 
> [ just a question, could very well be that even when initialized early
>   it isn't early enough because you need to know the memory layout
>   before uncompressing the firmware modules ]

The nice thing about TSEG is that it's just normal RAM until you lock
it.  So you can add a HOB very early that reserves that memory, and only
later decide whether to use "real" SMM, or just not lock TSEG and jump
into the SMM handler.  (This would be the "experimentation" setting you
were talking about earlier; it would also be used if TSEG is not available).

The fake path would also be used on PIIX.  This means you would _always_
use the SMM stack, even if it's not running in SMM, and you could e.g.
get rid of the PEI/DXE drivers for flash access.

>> > There's another problem with basing this decision in OVMF on
>> > SMI_EN.APMC_EN: it is not an idempotent check. At some point the
>> > firmware itself has to set SMI_EN.APMC_EN.
> Yep, you'll need some variable saying whenever smm is there or not and
> check that.  Because of this.  Or (in case we are going for the fw_cfg
> file) because you don't want dig into fw_cfg each time you need to know
> this.

You can assume that either all of TSEG/SMM/outb(0xb2)/GLB_SMI_EN are
available, or none is.  Detection can go like this:

1) if APMC_EN = 0, SMM is available but other features might not be.
Set APMC_EN and GLB_SMI_EN, set SMI_LOCK(*), then clear GLB_SMI_EN.  Now
APMC_EN is always 1, but a new QEMU won't let you clear GLB_SMI_EN.

	(*) SMI_LOCK is in the configuration space of
	    device 31 function 0 (byte 0xa0, bit 4)

2) is GLB_SMI_EN = 1?  Then SMM is available.

3) else, QEMU has set APMC_EN to communicate that SMIs are not
available, or SMIs are available (e.g. old QEMU with TCG) but you don't
have TSEG.  Proceed with fake SMM.

Would this work?  It is idempotent at least.  Even if the OS tries to
maliciously set APMC_EN to 0 (SMI_LOCK doesn't lock APMC_EN), the result
after step 1 is always going to be the same: APMC_EN=GLB_SMI_EN=1 on new
QEMU, APMC_EN=1/GLB_SMI_EN=0 on old QEMU.

Paolo
diff mbox

Patch

diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index f0d840c..412ff0a 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -198,6 +198,28 @@  static const TypeInfo q35_host_info = {
  * MCH D0:F0
  */
 
+static uint64_t tseg_blackhole_read(void *ptr, hwaddr reg, unsigned size)
+{
+    return 0xffffffff;
+}
+
+static void tseg_blackhole_write(void *opaque, hwaddr addr, uint64_t val,
+                                 unsigned width)
+{
+    /* nothing */
+}
+
+static const MemoryRegionOps tseg_blackhole_ops = {
+    .read = tseg_blackhole_read,
+    .write = tseg_blackhole_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .valid.min_access_size = 1,
+    .valid.max_access_size = 4,
+    .impl.min_access_size = 4,
+    .impl.max_access_size = 4,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
 /* PCIe MMCFG */
 static void mch_update_pciexbar(MCHPCIState *mch)
 {
@@ -267,6 +289,7 @@  static void mch_update_smram(MCHPCIState *mch)
 {
     PCIDevice *pd = PCI_DEVICE(mch);
     bool h_smrame = (pd->config[MCH_HOST_BRIDGE_ESMRAMC] & MCH_HOST_BRIDGE_ESMRAMC_H_SMRAME);
+    uint32_t tseg_size;
 
     /* implement SMRAM.D_LCK */
     if (pd->config[MCH_HOST_BRIDGE_SMRAM] & MCH_HOST_BRIDGE_SMRAM_D_LCK) {
@@ -296,6 +319,32 @@  static void mch_update_smram(MCHPCIState *mch)
         memory_region_set_enabled(&mch->high_smram, false);
     }
 
+    if (pd->config[MCH_HOST_BRIDGE_ESMRAMC] & MCH_HOST_BRIDGE_ESMRAMC_T_EN) {
+        switch (pd->config[MCH_HOST_BRIDGE_ESMRAMC] &
+                MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_MASK) {
+        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_1MB:
+            tseg_size = 1024 * 1024;
+            break;
+        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_2MB:
+            tseg_size = 1024 * 1024 * 2;
+            break;
+        case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_8MB:
+            tseg_size = 1024 * 1024 * 8;
+            break;
+        default:
+            tseg_size = 0;
+            break;
+        }
+    } else {
+        tseg_size = 0;
+    }
+    memory_region_del_subregion(mch->system_memory, &mch->tseg_blackhole);
+    memory_region_set_enabled(&mch->tseg_blackhole, tseg_size);
+    memory_region_set_size(&mch->tseg_blackhole, tseg_size);
+    memory_region_add_subregion_overlap(mch->system_memory,
+                                        mch->below_4g_mem_size - tseg_size,
+                                        &mch->tseg_blackhole, 1);
+
     memory_region_transaction_commit();
 }
 
@@ -443,6 +492,14 @@  static void mch_realize(PCIDevice *d, Error **errp)
     object_property_add_alias(qdev_get_machine(), "smram",
                               OBJECT(&mch->smram), ".", NULL);
 
+    memory_region_init_io(&mch->tseg_blackhole, OBJECT(mch),
+                          &tseg_blackhole_ops, NULL,
+                          "tseg-blackhole", 0);
+    memory_region_set_enabled(&mch->tseg_blackhole, false);
+    memory_region_add_subregion_overlap(mch->system_memory,
+                                        mch->below_4g_mem_size,
+                                        &mch->tseg_blackhole, 1);
+
     init_pam(DEVICE(mch), mch->ram_memory, mch->system_memory,
              mch->pci_address_space, &mch->pam_regions[0],
              PAM_BIOS_BASE, PAM_BIOS_SIZE);
diff --git a/include/hw/pci-host/q35.h b/include/hw/pci-host/q35.h
index 61bfe7e..ba64c70 100644
--- a/include/hw/pci-host/q35.h
+++ b/include/hw/pci-host/q35.h
@@ -55,6 +55,7 @@  typedef struct MCHPCIState {
     PAMMemoryRegion pam_regions[13];
     MemoryRegion smram_region, open_high_smram;
     MemoryRegion smram, low_smram, high_smram;
+    MemoryRegion tseg_blackhole;
     PcPciInfo pci_info;
     ram_addr_t below_4g_mem_size;
     ram_addr_t above_4g_mem_size;