diff mbox

[v10,03/10] piix: create host bridge to passthrough

Message ID 1436938670-7677-4-git-send-email-tiejun.chen@intel.com
State New
Headers show

Commit Message

Tiejun Chen July 15, 2015, 5:37 a.m. UTC
Implement a pci host bridge specific to passthrough. Actually
this just inherits the standard one. And we also just expose
a minimal real host bridge pci configuration subset.

Signed-off-by: Tiejun Chen <tiejun.chen@intel.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
---
v10:
 
* Nothing is changed.

v9:
 
* Just rebase on the latest.

 hw/pci-host/piix.c   | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/i386/pc.h |  2 ++
 2 files changed, 84 insertions(+)

Comments

Alex Williamson Feb. 9, 2016, 3:32 a.m. UTC | #1
On Wed, 15 Jul 2015 13:37:43 +0800
Tiejun Chen <tiejun.chen@intel.com> wrote:

> Implement a pci host bridge specific to passthrough. Actually
> this just inherits the standard one. And we also just expose
> a minimal real host bridge pci configuration subset.
> 
> Signed-off-by: Tiejun Chen <tiejun.chen@intel.com>
> Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> v10:
>  
> * Nothing is changed.
> 
> v9:
>  
> * Just rebase on the latest.
> 
>  hw/pci-host/piix.c   | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/i386/pc.h |  2 ++
>  2 files changed, 84 insertions(+)
> 
> diff --git a/hw/pci-host/piix.c b/hw/pci-host/piix.c
> index a203d93..7adf645 100644
> --- a/hw/pci-host/piix.c
> +++ b/hw/pci-host/piix.c
> @@ -734,6 +734,87 @@ static const TypeInfo i440fx_info = {
>      .class_init    = i440fx_class_init,
>  };
>  
> +/* IGD Passthrough Host Bridge. */
> +typedef struct {
> +    uint8_t offset;
> +    uint8_t len;
> +} IGDHostInfo;
> +
> +/* Here we just expose minimal host bridge offset subset. */
> +static const IGDHostInfo igd_host_bridge_infos[] = {
> +    {0x08, 2},  /* revision id */
> +    {0x2c, 2},  /* sybsystem vendor id */
> +    {0x2e, 2},  /* sybsystem id */
> +    {0x50, 2},  /* SNB: processor graphics control register */
> +    {0x52, 2},  /* processor graphics control register */
> +    {0xa4, 4},  /* SNB: graphics base of stolen memory */
> +    {0xa8, 4},  /* SNB: base of GTT stolen memory */
> +};

Hi,

I'm confused by these last 4 registers, could you please help me
understand them?

Offset 0x50 is the GMCH register for SandyBridge and newer processors,
that makes sense, but I suspect we really want to mask out the GMS
field to indicate the size of stolen memory is zero?  Is Xen providing
the VM with direct access to host stolen memory or does it have support
in the VM BIOS for matching the host address?

0x52 is unknown to me, it's listed as reserved for anything since
SandyBridge, does this date back to chipset-based graphics versus the
current processor-based graphics?

The comment on 0xa4 suggests it should be the BDSM regiser, but that's
at offset 0xb0 for everything since SandyBridge.  0xa4 is the second
DWORD of the Top Of Memory (TOM) register.

I'm guessing by the description of 0xa8 that it might be the BGSM
register, but that's at 0xb4.  0xa8 is the first DWORD of the Top Of
Upper Usable DRAM (TOUUD).  Neither TOM or TOUUD seem to make any sense
to expose to the VM, which really makes me wonder if they're actually
used.

I'm looking at volume 2 of the processor datasheets here:

http://www.intel.com/content/www/us/en/processors/core/core-technical-resources.html

Am I maybe looking in the wrong place?  Thanks for your time,

Alex
Alex Williamson Feb. 9, 2016, 5:43 p.m. UTC | #2
Tiejun's email bounced, so adding Allen for comment.  Host bridge
registers 52h, a4h, and a8h seem to be for versions of IGD prior to
SandyBridge or just wrong.  Even the GMCH (50h) doesn't seem to be
necessary for the VM host bridge, so I'm thinking about dropping all of
these for vfio-based IGD assignment.  Do we know which guest drivers
have specific dependencies on which host bridge and LPC bridge
registers?  It would even be nice to document the standard
registers for future maintainability.  Thanks,

Alex


On Mon, 8 Feb 2016 20:32:35 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Wed, 15 Jul 2015 13:37:43 +0800
> Tiejun Chen <tiejun.chen@intel.com> wrote:
> 
> > Implement a pci host bridge specific to passthrough. Actually
> > this just inherits the standard one. And we also just expose
> > a minimal real host bridge pci configuration subset.
> > 
> > Signed-off-by: Tiejun Chen <tiejun.chen@intel.com>
> > Acked-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> > v10:
> >  
> > * Nothing is changed.
> > 
> > v9:
> >  
> > * Just rebase on the latest.
> > 
> >  hw/pci-host/piix.c   | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/hw/i386/pc.h |  2 ++
> >  2 files changed, 84 insertions(+)
> > 
> > diff --git a/hw/pci-host/piix.c b/hw/pci-host/piix.c
> > index a203d93..7adf645 100644
> > --- a/hw/pci-host/piix.c
> > +++ b/hw/pci-host/piix.c
> > @@ -734,6 +734,87 @@ static const TypeInfo i440fx_info = {
> >      .class_init    = i440fx_class_init,
> >  };
> >  
> > +/* IGD Passthrough Host Bridge. */
> > +typedef struct {
> > +    uint8_t offset;
> > +    uint8_t len;
> > +} IGDHostInfo;
> > +
> > +/* Here we just expose minimal host bridge offset subset. */
> > +static const IGDHostInfo igd_host_bridge_infos[] = {
> > +    {0x08, 2},  /* revision id */
> > +    {0x2c, 2},  /* sybsystem vendor id */
> > +    {0x2e, 2},  /* sybsystem id */
> > +    {0x50, 2},  /* SNB: processor graphics control register */
> > +    {0x52, 2},  /* processor graphics control register */
> > +    {0xa4, 4},  /* SNB: graphics base of stolen memory */
> > +    {0xa8, 4},  /* SNB: base of GTT stolen memory */
> > +};  
> 
> Hi,
> 
> I'm confused by these last 4 registers, could you please help me
> understand them?
> 
> Offset 0x50 is the GMCH register for SandyBridge and newer processors,
> that makes sense, but I suspect we really want to mask out the GMS
> field to indicate the size of stolen memory is zero?  Is Xen providing
> the VM with direct access to host stolen memory or does it have support
> in the VM BIOS for matching the host address?
> 
> 0x52 is unknown to me, it's listed as reserved for anything since
> SandyBridge, does this date back to chipset-based graphics versus the
> current processor-based graphics?
> 
> The comment on 0xa4 suggests it should be the BDSM regiser, but that's
> at offset 0xb0 for everything since SandyBridge.  0xa4 is the second
> DWORD of the Top Of Memory (TOM) register.
> 
> I'm guessing by the description of 0xa8 that it might be the BGSM
> register, but that's at 0xb4.  0xa8 is the first DWORD of the Top Of
> Upper Usable DRAM (TOUUD).  Neither TOM or TOUUD seem to make any sense
> to expose to the VM, which really makes me wonder if they're actually
> used.
> 
> I'm looking at volume 2 of the processor datasheets here:
> 
> http://www.intel.com/content/www/us/en/processors/core/core-technical-resources.html
> 
> Am I maybe looking in the wrong place?  Thanks for your time,
> 
> Alex
>
Paolo Bonzini Feb. 9, 2016, 5:56 p.m. UTC | #3
On 09/02/2016 18:43, Alex Williamson wrote:
> Tiejun's email bounced, so adding Allen for comment.  Host bridge
> registers 52h, a4h, and a8h seem to be for versions of IGD prior to
> SandyBridge or just wrong.  Even the GMCH (50h) doesn't seem to be
> necessary for the VM host bridge, so I'm thinking about dropping all of
> these for vfio-based IGD assignment.  Do we know which guest drivers
> have specific dependencies on which host bridge and LPC bridge
> registers?  It would even be nice to document the standard
> registers for future maintainability.  Thanks,

I remember asking Tiejun for the same, but that was a year ago or more. :(

Paolo
Kay, Allen M Feb. 9, 2016, 7:47 p.m. UTC | #4
> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Tuesday, February 09, 2016 9:44 AM
> Cc: ehabkost@redhat.com; mst@redhat.com;
> stefano.stabellini@eu.citrix.com; qemu-devel@nongnu.org;
> pbonzini@redhat.com; rth@twiddle.net; Kay, Allen M
> <allen.m.kay@intel.com>
> Subject: Re: [Qemu-devel] [v10][PATCH 03/10] piix: create host bridge to
> passthrough
> 
> 
> Tiejun's email bounced, so adding Allen for comment.
>

Tiejun has left Intel.  Shuai Ruan is his replacement (cc'ed).

>  Host bridge registers
> 52h, a4h, and a8h seem to be for versions of IGD prior to SandyBridge or just
> wrong. Even the GMCH (50h) doesn't seem to be necessary for the VM host
> bridge, so I'm thinking about dropping all of these for vfio-based IGD
> assignment.

I think dropping those register access in GMCH should be fine.  52h/a4h/a8h were for Ironlake IGD.  50h is now mirrored in IGD.  They can be removed as BDW/SKL driver no longer access them.

>  Do we know which guest drivers have specific dependencies on
> which host bridge and LPC bridge registers?  It would even be nice to
> document the standard registers for future maintainability.  Thanks,
> 

BDW/SKL drivers no longer need to access those registers in GMCH, especially in Universal Passthrough Mode.  From my perspective,  I'm OK with limiting KVM IGD passthrough to SKL+ platforms so that we don't need to add those hacks for KVM IGD passthrough.

For older HW, there weren't anything documented.  We added those register accesses in Xen/QEMU as we brought up Ironlake/SNB platforms on Xen.

Allen

> 
> On Mon, 8 Feb 2016 20:32:35 -0700
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > On Wed, 15 Jul 2015 13:37:43 +0800
> > Tiejun Chen <tiejun.chen@intel.com> wrote:
> >
> > > Implement a pci host bridge specific to passthrough. Actually this
> > > just inherits the standard one. And we also just expose a minimal
> > > real host bridge pci configuration subset.
> > >
> > > Signed-off-by: Tiejun Chen <tiejun.chen@intel.com>
> > > Acked-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > > v10:
> > >
> > > * Nothing is changed.
> > >
> > > v9:
> > >
> > > * Just rebase on the latest.
> > >
> > >  hw/pci-host/piix.c   | 82
> ++++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  include/hw/i386/pc.h |  2 ++
> > >  2 files changed, 84 insertions(+)
> > >
> > > diff --git a/hw/pci-host/piix.c b/hw/pci-host/piix.c index
> > > a203d93..7adf645 100644
> > > --- a/hw/pci-host/piix.c
> > > +++ b/hw/pci-host/piix.c
> > > @@ -734,6 +734,87 @@ static const TypeInfo i440fx_info = {
> > >      .class_init    = i440fx_class_init,
> > >  };
> > >
> > > +/* IGD Passthrough Host Bridge. */
> > > +typedef struct {
> > > +    uint8_t offset;
> > > +    uint8_t len;
> > > +} IGDHostInfo;
> > > +
> > > +/* Here we just expose minimal host bridge offset subset. */ static
> > > +const IGDHostInfo igd_host_bridge_infos[] = {
> > > +    {0x08, 2},  /* revision id */
> > > +    {0x2c, 2},  /* sybsystem vendor id */
> > > +    {0x2e, 2},  /* sybsystem id */
> > > +    {0x50, 2},  /* SNB: processor graphics control register */
> > > +    {0x52, 2},  /* processor graphics control register */
> > > +    {0xa4, 4},  /* SNB: graphics base of stolen memory */
> > > +    {0xa8, 4},  /* SNB: base of GTT stolen memory */ };
> >
> > Hi,
> >
> > I'm confused by these last 4 registers, could you please help me
> > understand them?
> >
> > Offset 0x50 is the GMCH register for SandyBridge and newer processors,
> > that makes sense, but I suspect we really want to mask out the GMS
> > field to indicate the size of stolen memory is zero?  Is Xen providing
> > the VM with direct access to host stolen memory or does it have support
> > in the VM BIOS for matching the host address?
> >

Xen does provide 1:1 stolen memory mapping in the guest.  However, I do agree with you we should disable stolen memory in the guest by mask out GMS field.  This will avoid complications involving stolen memory/RMRR support.  The only features uses stolen memory are Frame Buffer Compression and PAVP content protection.  PAVP won't work in the guest anyways as it requires access to ME/HECI device.

> > 0x52 is unknown to me, it's listed as reserved for anything since
> > SandyBridge, does this date back to chipset-based graphics versus the
> > current processor-based graphics?
> >

I believe this is same as 0x50 but on old Ironlake platform.  Xen started IGD passthrough support in Ironlake.

> > The comment on 0xa4 suggests it should be the BDSM regiser, but that's
> > at offset 0xb0 for everything since SandyBridge.  0xa4 is the second
> > DWORD of the Top Of Memory (TOM) register.
> >
> > I'm guessing by the description of 0xa8 that it might be the BGSM
> > register, but that's at 0xb4.  0xa8 is the first DWORD of the Top Of
> > Upper Usable DRAM (TOUUD).  Neither TOM or TOUUD seem to make any
> sense
> > to expose to the VM, which really makes me wonder if they're actually
> > used.
> >

TOM (0xa0/0xa4) and TOUUD (0xa8) were used in Ironlake generation of Windows driver.  They are no longer needed in BDW/SKL drivers.

> > I'm looking at volume 2 of the processor datasheets here:
> >
> > http://www.intel.com/content/www/us/en/processors/core/core-
> technical-resources.html
> >
> > Am I maybe looking in the wrong place?  Thanks for your time,
> >
> > Alex
> >
Alex Williamson Feb. 9, 2016, 9:32 p.m. UTC | #5
On Tue, 9 Feb 2016 19:47:49 +0000
"Kay, Allen M" <allen.m.kay@intel.com> wrote:

> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Tuesday, February 09, 2016 9:44 AM
> > Cc: ehabkost@redhat.com; mst@redhat.com;
> > stefano.stabellini@eu.citrix.com; qemu-devel@nongnu.org;
> > pbonzini@redhat.com; rth@twiddle.net; Kay, Allen M
> > <allen.m.kay@intel.com>
> > Subject: Re: [Qemu-devel] [v10][PATCH 03/10] piix: create host bridge to
> > passthrough
> > 
> > 
> > Tiejun's email bounced, so adding Allen for comment.
> >  
> 
> Tiejun has left Intel.  Shuai Ruan is his replacement (cc'ed).
> 
> >  Host bridge registers
> > 52h, a4h, and a8h seem to be for versions of IGD prior to SandyBridge or just
> > wrong. Even the GMCH (50h) doesn't seem to be necessary for the VM host
> > bridge, so I'm thinking about dropping all of these for vfio-based IGD
> > assignment.  
> 
> I think dropping those register access in GMCH should be fine.  52h/a4h/a8h were for Ironlake IGD.  50h is now mirrored in IGD.  They can be removed as BDW/SKL driver no longer access them.
> 
> >  Do we know which guest drivers have specific dependencies on
> > which host bridge and LPC bridge registers?  It would even be nice to
> > document the standard registers for future maintainability.  Thanks,
> >   
> 
> BDW/SKL drivers no longer need to access those registers in GMCH, especially in Universal Passthrough Mode.  From my perspective,  I'm OK with limiting KVM IGD passthrough to SKL+ platforms so that we don't need to add those hacks for KVM IGD passthrough.
> 
> For older HW, there weren't anything documented.  We added those register accesses in Xen/QEMU as we brought up Ironlake/SNB platforms on Xen.

Hi Allen,

Thanks for the reply.  I'm totally onboard with you for BDW/SKL for
supported platforms, but I'd like to understand what the incremental
investment is for each back step within reason for older GPUs, at least
for best-effort, community support.  If we want to assume BDW/SKL and
Universal Passthrough Mode, then we could abandon the host bridge and
ISA bridge modifications altogether, right?  On the other hand, it's
not clear to me that UPT drivers are that pervasive and if we need to
enable some degree of host bridge/ISA bridge poke thrus, why not enable
a fair chunk of users, including me.

My IVB desktop seems to be working well (win10 + linux) with only
poking through the standard host bridge and ISA bridge
identifications.  Yes, I need to know about the different GMCH layout
of IVB vs BDW in the IGD device, but I've already taken care of that.
It seems like SNB is pretty similar to IVB (they run on the same
chipsets), but I haven't yet figured out the magic to make an SNB
laptop light up with IGD assignment, which would be useful to show that
OpRegion passthrough is actually doing something.

If we ignore IronLake and older GPUs, how many VM chipset hacks do we
need?  What combinations would require GMCH mirrored in the host
bridge, and couldn't we mirror it from the IGD device itself since it's
present in the same location all the way back through SNB.

> > On Mon, 8 Feb 2016 20:32:35 -0700
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> >   
> > > On Wed, 15 Jul 2015 13:37:43 +0800
> > > Tiejun Chen <tiejun.chen@intel.com> wrote:
> > > > +/* Here we just expose minimal host bridge offset subset. */ static
> > > > +const IGDHostInfo igd_host_bridge_infos[] = {
> > > > +    {0x08, 2},  /* revision id */
> > > > +    {0x2c, 2},  /* sybsystem vendor id */
> > > > +    {0x2e, 2},  /* sybsystem id */
> > > > +    {0x50, 2},  /* SNB: processor graphics control register */
> > > > +    {0x52, 2},  /* processor graphics control register */
> > > > +    {0xa4, 4},  /* SNB: graphics base of stolen memory */
> > > > +    {0xa8, 4},  /* SNB: base of GTT stolen memory */ };  
> > >
> > > Hi,
> > >
> > > I'm confused by these last 4 registers, could you please help me
> > > understand them?
> > >
> > > Offset 0x50 is the GMCH register for SandyBridge and newer processors,
> > > that makes sense, but I suspect we really want to mask out the GMS
> > > field to indicate the size of stolen memory is zero?  Is Xen providing
> > > the VM with direct access to host stolen memory or does it have support
> > > in the VM BIOS for matching the host address?
> > >  
> 
> Xen does provide 1:1 stolen memory mapping in the guest.  However, I do agree with you we should disable stolen memory in the guest by mask out GMS field.  This will avoid complications involving stolen memory/RMRR support.  The only features uses stolen memory are Frame Buffer Compression and PAVP content protection.  PAVP won't work in the guest anyways as it requires access to ME/HECI device.

We certainly don't want to get into the nastiness of RMRRs in a VM, but
do the stolen memory use cases you've outlined explain the DMAR faults
to that region that I was seeing just booting a VM with a Linux live
CD?  Again, it seems just as easy, if not easier to copy GMCH from the
IGD itself into the host bridge.
 
> > > 0x52 is unknown to me, it's listed as reserved for anything since
> > > SandyBridge, does this date back to chipset-based graphics versus the
> > > current processor-based graphics?
> > >  
> 
> I believe this is same as 0x50 but on old Ironlake platform.  Xen started IGD passthrough support in Ironlake.

Ok, unless anyone shouts really loudly and it doesn't affect anything
newer than IronLake, I'm happy to let those fall off the plate.  I don't
have any older systems that I care to make work with this.

> > > The comment on 0xa4 suggests it should be the BDSM regiser, but that's
> > > at offset 0xb0 for everything since SandyBridge.  0xa4 is the second
> > > DWORD of the Top Of Memory (TOM) register.
> > >
> > > I'm guessing by the description of 0xa8 that it might be the BGSM
> > > register, but that's at 0xb4.  0xa8 is the first DWORD of the Top Of
> > > Upper Usable DRAM (TOUUD).  Neither TOM or TOUUD seem to make any  
> > sense  
> > > to expose to the VM, which really makes me wonder if they're actually
> > > used.
> > >  
> 
> TOM (0xa0/0xa4) and TOUUD (0xa8) were used in Ironlake generation of Windows driver.  They are no longer needed in BDW/SKL drivers.

Ok, so the code comment is pretty misleading for these.  Would anything
since SNB need these?  What about the BDSM register in the host
bridge?  Is the Xen experience of not needing to copy it relevant since
stolen memory is identity mapped into the VM anyway?  Maybe Xen
achieves the same effect by not copying it and exposing it as zero.
Thanks,

Alex
Kay, Allen M Feb. 10, 2016, 3:40 a.m. UTC | #6
> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Tuesday, February 09, 2016 1:33 PM
> To: Kay, Allen M <allen.m.kay@intel.com>
> Cc: ehabkost@redhat.com; mst@redhat.com;
> stefano.stabellini@eu.citrix.com; qemu-devel@nongnu.org;
> pbonzini@redhat.com; rth@twiddle.net; Ruan, Shuai
> <shuai.ruan@intel.com>
> Subject: Re: [Qemu-devel] [v10][PATCH 03/10] piix: create host bridge to
> passthrough
> 
> Hi Allen,
> 
Hi Alex,

> Thanks for the reply.  I'm totally onboard with you for BDW/SKL for
> supported platforms, but I'd like to understand what the incremental
> investment is for each back step within reason for older GPUs, at least for
> best-effort, community support.  If we want to assume BDW/SKL and
> Universal Passthrough Mode, then we could abandon the host bridge and
> ISA bridge modifications altogether, right?

Right.

> On the other hand, it's not clear
> to me that UPT drivers are that pervasive and if we need to enable some
> degree of host bridge/ISA bridge poke thrus, why not enable a fair chunk of
> users, including me.
> 

Sounds good.

> My IVB desktop seems to be working well (win10 + linux) with only poking
> through the standard host bridge and ISA bridge identifications.  Yes, I need
> to know about the different GMCH layout of IVB vs BDW in the IGD device,
> but I've already taken care of that.

Other than VendorID/DeviceID, which other PCI config fields in host bridge and ISA bridge fields did you have to passthrough to get your IVB to work.  Are you passing IGD through as primary or secondary?

> It seems like SNB is pretty similar to IVB (they run on the same chipsets),

Agree.

> but I haven't yet fgured out the magic to make an SNB laptop light up with IGD
> assignment, which would be useful to show that OpRegion passthrough is
> actually doing something.
> 

There might be a difference in BDSM definition.  You might want to add 0xb0 in host bridge passthrough and try SNB again.

It is also possible the difference might be caused by difference in driver version.  If I google "sandybridge windows graphics driver" and "ivybridge windows graphics driver" I get the following:

SNB driver: https://downloadcenter.intel.com/product/81502/Intel-HD-Graphics-2000-for-2nd-Generation-Intel-Core-Processors
IVB driver: https://downloadcenter.intel.com/product/81501/Intel-HD-Graphics-2500-for-3rd-Generation-Intel-Core-Processors

SNB driver points to version 15.28 driver binary.  This driver supports SNB and ILK graphics.
IVB driver points to version 15.33 driver binary.  This driver supports IVB and SNB graphics.

One experiment you can try is to install same IVB 15.33 binary on both IVB and SNB systems.  To prevent Windows update messing with driver version, you should do the following:

1)  disable MSFT driver update: Computer->"Advance system settings"->Hardware->"Device Installation Settings"->"No, let me choose what to do"->"Never install driver software from Windows Update".
2) Disable Windows update to never check or download updates.
3) remove "c:\windows\system32\DriverStore\FileRepository\igd*"
4) re-install IGD driver to desired binary.
5) Double check driver versions of both IVB and SNB are the same after reboot guest.  Unfortunately, it won't said 15.33 but have a version number starts with 10.*.

> If we ignore IronLake and older GPUs, how many VM chipset hacks do we
> need?  What combinations would require GMCH mirrored in the host bridge,

Base on original code in Xen, you can try the following host bridge offset passthrough.  I have also attached pt-graphics.c from Xen/QEMU for your reference.

       case 0x00:        /* vendor id */
        case 0x02:        /* device id */
        case 0x08:        /* revision id */
        case 0x2c:        /* sybsystem vendor id */
        case 0x2e:        /* sybsystem id */
        case 0x50:        /* SNB: processor graphics control register */
        case 0x52:        /* processor graphics control register */
        case 0xa0:        /* top of memory */
        case 0xb0:        /* ILK: BSM: should read from dev 2 offset 0x5c */
        case 0x58:        /* SNB: PAVPC Offset */
        case 0xa4:        /* SNB: graphics base of stolen memory */
        case 0xa8:        /* SNB: base of GTT stolen memory */

> and couldn't we mirror it from the IGD device itself since it's present in the
> same location all the way back through SNB.
> 

Yes, you can copy content from MCCG in IGD device to same 0x50 offset in host bridge.

> We certainly don't want to get into the nastiness of RMRRs in a VM, but do
> the stolen memory use cases you've outlined explain the DMAR faults to that
> region that I was seeing just booting a VM with a Linux live CD?

What I described about stolen memory was based on Windows driver usage.  I will need to try Linux live CD to see DMAR faults you encountered.  I will try that after I finish setting up my environment.  I have been following your blog.  Great instructions!

> Again, it seems just as easy, if not easier to copy GMCH from the IGD itself into the
> host bridge.
> 
Not sure if I follow ... how does this would solve stolen memory/RMRR issue...?

> 
> Ok, unless anyone shouts really loudly and it doesn't affect anything newer
> than IronLake, I'm happy to let those fall off the plate.  I don't have any older
> systems that I care to make work with this.
> 
Sounds good.

> 
> Ok, so the code comment is pretty misleading for these.  Would anything
> since SNB need these?
>
We did not do much pruning pre-SNB.  Tiejun and I did checked HSW last year, most (maybe all) of these register are not needed.

> What about the BDSM register in the host bridge?
>  Is the Xen experience of not needing to copy it relevant since stolen memory is
> identity mapped into the VM anyway?  Maybe Xen achieves the same effect
> by not copying it and exposing it as zero.

In SNB, BDSM is at offset 0xb0 in host bridge.  Xen/QEMU passes thru 0xb0 in host bridge.  Although the comment says it is for ILK.  It is also true for SNB.  In current BDW/SKL driver, BDSM is located at 0x5c in IGD device.

Allen
/*
 * graphics passthrough
 */

#include "pass-through.h"
#include "pci.h"
#include "pci/header.h"
#include "pci/pci.h"

#include <unistd.h>
#include <sys/ioctl.h>
#include <assert.h>

extern int gfx_passthru;
extern int igd_passthru;

static uint32_t igd_guest_opregion = 0;

static int pch_map_irq(PCIDevice *pci_dev, int irq_num)
{
    PT_LOG("pch_map_irq called\n");
    return irq_num;
}

void intel_pch_init(PCIBus *bus)
{
    uint16_t vid, did;
    uint8_t  rid;
    struct pci_dev *pci_dev_1f;

    if ( !gfx_passthru )
        return;

    if ( !(pci_dev_1f=pt_pci_get_dev(0, 0x1f, 0)) )
    {
        PT_ERR("Error: Can't get pci_dev_host_bridge\n");
        abort();
    }

    vid = pt_pci_host_read(pci_dev_1f, PCI_VENDOR_ID, 2);
    did = pt_pci_host_read(pci_dev_1f, PCI_DEVICE_ID, 2);
    rid = pt_pci_host_read(pci_dev_1f, PCI_REVISION, 1);

    if (vid == PCI_VENDOR_ID_INTEL) {
        pci_isa_bridge_init(bus, PCI_DEVFN(0x1f, 0), vid, did, rid,
                            pch_map_irq, "intel_bridge_1f");

    }
}

uint32_t igd_read_opregion(struct pt_dev *pci_dev)
{
    uint32_t val = -1;

    if ( igd_guest_opregion == 0 )
        return -1;

    val = igd_guest_opregion;
#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
    PT_LOG_DEV((PCIDevice*)pci_dev, "addr=%x len=%x val=%x\n",
            PCI_INTEL_OPREGION, 4, val);
#endif
    return val;
}

void igd_write_opregion(struct pt_dev *real_dev, uint32_t val)
{
    uint32_t host_opregion = 0;
    int ret;

    if ( igd_guest_opregion )
    {
        PT_LOG("opregion register already been set, ignoring %x\n", val);
        return;
    }

    host_opregion = pt_pci_host_read(real_dev->pci_dev, PCI_INTEL_OPREGION, 4);
    igd_guest_opregion = (val & ~0xfff) | (host_opregion & 0xfff);
    PT_LOG("Map OpRegion: %x -> %x\n", host_opregion, igd_guest_opregion);

    ret = xc_domain_memory_mapping(xc_handle, domid,
            igd_guest_opregion >> XC_PAGE_SHIFT,
            host_opregion >> XC_PAGE_SHIFT,
            2,
            DPCI_ADD_MAPPING);

    if ( ret != 0 )
    {
        PT_LOG("Error: Can't map opregion\n");
        igd_guest_opregion = 0;
    }
#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
    PT_LOG_DEV((PCIDevice*)real_dev, "addr=%x len=%lx val=%x\n",
            PCI_INTEL_OPREGION, len, val);
#endif

}

void igd_pci_write(PCIDevice *pci_dev, uint32_t config_addr, uint32_t val, int len)
{
    struct pci_dev *pci_dev_host_bridge;
    assert(pci_dev->devfn == 0x00);
    if ( !igd_passthru )
        goto write_default;

    switch (config_addr)
    {
        case 0x58:        // PAVPC Offset
            break;
        default:
            goto write_default;
    }

    /* Host write */
    if ( !(pci_dev_host_bridge = pt_pci_get_dev(0, 0, 0)) )
    {
        PT_ERR("Error: Can't get pci_dev_host_bridge\n");
        abort();
    }

    pt_pci_host_write(pci_dev_host_bridge, config_addr, val, len);
#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
    PT_LOG_DEV(pci_dev, "addr=%x len=%x val=%x\n",
               config_addr, len, val);
#endif
    return;

write_default:
    pci_default_write_config(pci_dev, config_addr, val, len);
}

uint32_t igd_pci_read(PCIDevice *pci_dev, uint32_t config_addr, int len)
{
    struct pci_dev *pci_dev_host_bridge;
    uint32_t val;

    assert(pci_dev->devfn == 0x00);
    if ( !igd_passthru )
        goto read_default;

    switch (config_addr)
    {
        case 0x00:        /* vendor id */
        case 0x02:        /* device id */
        case 0x08:        /* revision id */
        case 0x2c:        /* sybsystem vendor id */
        case 0x2e:        /* sybsystem id */
        case 0x50:        /* SNB: processor graphics control register */
        case 0x52:        /* processor graphics control register */
        case 0xa0:        /* top of memory */
        case 0xb0:        /* ILK: BSM: should read from dev 2 offset 0x5c */
        case 0x58:        /* SNB: PAVPC Offset */
        case 0xa4:        /* SNB: graphics base of stolen memory */
        case 0xa8:        /* SNB: base of GTT stolen memory */
            break;
        default:
            goto read_default;
    }

    /* Host read */
    if ( !(pci_dev_host_bridge = pt_pci_get_dev(0, 0, 0)) )
    {
        PT_ERR("Error: Can't get pci_dev_host_bridge\n");
        abort();
    }

    val = pt_pci_host_read(pci_dev_host_bridge, config_addr, len);
#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
    PT_LOG_DEV(pci_dev, "addr=%x len=%x val=%x\n",
               config_addr, len, val);
#endif
    return val;
   
read_default:
   
   return pci_default_read_config(pci_dev, config_addr, len);
}

/*
 * register VGA resources for the domain with assigned gfx
 */
int register_vga_regions(struct pt_dev *real_device)
{
    u16 vendor_id;
    int ret = 0;

    if ( !gfx_passthru || real_device->pci_dev->device_class != 0x0300 )
        return ret;

    ret |= xc_domain_ioport_mapping(xc_handle, domid, 0x3B0,
            0x3B0, 0xC, DPCI_ADD_MAPPING);

    ret |= xc_domain_ioport_mapping(xc_handle, domid, 0x3C0,
            0x3C0, 0x20, DPCI_ADD_MAPPING);

    ret |= xc_domain_memory_mapping(xc_handle, domid,
            0xa0000 >> XC_PAGE_SHIFT,
            0xa0000 >> XC_PAGE_SHIFT,
            0x20,
            DPCI_ADD_MAPPING);

    if ( ret != 0 )
        PT_LOG("VGA region mapping failed\n");

    return ret;
}

/*
 * unregister VGA resources for the domain with assigned gfx
 */
int unregister_vga_regions(struct pt_dev *real_device)
{
    u32 vendor_id;
    int ret = 0;

    if ( !gfx_passthru || real_device->pci_dev->device_class != 0x0300 )
        return ret;

    ret |= xc_domain_ioport_mapping(xc_handle, domid, 0x3B0,
            0x3B0, 0xC, DPCI_REMOVE_MAPPING);

    ret |= xc_domain_ioport_mapping(xc_handle, domid, 0x3C0,
            0x3C0, 0x20, DPCI_REMOVE_MAPPING);

    ret |= xc_domain_memory_mapping(xc_handle, domid,
            0xa0000 >> XC_PAGE_SHIFT,
            0xa0000 >> XC_PAGE_SHIFT,
            20,
            DPCI_REMOVE_MAPPING);

    vendor_id = pt_pci_host_read(real_device->pci_dev, PCI_VENDOR_ID, 2);
    if ( (vendor_id == PCI_VENDOR_ID_INTEL) && igd_guest_opregion )
    {
        ret |= xc_domain_memory_mapping(xc_handle, domid,
                igd_guest_opregion >> XC_PAGE_SHIFT,
                igd_guest_opregion >> XC_PAGE_SHIFT,
                2,
                DPCI_REMOVE_MAPPING);
    }

    if ( ret != 0 )
        PT_LOG("VGA region unmapping failed\n");

    return ret;
}

static int get_vgabios(unsigned char *buf)
{
    int fd;
    uint32_t bios_size = 0;
    uint32_t start = 0xC0000;
    uint16_t magic = 0;

    if ( (fd = open("/dev/mem", O_RDONLY)) < 0 )
    {
        PT_LOG("Error: Can't open /dev/mem: %s\n", strerror(errno));
        return 0;
    }

    /*
     * Check if it a real bios extension.
     * The magic number is 0xAA55.
     */
    if ( start != lseek(fd, start, SEEK_SET) )
        goto out;
    if ( read(fd, &magic, 2) != 2 )
        goto out;
    if ( magic != 0xAA55 )
        goto out;

    /* Find the size of the rom extension */
    if ( start != lseek(fd, start, SEEK_SET) )
        goto out;
    if ( lseek(fd, 2, SEEK_CUR) != (start + 2) )
        goto out;
    if ( read(fd, &bios_size, 1) != 1 )
        goto out;

    /* This size is in 512 bytes */
    bios_size *= 512;

    /*
     * Set the file to the begining of the rombios,
     * to start the copy.
     */
    if ( start != lseek(fd, start, SEEK_SET) )
        goto out;

    if ( bios_size != read(fd, buf, bios_size))
        bios_size = 0;

out:
    close(fd);
    return bios_size;
}

int setup_vga_pt(struct pt_dev *real_device)
{
    unsigned char *bios = NULL;
    int bios_size = 0;
    char *c = NULL;
    char checksum = 0;
    int rc = 0;

    if ( !gfx_passthru || real_device->pci_dev->device_class != 0x0300 )
        return rc;

    /* Allocated 64K for the vga bios */
    if ( !(bios = malloc(64 * 1024)) )
        return -1;

    bios_size = get_vgabios(bios);
    if ( bios_size == 0 || bios_size > 64 * 1024)
    {
        PT_LOG("vga bios size (0x%x) is invalid!\n", bios_size);
        rc = -1;
        goto out;
    }

    /* Adjust the bios checksum */
    for ( c = (char*)bios; c < ((char*)bios + bios_size); c++ )
        checksum += *c;
    if ( checksum )
    {
        bios[bios_size - 1] -= checksum;
        PT_LOG("vga bios checksum is adjusted!\n");
    }

    cpu_physical_memory_rw(0xc0000, bios, bios_size, 1);
out:
    free(bios);
    return rc;
}
Alex Williamson Feb. 10, 2016, 6 a.m. UTC | #7
Hi Allen,

On Wed, 10 Feb 2016 03:40:54 +0000
"Kay, Allen M" <allen.m.kay@intel.com> wrote:

> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Tuesday, February 09, 2016 1:33 PM
> > To: Kay, Allen M <allen.m.kay@intel.com>
> > Cc: ehabkost@redhat.com; mst@redhat.com;
> > stefano.stabellini@eu.citrix.com; qemu-devel@nongnu.org;
> > pbonzini@redhat.com; rth@twiddle.net; Ruan, Shuai
> > <shuai.ruan@intel.com>
> > Subject: Re: [Qemu-devel] [v10][PATCH 03/10] piix: create host bridge to
> > passthrough
> > 
> > Hi Allen,
> >   
> Hi Alex,
> 
> > Thanks for the reply.  I'm totally onboard with you for BDW/SKL for
> > supported platforms, but I'd like to understand what the incremental
> > investment is for each back step within reason for older GPUs, at least for
> > best-effort, community support.  If we want to assume BDW/SKL and
> > Universal Passthrough Mode, then we could abandon the host bridge and
> > ISA bridge modifications altogether, right?  
> 
> Right.
> 
> > On the other hand, it's not clear
> > to me that UPT drivers are that pervasive and if we need to enable some
> > degree of host bridge/ISA bridge poke thrus, why not enable a fair chunk of
> > users, including me.
> >   
> 
> Sounds good.
> 
> > My IVB desktop seems to be working well (win10 + linux) with only poking
> > through the standard host bridge and ISA bridge identifications.  Yes, I need
> > to know about the different GMCH layout of IVB vs BDW in the IGD device,
> > but I've already taken care of that.  
> 
> Other than VendorID/DeviceID, which other PCI config fields in host bridge and ISA bridge fields did you have to passthrough to get your IVB to work.  Are you passing IGD through as primary or secondary?

The host bridge I stripped down to:

    {PCI_REVISION_ID,         2},
    {PCI_SUBSYSTEM_VENDOR_ID, 2},
    {PCI_SUBSYSTEM_ID,        2},

And used this for the ISA bridge:

    {PCI_VENDOR_ID,           2},
    {PCI_DEVICE_ID,           2},
    {PCI_REVISION_ID,         2},
    {PCI_SUBSYSTEM_VENDOR_ID, 2},
    {PCI_SUBSYSTEM_ID,        2},

So nothing to do with GMCH/BDSM seems to be necessary for the IVB
desktop system. I didn't try to trim further.

I can run that system as either primary or secondary.  On both systems
I'm using pci-stub to keep i915 from interfering on the host, on the
SNB laptop I also use video=efifb:off since it's configured for UEFI
boot, I'm not sure if that play any role in why it's not working.

I've tried adding a bunch more registers on the SNB system to see if it
helps, for the host bridge:

    {0x50,                    2},
    {0x52,                    2},
    {0xa4,                    4},
    {0xa8,                    4},

(ie. the registers Tiejun added with this patch)

And on the ISA bridge:

    {0x40,        4},
    {0x44,        4},
    {0x48,        4},
    {0x4c,        4},
    {0xf0,        4},

These were registers I saw accessed with tracing enabled, but nothing
seemed to change by adding them. I don't see any accesses to the BDSM
register on the host bridge, but I do see GMCH accesses.  Sort of
interesting is that the guest reads 201h from that register and tries
to write 203h. In the read case we have a 2MB GGMS and GGCLCK set, the
guest tries to set IVD (IGD VGA Disable). I'm not sure why it bothers
to try to do this with GGCLCK indicated since the register is locked,
but it is slightly troubling that the spec indicates that the BIOS must
*not* set IVD to zero (VGA enabled) if GMS pre-allocates no memory,
which is exactly what we're doing to try to indicate no stolen memory.
If we could actually disable VGA on IGD, that might be a nice option,
but I know there are issues with that (iirc, there's no disable for one
of either the VGA MMIO or I/O port space, so the only option is to
disable that address space at the PCI command register, which has
obvious implications - I think it was probably I/O port space).

I'm currently trying a Linux live CD on the guest for the SNB, mostly
because I can see driver error messages and look at the driver source,
neither of which are available to me for the Windows driver.  Both of
the driver warnings I get are from the driver thinking a certain
feature should be disabled but finds it enabled.  They're also sort of
on a strange drm release path, maybe due to being in secondary mode and
Xorg not being configured to use the IGD on this image.  In case it
looks familiar:

[   12.397908] ------------[ cut here ]------------
[   12.399175] WARNING: CPU: 0 PID: 1135 at drivers/gpu/drm/i915/intel_display.c:1259 assert_fdi_rx_pll+0x78/0xb0 [i915]()
[   12.401720] FDI RX PLL assertion failure (expected off, current on)
[   12.403239] Modules linked in: ebtable_broute bridge stp llc ebtable_filter ebtable_nat ebtables ip6table_security ip6table_mangle ip6table_raw ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_filter ip6_tables iptable_security iptable_mangle iptable_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iTCO_wdt gpio_ich iTCO_vendor_support ppdev joydev parport_pc lpc_ich i2c_piix4 parport acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace isofs squashfs i915 bochs_drm video ttm i2c_algo_bit drm_kms_helper drm ata_generic serio_raw pata_acpi scsi_dh_rdac scsi_dh_emc scsi_dh_alua sunrpc loop
[   12.417994] CPU: 0 PID: 1135 Comm: Xorg Not tainted 4.2.3-300.fc23.x86_64 #1
[   12.419733] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.0-80-g2390eb5 04/01/2014
[   12.422100]  0000000000000000 00000000d92c128c ffff880078f53ab8 ffffffff81771fca
[   12.424023]  0000000000000000 ffff880078f53b10 ffff880078f53af8 ffffffff8109e4a6
[   12.425949]  ffff880078f53ad8 0000000000000000 ffff88007f680000 ffff88007fb72000
[   12.427719] Call Trace:
[   12.428354]  [<ffffffff81771fca>] dump_stack+0x45/0x57
[   12.429580]  [<ffffffff8109e4a6>] warn_slowpath_common+0x86/0xc0
[   12.431037]  [<ffffffff8109e535>] warn_slowpath_fmt+0x55/0x70
[   12.432497]  [<ffffffffa01e5a98>] assert_fdi_rx_pll+0x78/0xb0 [i915]
[   12.434033]  [<ffffffffa0221433>] intel_pre_enable_lvds+0x53/0x180 [i915]
[   12.435760]  [<ffffffffa01ef729>] ironlake_crtc_enable+0x199/0xd30 [i915]
[   12.437433]  [<ffffffffa0190c80>] ? intel_runtime_pm_put+0x50/0x60 [i915]
[   12.439083]  [<ffffffffa0190d98>] ? intel_display_power_put+0x108/0x160 [i915]
[   12.440914]  [<ffffffffa01ecff6>] __intel_set_mode+0x916/0xb60 [i915]
[   12.442458]  [<ffffffffa01f3ee6>] intel_crtc_set_config+0x2b6/0x580 [i915]
[   12.444180]  [<ffffffffa0099326>] drm_mode_set_config_internal+0x66/0x100 [drm]
[   12.445917]  [<ffffffffa016d512>] restore_fbdev_mode+0xc2/0xf0 [drm_kms_helper]
[   12.447695]  [<ffffffffa016f399>] drm_fb_helper_restore_fbdev_mode_unlocked+0x29/0x70 [drm_kms_helper]
[   12.450116]  [<ffffffffa020350e>] intel_fbdev_restore_mode+0x1e/0x50 [i915]
[   12.451794]  [<ffffffffa022c96e>] i915_driver_lastclose+0xe/0x20 [i915]
[   12.453360]  [<ffffffffa008c8ee>] drm_lastclose+0x2e/0x140 [drm]
[   12.454755]  [<ffffffffa008ccaf>] drm_release+0x2af/0x490 [drm]
[   12.456324]  [<ffffffff8121fbfc>] __fput+0xdc/0x1e0
[   12.457525]  [<ffffffff8121fd4e>] ____fput+0xe/0x10
[   12.458736]  [<ffffffff810bab65>] task_work_run+0x85/0xb0
[   12.460028]  [<ffffffff81014a2d>] do_notify_resume+0x8d/0x90
[   12.461424]  [<ffffffff81778b7c>] int_signal+0x12/0x17
[   12.462757] ---[ end trace 3278f15671a6fa94 ]---
[   12.546650] ------------[ cut here ]------------
[   12.547765] WARNING: CPU: 0 PID: 1135 at drivers/gpu/drm/i915/intel_display.c:1466 assert_pch_transcoder_disabled+0x67/0x70 [i915]()
[   12.550847] transcoder assertion failed, should be off on pipe A but is still active
[   12.552858] Modules linked in: ebtable_broute bridge stp llc ebtable_filter ebtable_nat ebtables ip6table_security ip6table_mangle ip6table_raw ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_filter ip6_tables iptable_security iptable_mangle iptable_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iTCO_wdt gpio_ich iTCO_vendor_support ppdev joydev parport_pc lpc_ich i2c_piix4 parport acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace isofs squashfs i915 bochs_drm video ttm i2c_algo_bit drm_kms_helper drm ata_generic serio_raw pata_acpi scsi_dh_rdac scsi_dh_emc scsi_dh_alua sunrpc loop
[   12.568444] CPU: 0 PID: 1135 Comm: Xorg Tainted: G        W       4.2.3-300.fc23.x86_64 #1
[   12.570599] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.0-80-g2390eb5 04/01/2014
[   12.572988]  0000000000000000 00000000d92c128c ffff880078f53b08 ffffffff81771fca
[   12.574973]  0000000000000000 ffff880078f53b60 ffff880078f53b48 ffffffff8109e4a6
[   12.576826]  ffff880078f53b28 0000000000000000 ffff88007f680000 ffff88007fb72000
[   12.578804] Call Trace:
[   12.579454]  [<ffffffff81771fca>] dump_stack+0x45/0x57
[   12.580775]  [<ffffffff8109e4a6>] warn_slowpath_common+0x86/0xc0
[   12.582419]  [<ffffffff8109e535>] warn_slowpath_fmt+0x55/0x70
[   12.583924]  [<ffffffffa01e1067>] assert_pch_transcoder_disabled+0x67/0x70 [i915]
[   12.585834]  [<ffffffffa01efb7c>] ironlake_crtc_enable+0x5ec/0xd30 [i915]
[   12.587634]  [<ffffffffa0190c80>] ? intel_runtime_pm_put+0x50/0x60 [i915]
[   12.589402]  [<ffffffffa0190d98>] ? intel_display_power_put+0x108/0x160 [i915]
[   12.591294]  [<ffffffffa01ecff6>] __intel_set_mode+0x916/0xb60 [i915]
[   12.593019]  [<ffffffffa01f3ee6>] intel_crtc_set_config+0x2b6/0x580 [i915]
[   12.594797]  [<ffffffffa0099326>] drm_mode_set_config_internal+0x66/0x100 [drm]
[   12.596651]  [<ffffffffa016d512>] restore_fbdev_mode+0xc2/0xf0 [drm_kms_helper]
[   12.598574]  [<ffffffffa016f399>] drm_fb_helper_restore_fbdev_mode_unlocked+0x29/0x70 [drm_kms_helper]
[   12.601022]  [<ffffffffa020350e>] intel_fbdev_restore_mode+0x1e/0x50 [i915]
[   12.602888]  [<ffffffffa022c96e>] i915_driver_lastclose+0xe/0x20 [i915]
[   12.604576]  [<ffffffffa008c8ee>] drm_lastclose+0x2e/0x140 [drm]
[   12.606098]  [<ffffffffa008ccaf>] drm_release+0x2af/0x490 [drm]
[   12.607662]  [<ffffffff8121fbfc>] __fput+0xdc/0x1e0
[   12.609019]  [<ffffffff8121fd4e>] ____fput+0xe/0x10
[   12.610283]  [<ffffffff810bab65>] task_work_run+0x85/0xb0
[   12.611718]  [<ffffffff81014a2d>] do_notify_resume+0x8d/0x90
[   12.613225]  [<ffffffff81778b7c>] int_signal+0x12/0x17
[   12.614600] ---[ end trace 3278f15671a6fa95 ]---
[   12.635700] [drm:intel_set_pch_fifo_underrun_reporting [i915]] *ERROR* uncleared pch fifo underrun on pch transcoder A
[   12.636007] [drm:intel_pch_fifo_underrun_irq_handler [i915]] *ERROR* PCH transcoder A FIFO underrun
[   12.642883] [drm:intel_cpu_fifo_underrun_irq_handler [i915]] *ERROR* CPU pipe A FIFO underrun

Otherwise i915 seems fairly happy:

[    4.940225] i915: No ACPI video bus found
[    4.943223] [drm] Initialized i915 1.6.0 20150522 for 0000:00:03.0 on minor 1
[    5.000053] [drm] GMBUS [i915 gmbus vga] timed out, falling back to bit banging on pin 2
[    5.038055] i915 0000:00:03.0: fb1: inteldrmfb frame buffer device

The bit banging messages is apparently normal on this system (Lenovo
W520, Nvidia disabled in BIOS - could optimus still be getting in the
way?). I don't know that any of the above is actually fatal though, I
just know that I don't get a signal out of the LVDS panel or any of the
other outputs. However, the Xorg log file looks normal and if I disable
mmap support in vfio and move the mouse cursor around on the VNC window,
the trace log looks a lot like graphics is running ok, the panel is just
turned off. I'm thinking that makes the above warnings more relevant
since they're in the space of FDI and PCH transcoders, which I think is
how we get to a display output.

> > It seems like SNB is pretty similar to IVB (they run on the same chipsets),  
> 
> Agree.
> 
> > but I haven't yet fgured out the magic to make an SNB laptop light up with IGD
> > assignment, which would be useful to show that OpRegion passthrough is
> > actually doing something.
> >   
> 
> There might be a difference in BDSM definition.  You might want to add 0xb0 in host bridge passthrough and try SNB again.

I'm not seeing any accesses there in the tracing, so I don't think the
driver cares about it.

> It is also possible the difference might be caused by difference in driver version.  If I google "sandybridge windows graphics driver" and "ivybridge windows graphics driver" I get the following:
> 
> SNB driver: https://downloadcenter.intel.com/product/81502/Intel-HD-Graphics-2000-for-2nd-Generation-Intel-Core-Processors
> IVB driver: https://downloadcenter.intel.com/product/81501/Intel-HD-Graphics-2500-for-3rd-Generation-Intel-Core-Processors
> 
> SNB driver points to version 15.28 driver binary.  This driver supports SNB and ILK graphics.
> IVB driver points to version 15.33 driver binary.  This driver supports IVB and SNB graphics.
> 
> One experiment you can try is to install same IVB 15.33 binary on both IVB and SNB systems.  To prevent Windows update messing with driver version, you should do the following:
> 
> 1)  disable MSFT driver update: Computer->"Advance system settings"->Hardware->"Device Installation Settings"->"No, let me choose what to do"->"Never install driver software from Windows Update".
> 2) Disable Windows update to never check or download updates.
> 3) remove "c:\windows\system32\DriverStore\FileRepository\igd*"
> 4) re-install IGD driver to desired binary.
> 5) Double check driver versions of both IVB and SNB are the same after reboot guest.  Unfortunately, it won't said 15.33 but have a version number starts with 10.*.

There are certainly a lot of subtle differences between IVB and SNB in
the Linux driver, if I have time I'll see about setting up a Windows VM
on the SNB system.  Linux feels pretty close though if I could just get
the panel turned on.

> > If we ignore IronLake and older GPUs, how many VM chipset hacks do we
> > need?  What combinations would require GMCH mirrored in the host bridge,  
> 
> Base on original code in Xen, you can try the following host bridge offset passthrough.  I have also attached pt-graphics.c from Xen/QEMU for your reference.
> 
>        case 0x00:        /* vendor id */
>         case 0x02:        /* device id */
>         case 0x08:        /* revision id */
>         case 0x2c:        /* sybsystem vendor id */
>         case 0x2e:        /* sybsystem id */
>         case 0x50:        /* SNB: processor graphics control register */
>         case 0x52:        /* processor graphics control register */
>         case 0xa0:        /* top of memory */
>         case 0xb0:        /* ILK: BSM: should read from dev 2 offset 0x5c */
>         case 0x58:        /* SNB: PAVPC Offset */
>         case 0xa4:        /* SNB: graphics base of stolen memory */
>         case 0xa8:        /* SNB: base of GTT stolen memory */
> 
> > and couldn't we mirror it from the IGD device itself since it's present in the
> > same location all the way back through SNB.
> >   
> 
> Yes, you can copy content from MCCG in IGD device to same 0x50 offset in host bridge.
> 
> > We certainly don't want to get into the nastiness of RMRRs in a VM, but do
> > the stolen memory use cases you've outlined explain the DMAR faults to that
> > region that I was seeing just booting a VM with a Linux live CD?  
> 
> What I described about stolen memory was based on Windows driver usage.  I will need to try Linux live CD to see DMAR faults you encountered.  I will try that after I finish setting up my environment.  I have been following your blog.  Great instructions!

Good, glad you found it.  I'm just running a simple VM like this:

/home/alwillia/local/bin/qemu-system-x86_64 -m 2G -bios /home/alwillia/Work/seabios.git/out/bios.bin -net none -cdrom /net/store/exports/ISOs/Fedora-Live-Cinnamon-x86_64-23-10.iso -boot d -monitor stdio -device vfio-pci,host=00:02.0,rombar=0,x-no-mmap=on -vnc :1 -vga std -enable-kvm -trace events=events.txt

events.txt contains:
vfio_region_read
vfio_region_write
vfio_pci_read_config
vfio_pci_write_config
vfio_pci_igd*
pci_cfg_*

If you base on the last patch series I posted on top of Gerd's changes,
you'll need -M pc,igd_passthru=on I believe too, the code I'm working
on passes the host and ISA bridge config through vfio and initiates the
ISA bridge automatically.  I'll try to post it this week.

> > Again, it seems just as easy, if not easier to copy GMCH from the IGD itself into the
> > host bridge.
> >   
> Not sure if I follow ... how does this would solve stolen memory/RMRR issue...?

In the vfio code I have now, the kernel virtualizes GMCH and BDSM on the
device to provide a pre-sanitized version, so if we need those on the
host bridge, it's easier to copy what's on the IGD instead of adding yet
more code that knows about the different GMCH layouts.  The solution is
the same, clear BDSM and GMCH.GMS to make the guest not try to access
the host stolen memory.

> > Ok, unless anyone shouts really loudly and it doesn't affect anything newer
> > than IronLake, I'm happy to let those fall off the plate.  I don't have any older
> > systems that I care to make work with this.
> >   
> Sounds good.
> 
> > 
> > Ok, so the code comment is pretty misleading for these.  Would anything
> > since SNB need these?
> >  
> We did not do much pruning pre-SNB.  Tiejun and I did checked HSW last year, most (maybe all) of these register are not needed.
> 
> > What about the BDSM register in the host bridge?
> >  Is the Xen experience of not needing to copy it relevant since stolen memory is
> > identity mapped into the VM anyway?  Maybe Xen achieves the same effect
> > by not copying it and exposing it as zero.  
> 
> In SNB, BDSM is at offset 0xb0 in host bridge.  Xen/QEMU passes thru 0xb0 in host bridge.  Although the comment says it is for ILK.  It is also true for SNB.  In current BDW/SKL driver, BDSM is located at 0x5c in IGD device.

If I could spot any accesses to BDSM on the host bridge, I'd jump on
this, but my guest driver doesn't seem to care about it according to
the tracing.  Lighting up an output seems to be the issue.  Thanks,

Alex
Kay, Allen M Feb. 11, 2016, 2:25 a.m. UTC | #8
> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Tuesday, February 09, 2016 10:01 PM
> To: Kay, Allen M <allen.m.kay@intel.com>
> Cc: ehabkost@redhat.com; mst@redhat.com;
> stefano.stabellini@eu.citrix.com; qemu-devel@nongnu.org;
> pbonzini@redhat.com; rth@twiddle.net; Ruan, Shuai
> <shuai.ruan@intel.com>
> Subject: Re: [Qemu-devel] [v10][PATCH 03/10] piix: create host bridge to
> passthrough
> 
> I can run that system as either primary or secondary.  On both systems I'm
> using pci-stub to keep i915 from interfering on the host, on the SNB laptop I
> also use video=efifb:off since it's configured for UEFI boot, I'm not sure if
> that play any role in why it's not working.
> 

Hi Alex,

My understand if that your IVB system is a desktop running legacy mode BIOS and your SNB is a laptop running EFI mode BIOS, correct?  I'm curious how does your SNB system getting hold of VBT table, which is needed for lighting up local display.

There are two ways the driver can get VBT table:  1) VGA 0xc0000 memory,  2) OpRegion

On your IVB system, the guest driver would try to get VBT from 0xc0000 memory if IGD is configured as primary controller in the guest, assuming KVM/QEMU copies VGA 0xc0000 memory from host to guest 0xc0000 area.   If IGD is configured as secondary controller, the driver would get VBT from OpRegion.  Given IGD works in both configurations on IVB, this mean guest driver can successfully read VBT from both 0xc0000 area and OpRegion.

On your SNB system that is running in EFI mode on the host, 0xc0000 memory does not contain VBIOS/VBT.  This means if you configure IGD as primary controller in the guest, the driver cannot get to VBT at 0xc0000 memory even if KVM/QEMU copies 0xc0000 memory from host to guest.  If you configure IGD as secondary controller in the guest, and guest driver should be able to read VBT from the OpRegion.  You might want to trace i915 to see how does the  guest driver get to VBT via OpRegion in this configuration.

Another potential problem to watch out for with laptop vs. desktop has to do FLR timeout.  If you are working with a laptop with LCD panel attached (the usual case), FLR will take much longer than 100ms to finish.  The reason is the devices needs to power down the LCD panel before it can finish FLR. I have seen it takes more than 500ms for FLR to finish.  As a work around, I have increase FLR time out in Linux PCI driver to 1000ms when working with LCD panels.   Given that you have seen evidences that IGD HW is working, this might not be your issue.   I would focus tracing how does i915 get VBT from the OpRegion when IGD is configured as secondary controller in the guest.

Allen


> I've tried adding a bunch more registers on the SNB system to see if it helps,
> for the host bridge:
> 
>     {0x50,                    2},
>     {0x52,                    2},
>     {0xa4,                    4},
>     {0xa8,                    4},
> 
> (ie. the registers Tiejun added with this patch)
> 
> And on the ISA bridge:
> 
>     {0x40,        4},
>     {0x44,        4},
>     {0x48,        4},
>     {0x4c,        4},
>     {0xf0,        4},
> 
> These were registers I saw accessed with tracing enabled, but nothing
> seemed to change by adding them. I don't see any accesses to the BDSM
> register on the host bridge, but I do see GMCH accesses.  Sort of interesting
> is that the guest reads 201h from that register and tries to write 203h. In the
> read case we have a 2MB GGMS and GGCLCK set, the guest tries to set IVD
> (IGD VGA Disable). I'm not sure why it bothers to try to do this with GGCLCK
> indicated since the register is locked, but it is slightly troubling that the spec
> indicates that the BIOS must
> *not* set IVD to zero (VGA enabled) if GMS pre-allocates no memory, which
> is exactly what we're doing to try to indicate no stolen memory.
> If we could actually disable VGA on IGD, that might be a nice option, but I
> know there are issues with that (iirc, there's no disable for one of either the
> VGA MMIO or I/O port space, so the only option is to disable that address
> space at the PCI command register, which has obvious implications - I think it
> was probably I/O port space).
> 
> I'm currently trying a Linux live CD on the guest for the SNB, mostly because I
> can see driver error messages and look at the driver source, neither of which
> are available to me for the Windows driver.  Both of the driver warnings I get
> are from the driver thinking a certain feature should be disabled but finds it
> enabled.  They're also sort of on a strange drm release path, maybe due to
> being in secondary mode and Xorg not being configured to use the IGD on
> this image.  In case it looks familiar:
> 
> [   12.397908] ------------[ cut here ]------------
> [   12.399175] WARNING: CPU: 0 PID: 1135 at
> drivers/gpu/drm/i915/intel_display.c:1259 assert_fdi_rx_pll+0x78/0xb0
> [i915]()
> [   12.401720] FDI RX PLL assertion failure (expected off, current on)
> [   12.403239] Modules linked in: ebtable_broute bridge stp llc ebtable_filter
> ebtable_nat ebtables ip6table_security ip6table_mangle ip6table_raw
> ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_filter
> ip6_tables iptable_security iptable_mangle iptable_raw iptable_nat
> nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack
> iTCO_wdt gpio_ich iTCO_vendor_support ppdev joydev parport_pc lpc_ich
> i2c_piix4 parport acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace isofs
> squashfs i915 bochs_drm video ttm i2c_algo_bit drm_kms_helper drm
> ata_generic serio_raw pata_acpi scsi_dh_rdac scsi_dh_emc scsi_dh_alua
> sunrpc loop
> [   12.417994] CPU: 0 PID: 1135 Comm: Xorg Not tainted 4.2.3-300.fc23.x86_64
> #1
> [   12.419733] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> rel-1.9.0-80-g2390eb5 04/01/2014
> [   12.422100]  0000000000000000 00000000d92c128c ffff880078f53ab8
> ffffffff81771fca
> [   12.424023]  0000000000000000 ffff880078f53b10 ffff880078f53af8
> ffffffff8109e4a6
> [   12.425949]  ffff880078f53ad8 0000000000000000 ffff88007f680000
> ffff88007fb72000
> [   12.427719] Call Trace:
> [   12.428354]  [<ffffffff81771fca>] dump_stack+0x45/0x57
> [   12.429580]  [<ffffffff8109e4a6>] warn_slowpath_common+0x86/0xc0
> [   12.431037]  [<ffffffff8109e535>] warn_slowpath_fmt+0x55/0x70
> [   12.432497]  [<ffffffffa01e5a98>] assert_fdi_rx_pll+0x78/0xb0 [i915]
> [   12.434033]  [<ffffffffa0221433>] intel_pre_enable_lvds+0x53/0x180 [i915]
> [   12.435760]  [<ffffffffa01ef729>] ironlake_crtc_enable+0x199/0xd30 [i915]
> [   12.437433]  [<ffffffffa0190c80>] ? intel_runtime_pm_put+0x50/0x60 [i915]
> [   12.439083]  [<ffffffffa0190d98>] ? intel_display_power_put+0x108/0x160
> [i915]
> [   12.440914]  [<ffffffffa01ecff6>] __intel_set_mode+0x916/0xb60 [i915]
> [   12.442458]  [<ffffffffa01f3ee6>] intel_crtc_set_config+0x2b6/0x580 [i915]
> [   12.444180]  [<ffffffffa0099326>]
> drm_mode_set_config_internal+0x66/0x100 [drm]
> [   12.445917]  [<ffffffffa016d512>] restore_fbdev_mode+0xc2/0xf0
> [drm_kms_helper]
> [   12.447695]  [<ffffffffa016f399>]
> drm_fb_helper_restore_fbdev_mode_unlocked+0x29/0x70
> [drm_kms_helper]
> [   12.450116]  [<ffffffffa020350e>] intel_fbdev_restore_mode+0x1e/0x50
> [i915]
> [   12.451794]  [<ffffffffa022c96e>] i915_driver_lastclose+0xe/0x20 [i915]
> [   12.453360]  [<ffffffffa008c8ee>] drm_lastclose+0x2e/0x140 [drm]
> [   12.454755]  [<ffffffffa008ccaf>] drm_release+0x2af/0x490 [drm]
> [   12.456324]  [<ffffffff8121fbfc>] __fput+0xdc/0x1e0
> [   12.457525]  [<ffffffff8121fd4e>] ____fput+0xe/0x10
> [   12.458736]  [<ffffffff810bab65>] task_work_run+0x85/0xb0
> [   12.460028]  [<ffffffff81014a2d>] do_notify_resume+0x8d/0x90
> [   12.461424]  [<ffffffff81778b7c>] int_signal+0x12/0x17
> [   12.462757] ---[ end trace 3278f15671a6fa94 ]---
> [   12.546650] ------------[ cut here ]------------
> [   12.547765] WARNING: CPU: 0 PID: 1135 at
> drivers/gpu/drm/i915/intel_display.c:1466
> assert_pch_transcoder_disabled+0x67/0x70 [i915]()
> [   12.550847] transcoder assertion failed, should be off on pipe A but is still
> active
> [   12.552858] Modules linked in: ebtable_broute bridge stp llc ebtable_filter
> ebtable_nat ebtables ip6table_security ip6table_mangle ip6table_raw
> ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_filter
> ip6_tables iptable_security iptable_mangle iptable_raw iptable_nat
> nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack
> iTCO_wdt gpio_ich iTCO_vendor_support ppdev joydev parport_pc lpc_ich
> i2c_piix4 parport acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace isofs
> squashfs i915 bochs_drm video ttm i2c_algo_bit drm_kms_helper drm
> ata_generic serio_raw pata_acpi scsi_dh_rdac scsi_dh_emc scsi_dh_alua
> sunrpc loop
> [   12.568444] CPU: 0 PID: 1135 Comm: Xorg Tainted: G        W       4.2.3-
> 300.fc23.x86_64 #1
> [   12.570599] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> rel-1.9.0-80-g2390eb5 04/01/2014
> [   12.572988]  0000000000000000 00000000d92c128c ffff880078f53b08
> ffffffff81771fca
> [   12.574973]  0000000000000000 ffff880078f53b60 ffff880078f53b48
> ffffffff8109e4a6
> [   12.576826]  ffff880078f53b28 0000000000000000 ffff88007f680000
> ffff88007fb72000
> [   12.578804] Call Trace:
> [   12.579454]  [<ffffffff81771fca>] dump_stack+0x45/0x57
> [   12.580775]  [<ffffffff8109e4a6>] warn_slowpath_common+0x86/0xc0
> [   12.582419]  [<ffffffff8109e535>] warn_slowpath_fmt+0x55/0x70
> [   12.583924]  [<ffffffffa01e1067>]
> assert_pch_transcoder_disabled+0x67/0x70 [i915]
> [   12.585834]  [<ffffffffa01efb7c>] ironlake_crtc_enable+0x5ec/0xd30 [i915]
> [   12.587634]  [<ffffffffa0190c80>] ? intel_runtime_pm_put+0x50/0x60 [i915]
> [   12.589402]  [<ffffffffa0190d98>] ? intel_display_power_put+0x108/0x160
> [i915]
> [   12.591294]  [<ffffffffa01ecff6>] __intel_set_mode+0x916/0xb60 [i915]
> [   12.593019]  [<ffffffffa01f3ee6>] intel_crtc_set_config+0x2b6/0x580 [i915]
> [   12.594797]  [<ffffffffa0099326>]
> drm_mode_set_config_internal+0x66/0x100 [drm]
> [   12.596651]  [<ffffffffa016d512>] restore_fbdev_mode+0xc2/0xf0
> [drm_kms_helper]
> [   12.598574]  [<ffffffffa016f399>]
> drm_fb_helper_restore_fbdev_mode_unlocked+0x29/0x70
> [drm_kms_helper]
> [   12.601022]  [<ffffffffa020350e>] intel_fbdev_restore_mode+0x1e/0x50
> [i915]
> [   12.602888]  [<ffffffffa022c96e>] i915_driver_lastclose+0xe/0x20 [i915]
> [   12.604576]  [<ffffffffa008c8ee>] drm_lastclose+0x2e/0x140 [drm]
> [   12.606098]  [<ffffffffa008ccaf>] drm_release+0x2af/0x490 [drm]
> [   12.607662]  [<ffffffff8121fbfc>] __fput+0xdc/0x1e0
> [   12.609019]  [<ffffffff8121fd4e>] ____fput+0xe/0x10
> [   12.610283]  [<ffffffff810bab65>] task_work_run+0x85/0xb0
> [   12.611718]  [<ffffffff81014a2d>] do_notify_resume+0x8d/0x90
> [   12.613225]  [<ffffffff81778b7c>] int_signal+0x12/0x17
> [   12.614600] ---[ end trace 3278f15671a6fa95 ]---
> [   12.635700] [drm:intel_set_pch_fifo_underrun_reporting [i915]] *ERROR*
> uncleared pch fifo underrun on pch transcoder A
> [   12.636007] [drm:intel_pch_fifo_underrun_irq_handler [i915]] *ERROR*
> PCH transcoder A FIFO underrun
> [   12.642883] [drm:intel_cpu_fifo_underrun_irq_handler [i915]] *ERROR*
> CPU pipe A FIFO underrun
> 
> Otherwise i915 seems fairly happy:
> 
> [    4.940225] i915: No ACPI video bus found
> [    4.943223] [drm] Initialized i915 1.6.0 20150522 for 0000:00:03.0 on minor 1
> [    5.000053] [drm] GMBUS [i915 gmbus vga] timed out, falling back to bit
> banging on pin 2
> [    5.038055] i915 0000:00:03.0: fb1: inteldrmfb frame buffer device
> 
> The bit banging messages is apparently normal on this system (Lenovo W520,
> Nvidia disabled in BIOS - could optimus still be getting in the way?). I don't
> know that any of the above is actually fatal though, I just know that I don't
> get a signal out of the LVDS panel or any of the other outputs. However, the
> Xorg log file looks normal and if I disable mmap support in vfio and move the
> mouse cursor around on the VNC window, the trace log looks a lot like
> graphics is running ok, the panel is just turned off. I'm thinking that makes
> the above warnings more relevant since they're in the space of FDI and PCH
> transcoders, which I think is how we get to a display output.
> 
> 
> There are certainly a lot of subtle differences between IVB and SNB in the
> Linux driver, if I have time I'll see about setting up a Windows VM on the SNB
> system.  Linux feels pretty close though if I could just get the panel turned
> on.
> 
> 
> Good, glad you found it.  I'm just running a simple VM like this:
> 
> /home/alwillia/local/bin/qemu-system-x86_64 -m 2G -bios
> /home/alwillia/Work/seabios.git/out/bios.bin -net none -cdrom
> /net/store/exports/ISOs/Fedora-Live-Cinnamon-x86_64-23-10.iso -boot d -
> monitor stdio -device vfio-pci,host=00:02.0,rombar=0,x-no-mmap=on -vnc :1
> -vga std -enable-kvm -trace events=events.txt
> 
> events.txt contains:
> vfio_region_read
> vfio_region_write
> vfio_pci_read_config
> vfio_pci_write_config
> vfio_pci_igd*
> pci_cfg_*
> 
> If you base on the last patch series I posted on top of Gerd's changes, you'll
> need -M pc,igd_passthru=on I believe too, the code I'm working on passes
> the host and ISA bridge config through vfio and initiates the ISA bridge
> automatically.  I'll try to post it this week.
> 
> > > Again, it seems just as easy, if not easier to copy GMCH from the
> > > IGD itself into the host bridge.
> > >
> > Not sure if I follow ... how does this would solve stolen memory/RMRR
> issue...?
> 
> In the vfio code I have now, the kernel virtualizes GMCH and BDSM on the
> device to provide a pre-sanitized version, so if we need those on the host
> bridge, it's easier to copy what's on the IGD instead of adding yet more code
> that knows about the different GMCH layouts.  The solution is the same,
> clear BDSM and GMCH.GMS to make the guest not try to access the host
> stolen memory.
> 
.
> 
> If I could spot any accesses to BDSM on the host bridge, I'd jump on this, but
> my guest driver doesn't seem to care about it according to the tracing.
> Lighting up an output seems to be the issue.  Thanks,
> 
> Alex
Alex Williamson Feb. 11, 2016, 6:07 a.m. UTC | #9
On Thu, 11 Feb 2016 02:25:51 +0000
"Kay, Allen M" <allen.m.kay@intel.com> wrote:

> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Tuesday, February 09, 2016 10:01 PM
> > To: Kay, Allen M <allen.m.kay@intel.com>
> > Cc: ehabkost@redhat.com; mst@redhat.com;
> > stefano.stabellini@eu.citrix.com; qemu-devel@nongnu.org;
> > pbonzini@redhat.com; rth@twiddle.net; Ruan, Shuai
> > <shuai.ruan@intel.com>
> > Subject: Re: [Qemu-devel] [v10][PATCH 03/10] piix: create host bridge to
> > passthrough
> > 
> > I can run that system as either primary or secondary.  On both systems I'm
> > using pci-stub to keep i915 from interfering on the host, on the SNB laptop I
> > also use video=efifb:off since it's configured for UEFI boot, I'm not sure if
> > that play any role in why it's not working.
> >   
> 
> Hi Alex,
> 
> My understand if that your IVB system is a desktop running legacy mode BIOS and your SNB is a laptop running EFI mode BIOS, correct?  I'm curious how does your SNB system getting hold of VBT table, which is needed for lighting up local display.
> 
> There are two ways the driver can get VBT table:  1) VGA 0xc0000 memory,  2) OpRegion
> 
> On your IVB system, the guest driver would try to get VBT from 0xc0000 memory if IGD is configured as primary controller in the guest, assuming KVM/QEMU copies VGA 0xc0000 memory from host to guest 0xc0000 area.   If IGD is configured as secondary controller, the driver would get VBT from OpRegion.  Given IGD works in both configurations on IVB, this mean guest driver can successfully read VBT from both 0xc0000 area and OpRegion.
> 
> On your SNB system that is running in EFI mode on the host, 0xc0000 memory does not contain VBIOS/VBT.  This means if you configure IGD as primary controller in the guest, the driver cannot get to VBT at 0xc0000 memory even if KVM/QEMU copies 0xc0000 memory from host to guest.  If you configure IGD as secondary controller in the guest, and guest driver should be able to read VBT from the OpRegion.  You might want to trace i915 to see how does the  guest driver get to VBT via OpRegion in this configuration.

Hi Allen,

Success!  We don't directly pass 0xc0000 from the host, but we can use
PCI option ROMs loaded via files into QEMU.  For the IVB system I
haven't been bothering with this, even in primary mode I've just been
letting the display initialize later in the guest boot.  I also haven't
been enabling VGA mode access in that case.  With the SNB laptop, if I
extract the ROM from sysfs, modify the device ID and fix the checksum,
the panel lights up, with or without VGA mode access (though of course
I only see a flash of SeaBIOS when VGA mode is enabled).  So it seems
that either the user is going to need to hack their own ROM file or I'm
going to need to make vfio do this automatically and pretend that the
ROM is actually implemented as a BAR on IGD.  The latter seems far more
accessible.  Lacking an actual ROM BAR, we'll also of course only be
able to do that when IGD is primary graphics on the host.

I'm not sure how we can do secondary mode in the VM with this config
though since only the primary graphics gets the coveted 0xc0000
location.  FWIW, to make this work I added 'romfile=igd.rom' to the
vfio-pci device with igd.rom being the modified copy retrieved from
sysfs.  Then I used '-vga none' to disable the QEMU primary VGA device
(-nodefaults is probably an option too).  I then added 'addr=2.0' to
the vfio-pci device to make it be the primary graphics from SeaBIOS'
perspective and added '-device secondary-vga,addr=3.0 -vnc :1', which
predominantly gives me a VNC connection where I can interact with the
VM via mouse and keyboard.

I'll need to do some further investigation to see what we're really
getting with the OpRegion.  Prior to adding the ROM, the Xorg log file
sure seems like it knew the LVDS panel was 1920x1080, but it might be
seeing more modes now that it has a video BIOS.  I'll also try to prune
the registers copied into the virtual host bridge and ISA bridge config
space to the minimum we need.

> Another potential problem to watch out for with laptop vs. desktop has to do FLR timeout.  If you are working with a laptop with LCD panel attached (the usual case), FLR will take much longer than 100ms to finish.  The reason is the devices needs to power down the LCD panel before it can finish FLR. I have seen it takes more than 500ms for FLR to finish.  As a work around, I have increase FLR time out in Linux PCI driver to 1000ms when working with LCD panels.   Given that you have seen evidences that IGD HW is working, this might not be your issue.   I would focus tracing how does i915 get VBT from the OpRegion when IGD is configured as secondary controller in the guest.

Ok, I did see evidence of this.  In my case the VM would always fail to
start on the first try with errors in dmesg indicating that vfio was
reading back -1 from config space of the device.  Presumably it was
only the first attempt in my case since previously I was never getting
the panel turned back on and subsequent resets were faster.  I also
notice that I see DMAR faults on the first reset that seem to be
accesses to the host stolen memory region pointed to by the BDSM.  I
don't see this if I reset the device from sysfs before trying to use it
with the VM, so maybe vfio should try to do a reset on probing IGD,
before it gets placed in an IOMMU domain that blocks access to that
host stolen memory.  Of course the next question is whether we can
easily determine whether an IGD has an LCD panel so we know which
timeout to use.  I suppose we could use a 1000ms timeout for all Intel
VGA class devices, but handling it with a quirk only for panel attached
configs would probably be preferable.  Thanks,

Alex
Gerd Hoffmann Feb. 15, 2016, 11:28 a.m. UTC | #10
Hi,

> > I believe this is same as 0x50 but on old Ironlake platform.  Xen started IGD passthrough support in Ironlake.
> 
> Ok, unless anyone shouts really loudly and it doesn't affect anything
> newer than IronLake, I'm happy to let those fall off the plate.  I don't
> have any older systems that I care to make work with this.

We are also arriving at a hardware age where vt-d for igd isn't working
properly, so supporting only Ironlake+ make sense to me.

cheers,
  Gerd
diff mbox

Patch

diff --git a/hw/pci-host/piix.c b/hw/pci-host/piix.c
index a203d93..7adf645 100644
--- a/hw/pci-host/piix.c
+++ b/hw/pci-host/piix.c
@@ -734,6 +734,87 @@  static const TypeInfo i440fx_info = {
     .class_init    = i440fx_class_init,
 };
 
+/* IGD Passthrough Host Bridge. */
+typedef struct {
+    uint8_t offset;
+    uint8_t len;
+} IGDHostInfo;
+
+/* Here we just expose minimal host bridge offset subset. */
+static const IGDHostInfo igd_host_bridge_infos[] = {
+    {0x08, 2},  /* revision id */
+    {0x2c, 2},  /* sybsystem vendor id */
+    {0x2e, 2},  /* sybsystem id */
+    {0x50, 2},  /* SNB: processor graphics control register */
+    {0x52, 2},  /* processor graphics control register */
+    {0xa4, 4},  /* SNB: graphics base of stolen memory */
+    {0xa8, 4},  /* SNB: base of GTT stolen memory */
+};
+
+static int host_pci_config_read(int pos, int len, uint32_t val)
+{
+    char path[PATH_MAX];
+    int config_fd;
+    ssize_t size = sizeof(path);
+    /* Access real host bridge. */
+    int rc = snprintf(path, size, "/sys/bus/pci/devices/%04x:%02x:%02x.%d/%s",
+                      0, 0, 0, 0, "config");
+
+    if (rc >= size || rc < 0) {
+        return -ENODEV;
+    }
+
+    config_fd = open(path, O_RDWR);
+    if (config_fd < 0) {
+        return -ENODEV;
+    }
+
+    do {
+        rc = pread(config_fd, (uint8_t *)&val, len, pos);
+    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
+    if (rc != len) {
+        return -errno;
+    }
+
+    return 0;
+}
+
+static int igd_pt_i440fx_initfn(struct PCIDevice *pci_dev)
+{
+    uint32_t val = 0;
+    int rc, i, num;
+    int pos, len;
+
+    num = ARRAY_SIZE(igd_host_bridge_infos);
+    for (i = 0; i < num; i++) {
+        pos = igd_host_bridge_infos[i].offset;
+        len = igd_host_bridge_infos[i].len;
+        rc = host_pci_config_read(pos, len, val);
+        if (rc) {
+            return -ENODEV;
+        }
+        pci_default_write_config(pci_dev, pos, val, len);
+    }
+
+    return 0;
+}
+
+static void igd_passthrough_i440fx_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+    k->init = igd_pt_i440fx_initfn;
+    dc->desc = "IGD Passthrough Host bridge";
+}
+
+static const TypeInfo igd_passthrough_i440fx_info = {
+    .name          = TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE,
+    .parent        = TYPE_I440FX_PCI_DEVICE,
+    .instance_size = sizeof(PCII440FXState),
+    .class_init    = igd_passthrough_i440fx_class_init,
+};
+
 static const char *i440fx_pcihost_root_bus_path(PCIHostState *host_bridge,
                                                 PCIBus *rootbus)
 {
@@ -775,6 +856,7 @@  static const TypeInfo i440fx_pcihost_info = {
 static void i440fx_register_types(void)
 {
     type_register_static(&i440fx_info);
+    type_register_static(&igd_passthrough_i440fx_info);
     type_register_static(&piix3_pci_type_info);
     type_register_static(&piix3_info);
     type_register_static(&piix3_xen_info);
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 449bf4a..2db3f6d 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -230,6 +230,8 @@  typedef struct PCII440FXState PCII440FXState;
 #define TYPE_I440FX_PCI_HOST_BRIDGE "i440FX-pcihost"
 #define TYPE_I440FX_PCI_DEVICE "i440FX"
 
+#define TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE "igd-passthrough-i440FX"
+
 PCIBus *i440fx_init(const char *host_type, const char *pci_type,
                     PCII440FXState **pi440fx_state, int *piix_devfn,
                     ISABus **isa_bus, qemu_irq *pic,