Patchwork [v2] PIIX3: reset the VM when the Reset Control Register's RCPU bit gets set

login
register
mail settings
Submitter Laszlo Ersek
Date Jan. 16, 2013, 6:40 p.m.
Message ID <1358361619-10998-1-git-send-email-lersek@redhat.com>
Download mbox | patch
Permalink /patch/212956/
State New
Headers show

Comments

Laszlo Ersek - Jan. 16, 2013, 6:40 p.m.
From <http://mjg59.dreamwidth.org/3561.html>:

  Traditional PCI config space access is achieved by writing a 32 bit
  value to io port 0xcf8 to identify the bus, device, function and config
  register. Port 0xcfc then contains the register in question. But if you
  write the appropriate pair of magic values to 0xcf9, the machine will
  reboot. Spectacular! And not standardised in any way (certainly not part
  of the PCI spec), so different chipsets may have different requirements.
  Booo.

In the PIIX3 spec, IO port 0xcf9 is specified as the Reset Control
Register. Bit 1 (System Reset, SRST) would normally differentiate between
soft reset and hard reset, but we ignore the difference beyond allowing
the guest to read it back.

RHBZ reference: 890459

This patch introduces the following overlap between the preexistent
"pci-conf-idx" region and the "piix3-reset-control" region just being
added. Partial output from "info mtree":

  I/O
  0000000000000000-000000000000ffff (prio 0, RW): io
    0000000000000cf8-0000000000000cfb (prio 0, RW): pci-conf-idx
    0000000000000cf9-0000000000000cf9 (prio 1, RW): piix3-reset-control

I sanity-checked the patch by booting a RHEL-6.3 guest and found no
problems. I summoned gdb and set a breakpoint on rcr_write() in order to
gather a bit more confidence. Relevant frames of the stack:

  kvm_handle_io (port=3321, data=0x7f3f5f3de000, direction=1, size=1,
                 count=1)                                 [kvm-all.c:1422]
    cpu_outb (addr=3321, val=6 '\006')                      [ioport.c:289]
      ioport_write (index=0, address=3321, data=6)           [ioport.c:83]
        ioport_writeb_thunk (opaque=0x7f3f622c4680, addr=3321, data=6)
                                                            [ioport.c:212]
          memory_region_iorange_write (iorange=0x7f3f622c4680, offset=0,
                                       width=1, data=6)     [memory.c:439]
            access_with_adjusted_size (addr=0, value=0x7f3f531fbac0,
                                       size=1, access_size_min=1,
                                       access_size_max=4,
                                       access=0x7f3f5f6e0f90
                                           <memory_region_write_accessor>,
                                       opaque=0x7f3f6227b668)
                                                            [memory.c:364]
              memory_region_write_accessor (opaque=0x7f3f6227b668, addr=0,
                                            value=0x7f3f531fbac0, size=1,
                                            shift=0, mask=255)
                                                            [memory.c:334]
                rcr_write (opaque=0x7f3f6227afb0, addr=0, val=6, len=1)
                                                       [hw/piix_pci.c:498]

The dispatch happens in ioport_write(); "index=0" means byte-wide access:

    static void ioport_write(int index, uint32_t address, uint32_t data)
    {
        static IOPortWriteFunc * const default_func[3] = {
            default_ioport_writeb,
            default_ioport_writew,
            default_ioport_writel
        };
        IOPortWriteFunc *func = ioport_write_table[index][address];
        if (!func)
            func = default_func[index];
        func(ioport_opaque[address], address, data);
    }

The "ioport_write_table" and "ioport_opaque" arrays describe the flattened
IO port space. The first array is less interesting (it selects a thunk
function). The "ioport_opaque" array is interesting because it decides how
writing to the port is implemented ultimately.

4-byte wide access to 0xcf8 (pci-conf-idx):

  (gdb) print ioport_write_table[2][0xcf8]
  $1 = (IOPortWriteFunc *) 0x7f3f5f6d99ba <ioport_writel_thunk>

  (gdb) print \
        ((struct MemoryRegionIORange*)ioport_opaque[0xcf8])->mr->ops.write
  $2 = (void (*)(void *, hwaddr, uint64_t, unsigned int))
       0x7f3f5f5575cb <pci_host_config_write>

1-byte wide access to 0xcf9 (piix3-reset-control):

  (gdb) print ioport_write_table[0][0xcf9]
  $3 = (IOPortWriteFunc *) 0x7f3f5f6d98d0 <ioport_writeb_thunk>

  (gdb) print \
        ((struct MemoryRegionIORange*)ioport_opaque[0xcf9])->mr->ops.write
  $4 = (void (*)(void *, hwaddr, uint64_t, unsigned int))
       0x7f3f5f6b42f1 <rcr_write>

The higher priority of "piix3-reset-control" ensures that the 0xcf9
entries in ioport_write_table / ioport_opaque will always belong to it,
independently of its relative registration order versus "pci-conf-idx".

Signed-off-by: Laszlo Ersek <lersek@redhat.com>
---
 hw/piix_pci.c |   69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 69 insertions(+), 0 deletions(-)
Stefan Hajnoczi - Jan. 23, 2013, 8:36 a.m.
On Wed, Jan 16, 2013 at 07:40:19PM +0100, Laszlo Ersek wrote:
>  static int piix3_post_load(void *opaque, int version_id)
>  {
>      PIIX3State *piix3 = opaque;
>      piix3_update_irq_levels(piix3);
> +    piix3->rcr &= 2; /* keep System Reset type only */
>      return 0;
>  }

Is this necessary?  I think only an evil migration source could set
value not in {0x0, 0x2}.  And if so, it doesn't seem like our job to
validate that.

> +static void rcr_write(void *opaque, hwaddr addr, uint64_t val, unsigned len)
> +{
> +    PIIX3State *d = opaque;
> +
> +    if (val & 4) {
> +        qemu_system_reset_request();
> +        return;
> +    }
> +    d->rcr = val & 2; /* keep System Reset type only */
> +}

We don't preserve d->rcr across reset:

(qemu) o /b 0xcf9 2
rcr_write val 0x2 rcr 0x2
(qemu) o /b 0xcf9 4
piix3_reset rcr = 0
piix3_reset rcr = 0

Is this okay?

Stefan
Laszlo Ersek - Jan. 23, 2013, 10:46 a.m.
On 01/23/13 09:36, Stefan Hajnoczi wrote:
> On Wed, Jan 16, 2013 at 07:40:19PM +0100, Laszlo Ersek wrote:
>>  static int piix3_post_load(void *opaque, int version_id)
>>  {
>>      PIIX3State *piix3 = opaque;
>>      piix3_update_irq_levels(piix3);
>> +    piix3->rcr &= 2; /* keep System Reset type only */
>>      return 0;
>>  }
> 
> Is this necessary?  I think only an evil migration source could set
> value not in {0x0, 0x2}.

I wanted to make sure in general that no write path to "piix3->rcr"
would let an invalid value through.

> And if so, it doesn't seem like our job to
> validate that.

Independently of the patch: why not?

>> +static void rcr_write(void *opaque, hwaddr addr, uint64_t val, unsigned len)
>> +{
>> +    PIIX3State *d = opaque;
>> +
>> +    if (val & 4) {
>> +        qemu_system_reset_request();
>> +        return;
>> +    }
>> +    d->rcr = val & 2; /* keep System Reset type only */
>> +}
> 
> We don't preserve d->rcr across reset:
> 
> (qemu) o /b 0xcf9 2
> rcr_write val 0x2 rcr 0x2
> (qemu) o /b 0xcf9 4
> piix3_reset rcr = 0
> piix3_reset rcr = 0
> 
> Is this okay?

Yes, that was my intent with modifying piix3_reset().

Thanks,
Laszlo
Stefan Hajnoczi - Jan. 23, 2013, 12:51 p.m.
On Wed, Jan 23, 2013 at 11:46:48AM +0100, Laszlo Ersek wrote:
> On 01/23/13 09:36, Stefan Hajnoczi wrote:
> > On Wed, Jan 16, 2013 at 07:40:19PM +0100, Laszlo Ersek wrote:
> >>  static int piix3_post_load(void *opaque, int version_id)
> >>  {
> >>      PIIX3State *piix3 = opaque;
> >>      piix3_update_irq_levels(piix3);
> >> +    piix3->rcr &= 2; /* keep System Reset type only */
> >>      return 0;
> >>  }
> > 
> > Is this necessary?  I think only an evil migration source could set
> > value not in {0x0, 0x2}.
> 
> I wanted to make sure in general that no write path to "piix3->rcr"
> would let an invalid value through.
> 
> > And if so, it doesn't seem like our job to
> > validate that.
> 
> Independently of the patch: why not?

The migration source and the guest are both untrusted.  We just need to
protect against QEMU crashing or doing something unsafe which could lead
to security problems or data corruption.

An invalid value in this register means that the guest sees a junk value
but it's nothing dangerous that QEMU needs to protect against.

The migration source can already create a junk guest.  Storing a junk
value in this register is no worse.

We don't validate other migrated values, so it stood out to me.  I
wasn't sure if there's some subtle reason why we need to do this.

Consistently validating all register values in a separate patch would be
okay, but having just this one line is confusing.

Stefan

Patch

diff --git a/hw/piix_pci.c b/hw/piix_pci.c
index 3d79c73..38a1027 100644
--- a/hw/piix_pci.c
+++ b/hw/piix_pci.c
@@ -31,6 +31,7 @@ 
 #include "qemu/range.h"
 #include "xen.h"
 #include "pam.h"
+#include "sysemu/sysemu.h"
 
 /*
  * I440FX chipset data sheet.
@@ -46,6 +47,12 @@  typedef struct I440FXState {
 #define XEN_PIIX_NUM_PIRQS      128ULL
 #define PIIX_PIRQC              0x60
 
+/*
+ * Reset Control Register: PCI-accessible ISA-Compatible Register at address
+ * 0xcf9, provided by the PCI/ISA bridge (PIIX3 PCI function 0, 8086:7000).
+ */
+#define RCR_IOPORT 0xcf9
+
 typedef struct PIIX3State {
     PCIDevice dev;
 
@@ -67,6 +74,12 @@  typedef struct PIIX3State {
 
     /* This member isn't used. Just for save/load compatibility */
     int32_t pci_irq_levels_vmstate[PIIX_NUM_PIRQS];
+
+    /* Reset Control Register contents */
+    uint8_t rcr;
+
+    /* IO memory region for Reset Control Register (RCR_IOPORT) */
+    MemoryRegion rcr_mem;
 } PIIX3State;
 
 struct PCII440FXState {
@@ -442,12 +455,14 @@  static void piix3_reset(void *opaque)
     pci_conf[0xae] = 0x00;
 
     d->pic_levels = 0;
+    d->rcr = 0;
 }
 
 static int piix3_post_load(void *opaque, int version_id)
 {
     PIIX3State *piix3 = opaque;
     piix3_update_irq_levels(piix3);
+    piix3->rcr &= 2; /* keep System Reset type only */
     return 0;
 }
 
@@ -462,6 +477,23 @@  static void piix3_pre_save(void *opaque)
     }
 }
 
+static bool piix3_rcr_needed(void *opaque)
+{
+    PIIX3State *piix3 = opaque;
+
+    return (piix3->rcr != 0);
+}
+
+static const VMStateDescription vmstate_piix3_rcr = {
+    .name = "PIIX3/rcr",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField []) {
+        VMSTATE_UINT8(rcr, PIIX3State),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_piix3 = {
     .name = "PIIX3",
     .version_id = 3,
@@ -474,14 +506,51 @@  static const VMStateDescription vmstate_piix3 = {
         VMSTATE_INT32_ARRAY_V(pci_irq_levels_vmstate, PIIX3State,
                               PIIX_NUM_PIRQS, 3),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (VMStateSubsection []) {
+        {
+            .vmsd = &vmstate_piix3_rcr,
+            .needed = piix3_rcr_needed,
+        },
+        { 0 }
     }
 };
 
+
+static void rcr_write(void *opaque, hwaddr addr, uint64_t val, unsigned len)
+{
+    PIIX3State *d = opaque;
+
+    if (val & 4) {
+        qemu_system_reset_request();
+        return;
+    }
+    d->rcr = val & 2; /* keep System Reset type only */
+}
+
+static uint64_t rcr_read(void *opaque, hwaddr addr, unsigned len)
+{
+    PIIX3State *d = opaque;
+
+    return d->rcr;
+}
+
+static const MemoryRegionOps rcr_ops = {
+    .read = rcr_read,
+    .write = rcr_write,
+    .endianness = DEVICE_LITTLE_ENDIAN
+};
+
 static int piix3_initfn(PCIDevice *dev)
 {
     PIIX3State *d = DO_UPCAST(PIIX3State, dev, dev);
 
     isa_bus_new(&d->dev.qdev, pci_address_space_io(dev));
+
+    memory_region_init_io(&d->rcr_mem, &rcr_ops, d, "piix3-reset-control", 1);
+    memory_region_add_subregion_overlap(pci_address_space_io(dev), RCR_IOPORT,
+                                        &d->rcr_mem, 1);
+
     qemu_register_reset(piix3_reset, d);
     return 0;
 }