diff mbox series

[qemu] ppc/spapr: Receive and store device tree blob from SLOF

Message ID 20181108014406.29818-1-aik@ozlabs.ru
State New
Headers show
Series [qemu] ppc/spapr: Receive and store device tree blob from SLOF | expand

Commit Message

Alexey Kardashevskiy Nov. 8, 2018, 1:44 a.m. UTC
SLOF receives a device tree and updates it with various properties
before switching to the guest kernel and QEMU is not aware of any changes
made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
sense to pass the SLOF final device tree to QEMU to let it implement
RTAS related tasks better, such as PCI host bus adapter hotplug.

Specifially, now QEMU can find out the actual XICS phandle (for PHB
hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
assisted NMI - FWNMI).

This stores the initial DT blob in the sPAPR machine and replaces it
in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.

This adds an @update_dt_enabled machine property to allow backward
migration.

SLOF already has a hypercall since
https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 include/hw/ppc/spapr.h |  7 ++++++-
 hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
 hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
 hw/ppc/trace-events    |  2 ++
 4 files changed, 68 insertions(+), 2 deletions(-)

Comments

Greg Kurz Nov. 11, 2018, 6:10 p.m. UTC | #1
Hi Alexey,

Just a few remarks. See below.

On Thu,  8 Nov 2018 12:44:06 +1100
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> SLOF receives a device tree and updates it with various properties
> before switching to the guest kernel and QEMU is not aware of any changes
> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
> sense to pass the SLOF final device tree to QEMU to let it implement
> RTAS related tasks better, such as PCI host bus adapter hotplug.
> 
> Specifially, now QEMU can find out the actual XICS phandle (for PHB
> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
> assisted NMI - FWNMI).
> 
> This stores the initial DT blob in the sPAPR machine and replaces it
> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
> 
> This adds an @update_dt_enabled machine property to allow backward
> migration.
> 
> SLOF already has a hypercall since
> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  include/hw/ppc/spapr.h |  7 ++++++-
>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
>  hw/ppc/trace-events    |  2 ++
>  4 files changed, 68 insertions(+), 2 deletions(-)
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index ad4d7cfd97..f5dcaf44cb 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
>  
>      /*< public >*/
>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
>      bool pre_2_10_has_unused_icps;
>      bool legacy_irq_allocation;
> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
>      int vrma_adjust;
>      ssize_t rtas_size;
>      void *rtas_blob;
> +    uint32_t fdt_size;
> +    uint32_t fdt_initial_size;

I don't quite see the purpose of fdt_initial_size... it seems to be only
used to print a trace.

> +    void *fdt_blob;
>      long kernel_size;
>      bool kernel_le;
>      uint32_t initrd_base;
> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
>  /* Client Architecture support */
>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>  
>  typedef struct sPAPRDeviceTreeUpdateHeader {
>      uint32_t version_id;
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index c08130facb..5e2d4d211c 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
>      /* Load the fdt */
>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> -    g_free(fdt);
> +    g_free(spapr->fdt_blob);
> +    spapr->fdt_size = fdt_totalsize(fdt);
> +    spapr->fdt_initial_size = spapr->fdt_size;
> +    spapr->fdt_blob = fdt;

Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
both fdt_blob and fdt_size here.

>  
>      /* Set up the entry state */
>      spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, fdt_addr);
> @@ -1887,6 +1890,27 @@ static const VMStateDescription vmstate_spapr_irq_map = {
>      },
>  };
>  
> +static bool spapr_dtb_needed(void *opaque)
> +{
> +    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
> +
> +    return smc->update_dt_enabled;

This means we always migrate the fdt, even if migration occurs before
SLOF could call KVMPPC_H_UPDATE_DT.

With spapr->fdt_blob set to NULL on reset, a better check would be:

    sPAPRMachineState *spapr = SPAPR_MACHINE(opaque);

    return smc->update_dt_enabled && spapr->fdt_blob;

> +}
> +
> +static const VMStateDescription vmstate_spapr_dtb = {
> +    .name = "spapr_dtb",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = spapr_dtb_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT32(fdt_initial_size, sPAPRMachineState),
> +        VMSTATE_UINT32(fdt_size, sPAPRMachineState),
> +        VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, sPAPRMachineState, 0, NULL,
> +                                     fdt_size),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  static const VMStateDescription vmstate_spapr = {
>      .name = "spapr",
>      .version_id = 3,
> @@ -1915,6 +1939,7 @@ static const VMStateDescription vmstate_spapr = {
>          &vmstate_spapr_cap_sbbc,
>          &vmstate_spapr_cap_ibs,
>          &vmstate_spapr_irq_map,
> +        &vmstate_spapr_dtb,

This needs to be rebased.

<<<<<<<
        &vmstate_spapr_cap_nested_kvm_hv,
=======
        &vmstate_spapr_dtb,
>>>>>>>


I'll try to find some time to respin the PHB hotplug series and I'll happily
give a try to this patch.

>          NULL
>      }
>  };
> @@ -3849,6 +3874,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>      hc->unplug = spapr_machine_device_unplug;
>  
>      smc->dr_lmb_enabled = true;
> +    smc->update_dt_enabled = true;
>      mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
>      mc->has_hotpluggable_cpus = true;
>      smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
> @@ -3965,6 +3991,7 @@ static void spapr_machine_3_0_class_options(MachineClass *mc)
>  
>      smc->legacy_irq_allocation = true;
>      smc->irq = &spapr_irq_xics_legacy;
> +    smc->update_dt_enabled = false;
>  }
>  
>  DEFINE_SPAPR_MACHINE(3_0, "3.0", false);
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index ae913d070f..d5833f3f8d 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -1717,6 +1717,36 @@ static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
>  
>      args[0] = characteristics;
>      args[1] = behaviour;
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_update_dt(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> +                                target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong dt = ppc64_phys_to_real(args[0]);
> +    struct fdt_header hdr = { 0 };
> +    unsigned cb;
> +    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
> +
> +    cpu_physical_memory_read(dt, &hdr, sizeof(hdr));
> +    cb = fdt32_to_cpu(hdr.totalsize);
> +
> +    if (fdt_check_full(spapr->fdt_blob, cb)) {
> +        trace_spapr_update_dt_failed(spapr->fdt_initial_size, cb,
> +            fdt32_to_cpu(hdr.magic));
> +        return H_PARAMETER;
> +    }
> +
> +    if (!smc->update_dt_enabled) {
> +        return H_SUCCESS;
> +    }
> +
> +    g_free(spapr->fdt_blob);
> +    spapr->fdt_size = cb;
> +    spapr->fdt_blob = g_malloc0(cb);
> +    cpu_physical_memory_read(dt, spapr->fdt_blob, cb);
> +
> +    trace_spapr_update_dt(cb);
>  
>      return H_SUCCESS;
>  }
> @@ -1822,6 +1852,8 @@ static void hypercall_register_types(void)
>  
>      /* ibm,client-architecture-support support */
>      spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
> +
> +    spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt);
>  }
>  
>  type_init(hypercall_register_types)
> diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
> index dc5e65aee9..4432a5ce74 100644
> --- a/hw/ppc/trace-events
> +++ b/hw/ppc/trace-events
> @@ -22,6 +22,8 @@ spapr_cas_pvr_try(uint32_t pvr) "0x%x"
>  spapr_cas_pvr(uint32_t cur_pvr, bool explicit_match, uint32_t new_pvr) "current=0x%x, explicit_match=%u, new=0x%x"
>  spapr_h_resize_hpt_prepare(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
>  spapr_h_resize_hpt_commit(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
> +spapr_update_dt(unsigned cb) "New blob %u bytes"
> +spapr_update_dt_failed(unsigned cbold, unsigned cbnew, unsigned magic) "Old blob %u bytes, new blob %u bytes, magic 0x%x"
>  
>  # hw/ppc/spapr_iommu.c
>  spapr_iommu_put(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "liobn=0x%"PRIx64" ioba=0x%"PRIx64" tce=0x%"PRIx64" ret=%"PRId64
Alexey Kardashevskiy Nov. 12, 2018, 4:12 a.m. UTC | #2
On 12/11/2018 05:10, Greg Kurz wrote:
> Hi Alexey,
> 
> Just a few remarks. See below.
> 
> On Thu,  8 Nov 2018 12:44:06 +1100
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
>> SLOF receives a device tree and updates it with various properties
>> before switching to the guest kernel and QEMU is not aware of any changes
>> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
>> sense to pass the SLOF final device tree to QEMU to let it implement
>> RTAS related tasks better, such as PCI host bus adapter hotplug.
>>
>> Specifially, now QEMU can find out the actual XICS phandle (for PHB
>> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
>> assisted NMI - FWNMI).
>>
>> This stores the initial DT blob in the sPAPR machine and replaces it
>> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
>>
>> This adds an @update_dt_enabled machine property to allow backward
>> migration.
>>
>> SLOF already has a hypercall since
>> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  include/hw/ppc/spapr.h |  7 ++++++-
>>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
>>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
>>  hw/ppc/trace-events    |  2 ++
>>  4 files changed, 68 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index ad4d7cfd97..f5dcaf44cb 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
>>  
>>      /*< public >*/
>>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
>> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
>>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
>>      bool pre_2_10_has_unused_icps;
>>      bool legacy_irq_allocation;
>> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
>>      int vrma_adjust;
>>      ssize_t rtas_size;
>>      void *rtas_blob;
>> +    uint32_t fdt_size;
>> +    uint32_t fdt_initial_size;
> 
> I don't quite see the purpose of fdt_initial_size... it seems to be only
> used to print a trace.


Ah, lost in rebase. The purpose was to test if the new device tree has
not grown too much.



> 
>> +    void *fdt_blob;
>>      long kernel_size;
>>      bool kernel_le;
>>      uint32_t initrd_base;
>> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
>>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
>>  /* Client Architecture support */
>>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
>> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>>  
>>  typedef struct sPAPRDeviceTreeUpdateHeader {
>>      uint32_t version_id;
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index c08130facb..5e2d4d211c 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
>>      /* Load the fdt */
>>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
>>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>> -    g_free(fdt);
>> +    g_free(spapr->fdt_blob);
>> +    spapr->fdt_size = fdt_totalsize(fdt);
>> +    spapr->fdt_initial_size = spapr->fdt_size;
>> +    spapr->fdt_blob = fdt;
> 
> Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
> both fdt_blob and fdt_size here.


The device tree is built from the reset handler and the idea is that we
want to always have some tree in the machine.



> 
>>  
>>      /* Set up the entry state */
>>      spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, fdt_addr);
>> @@ -1887,6 +1890,27 @@ static const VMStateDescription vmstate_spapr_irq_map = {
>>      },
>>  };
>>  
>> +static bool spapr_dtb_needed(void *opaque)
>> +{
>> +    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
>> +
>> +    return smc->update_dt_enabled;
> 
> This means we always migrate the fdt, even if migration occurs before
> SLOF could call KVMPPC_H_UPDATE_DT.
> 
> With spapr->fdt_blob set to NULL on reset, a better check would be:
> 
>     sPAPRMachineState *spapr = SPAPR_MACHINE(opaque);
> 
>     return smc->update_dt_enabled && spapr->fdt_blob;
> 
>> +}
>> +
>> +static const VMStateDescription vmstate_spapr_dtb = {
>> +    .name = "spapr_dtb",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .needed = spapr_dtb_needed,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_UINT32(fdt_initial_size, sPAPRMachineState),
>> +        VMSTATE_UINT32(fdt_size, sPAPRMachineState),
>> +        VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, sPAPRMachineState, 0, NULL,
>> +                                     fdt_size),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>>  static const VMStateDescription vmstate_spapr = {
>>      .name = "spapr",
>>      .version_id = 3,
>> @@ -1915,6 +1939,7 @@ static const VMStateDescription vmstate_spapr = {
>>          &vmstate_spapr_cap_sbbc,
>>          &vmstate_spapr_cap_ibs,
>>          &vmstate_spapr_irq_map,
>> +        &vmstate_spapr_dtb,
> 
> This needs to be rebased.
> 
> <<<<<<<
>         &vmstate_spapr_cap_nested_kvm_hv,
> =======
>         &vmstate_spapr_dtb,
>>>>>>>>
> 
> 
> I'll try to find some time to respin the PHB hotplug series and I'll happily
> give a try to this patch.


Good :)


> 
>>          NULL
>>      }
>>  };
>> @@ -3849,6 +3874,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>>      hc->unplug = spapr_machine_device_unplug;
>>  
>>      smc->dr_lmb_enabled = true;
>> +    smc->update_dt_enabled = true;
>>      mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
>>      mc->has_hotpluggable_cpus = true;
>>      smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
>> @@ -3965,6 +3991,7 @@ static void spapr_machine_3_0_class_options(MachineClass *mc)
>>  
>>      smc->legacy_irq_allocation = true;
>>      smc->irq = &spapr_irq_xics_legacy;
>> +    smc->update_dt_enabled = false;
>>  }
>>  
>>  DEFINE_SPAPR_MACHINE(3_0, "3.0", false);
>> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
>> index ae913d070f..d5833f3f8d 100644
>> --- a/hw/ppc/spapr_hcall.c
>> +++ b/hw/ppc/spapr_hcall.c
>> @@ -1717,6 +1717,36 @@ static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
>>  
>>      args[0] = characteristics;
>>      args[1] = behaviour;
>> +    return H_SUCCESS;
>> +}
>> +
>> +static target_ulong h_update_dt(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>> +                                target_ulong opcode, target_ulong *args)
>> +{
>> +    target_ulong dt = ppc64_phys_to_real(args[0]);
>> +    struct fdt_header hdr = { 0 };
>> +    unsigned cb;
>> +    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
>> +
>> +    cpu_physical_memory_read(dt, &hdr, sizeof(hdr));
>> +    cb = fdt32_to_cpu(hdr.totalsize);
>> +
>> +    if (fdt_check_full(spapr->fdt_blob, cb)) {
>> +        trace_spapr_update_dt_failed(spapr->fdt_initial_size, cb,
>> +            fdt32_to_cpu(hdr.magic));
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    if (!smc->update_dt_enabled) {
>> +        return H_SUCCESS;
>> +    }
>> +
>> +    g_free(spapr->fdt_blob);
>> +    spapr->fdt_size = cb;
>> +    spapr->fdt_blob = g_malloc0(cb);
>> +    cpu_physical_memory_read(dt, spapr->fdt_blob, cb);
>> +
>> +    trace_spapr_update_dt(cb);
>>  
>>      return H_SUCCESS;
>>  }
>> @@ -1822,6 +1852,8 @@ static void hypercall_register_types(void)
>>  
>>      /* ibm,client-architecture-support support */
>>      spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
>> +
>> +    spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt);
>>  }
>>  
>>  type_init(hypercall_register_types)
>> diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
>> index dc5e65aee9..4432a5ce74 100644
>> --- a/hw/ppc/trace-events
>> +++ b/hw/ppc/trace-events
>> @@ -22,6 +22,8 @@ spapr_cas_pvr_try(uint32_t pvr) "0x%x"
>>  spapr_cas_pvr(uint32_t cur_pvr, bool explicit_match, uint32_t new_pvr) "current=0x%x, explicit_match=%u, new=0x%x"
>>  spapr_h_resize_hpt_prepare(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
>>  spapr_h_resize_hpt_commit(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
>> +spapr_update_dt(unsigned cb) "New blob %u bytes"
>> +spapr_update_dt_failed(unsigned cbold, unsigned cbnew, unsigned magic) "Old blob %u bytes, new blob %u bytes, magic 0x%x"
>>  
>>  # hw/ppc/spapr_iommu.c
>>  spapr_iommu_put(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "liobn=0x%"PRIx64" ioba=0x%"PRIx64" tce=0x%"PRIx64" ret=%"PRId64
>
Greg Kurz Nov. 12, 2018, 9:05 a.m. UTC | #3
On Mon, 12 Nov 2018 15:12:26 +1100
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> On 12/11/2018 05:10, Greg Kurz wrote:
> > Hi Alexey,
> > 
> > Just a few remarks. See below.
> > 
> > On Thu,  8 Nov 2018 12:44:06 +1100
> > Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >   
> >> SLOF receives a device tree and updates it with various properties
> >> before switching to the guest kernel and QEMU is not aware of any changes
> >> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
> >> sense to pass the SLOF final device tree to QEMU to let it implement
> >> RTAS related tasks better, such as PCI host bus adapter hotplug.
> >>
> >> Specifially, now QEMU can find out the actual XICS phandle (for PHB
> >> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
> >> assisted NMI - FWNMI).
> >>
> >> This stores the initial DT blob in the sPAPR machine and replaces it
> >> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
> >>
> >> This adds an @update_dt_enabled machine property to allow backward
> >> migration.
> >>
> >> SLOF already has a hypercall since
> >> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>  include/hw/ppc/spapr.h |  7 ++++++-
> >>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
> >>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
> >>  hw/ppc/trace-events    |  2 ++
> >>  4 files changed, 68 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >> index ad4d7cfd97..f5dcaf44cb 100644
> >> --- a/include/hw/ppc/spapr.h
> >> +++ b/include/hw/ppc/spapr.h
> >> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
> >>  
> >>      /*< public >*/
> >>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
> >> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
> >>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
> >>      bool pre_2_10_has_unused_icps;
> >>      bool legacy_irq_allocation;
> >> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
> >>      int vrma_adjust;
> >>      ssize_t rtas_size;
> >>      void *rtas_blob;
> >> +    uint32_t fdt_size;
> >> +    uint32_t fdt_initial_size;  
> > 
> > I don't quite see the purpose of fdt_initial_size... it seems to be only
> > used to print a trace.  
> 
> 
> Ah, lost in rebase. The purpose was to test if the new device tree has
> not grown too much.
> 

Ok, makes sense during development.

> 
> 
> >   
> >> +    void *fdt_blob;
> >>      long kernel_size;
> >>      bool kernel_le;
> >>      uint32_t initrd_base;
> >> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
> >>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
> >>  /* Client Architecture support */
> >>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> >> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
> >> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> >> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> >>  
> >>  typedef struct sPAPRDeviceTreeUpdateHeader {
> >>      uint32_t version_id;
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index c08130facb..5e2d4d211c 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
> >>      /* Load the fdt */
> >>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> >>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> >> -    g_free(fdt);
> >> +    g_free(spapr->fdt_blob);
> >> +    spapr->fdt_size = fdt_totalsize(fdt);
> >> +    spapr->fdt_initial_size = spapr->fdt_size;
> >> +    spapr->fdt_blob = fdt;  
> > 
> > Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
> > both fdt_blob and fdt_size here.  
> 
> 
> The device tree is built from the reset handler and the idea is that we
> want to always have some tree in the machine.
> 

Yes of course, I forgot that we need to keep the fdt to be kept
somewhere so that we can use it :). My remark has more to do
with migration actually: the fdt built at reset time is supposed
to derive from the command line and hot-(un)plugged devices, ie,
identical in source and destination. This isn't state we should
migrate IIUC.

Maybe add a boolean field that tells that the fdt was updated, use
it in spapr_dtb_needed() and reset it in spapr_machine_reset() ?

> 
> 
> >   
> >>  
> >>      /* Set up the entry state */
> >>      spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, fdt_addr);
> >> @@ -1887,6 +1890,27 @@ static const VMStateDescription vmstate_spapr_irq_map = {
> >>      },
> >>  };
> >>  
> >> +static bool spapr_dtb_needed(void *opaque)
> >> +{
> >> +    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
> >> +
> >> +    return smc->update_dt_enabled;  
> > 
> > This means we always migrate the fdt, even if migration occurs before
> > SLOF could call KVMPPC_H_UPDATE_DT.
> > 
> > With spapr->fdt_blob set to NULL on reset, a better check would be:
> > 
> >     sPAPRMachineState *spapr = SPAPR_MACHINE(opaque);
> > 
> >     return smc->update_dt_enabled && spapr->fdt_blob;
> >   
> >> +}
> >> +
> >> +static const VMStateDescription vmstate_spapr_dtb = {
> >> +    .name = "spapr_dtb",
> >> +    .version_id = 1,
> >> +    .minimum_version_id = 1,
> >> +    .needed = spapr_dtb_needed,
> >> +    .fields = (VMStateField[]) {
> >> +        VMSTATE_UINT32(fdt_initial_size, sPAPRMachineState),
> >> +        VMSTATE_UINT32(fdt_size, sPAPRMachineState),
> >> +        VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, sPAPRMachineState, 0, NULL,
> >> +                                     fdt_size),
> >> +        VMSTATE_END_OF_LIST()
> >> +    },
> >> +};
> >> +
> >>  static const VMStateDescription vmstate_spapr = {
> >>      .name = "spapr",
> >>      .version_id = 3,
> >> @@ -1915,6 +1939,7 @@ static const VMStateDescription vmstate_spapr = {
> >>          &vmstate_spapr_cap_sbbc,
> >>          &vmstate_spapr_cap_ibs,
> >>          &vmstate_spapr_irq_map,
> >> +        &vmstate_spapr_dtb,  
> > 
> > This needs to be rebased.
> > 
> > <<<<<<<
> >         &vmstate_spapr_cap_nested_kvm_hv,
> > =======
> >         &vmstate_spapr_dtb,  
> >>>>>>>>  
> > 
> > 
> > I'll try to find some time to respin the PHB hotplug series and I'll happily
> > give a try to this patch.  
> 
> 
> Good :)
> 
> 
> >   
> >>          NULL
> >>      }
> >>  };
> >> @@ -3849,6 +3874,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
> >>      hc->unplug = spapr_machine_device_unplug;
> >>  
> >>      smc->dr_lmb_enabled = true;
> >> +    smc->update_dt_enabled = true;
> >>      mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
> >>      mc->has_hotpluggable_cpus = true;
> >>      smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
> >> @@ -3965,6 +3991,7 @@ static void spapr_machine_3_0_class_options(MachineClass *mc)
> >>  
> >>      smc->legacy_irq_allocation = true;
> >>      smc->irq = &spapr_irq_xics_legacy;
> >> +    smc->update_dt_enabled = false;
> >>  }
> >>  
> >>  DEFINE_SPAPR_MACHINE(3_0, "3.0", false);
> >> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> >> index ae913d070f..d5833f3f8d 100644
> >> --- a/hw/ppc/spapr_hcall.c
> >> +++ b/hw/ppc/spapr_hcall.c
> >> @@ -1717,6 +1717,36 @@ static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
> >>  
> >>      args[0] = characteristics;
> >>      args[1] = behaviour;
> >> +    return H_SUCCESS;
> >> +}
> >> +
> >> +static target_ulong h_update_dt(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> >> +                                target_ulong opcode, target_ulong *args)
> >> +{
> >> +    target_ulong dt = ppc64_phys_to_real(args[0]);
> >> +    struct fdt_header hdr = { 0 };
> >> +    unsigned cb;
> >> +    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
> >> +
> >> +    cpu_physical_memory_read(dt, &hdr, sizeof(hdr));
> >> +    cb = fdt32_to_cpu(hdr.totalsize);
> >> +
> >> +    if (fdt_check_full(spapr->fdt_blob, cb)) {
> >> +        trace_spapr_update_dt_failed(spapr->fdt_initial_size, cb,
> >> +            fdt32_to_cpu(hdr.magic));
> >> +        return H_PARAMETER;
> >> +    }
> >> +
> >> +    if (!smc->update_dt_enabled) {
> >> +        return H_SUCCESS;
> >> +    }
> >> +
> >> +    g_free(spapr->fdt_blob);
> >> +    spapr->fdt_size = cb;
> >> +    spapr->fdt_blob = g_malloc0(cb);
> >> +    cpu_physical_memory_read(dt, spapr->fdt_blob, cb);
> >> +
> >> +    trace_spapr_update_dt(cb);
> >>  
> >>      return H_SUCCESS;
> >>  }
> >> @@ -1822,6 +1852,8 @@ static void hypercall_register_types(void)
> >>  
> >>      /* ibm,client-architecture-support support */
> >>      spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
> >> +
> >> +    spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt);
> >>  }
> >>  
> >>  type_init(hypercall_register_types)
> >> diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
> >> index dc5e65aee9..4432a5ce74 100644
> >> --- a/hw/ppc/trace-events
> >> +++ b/hw/ppc/trace-events
> >> @@ -22,6 +22,8 @@ spapr_cas_pvr_try(uint32_t pvr) "0x%x"
> >>  spapr_cas_pvr(uint32_t cur_pvr, bool explicit_match, uint32_t new_pvr) "current=0x%x, explicit_match=%u, new=0x%x"
> >>  spapr_h_resize_hpt_prepare(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
> >>  spapr_h_resize_hpt_commit(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
> >> +spapr_update_dt(unsigned cb) "New blob %u bytes"
> >> +spapr_update_dt_failed(unsigned cbold, unsigned cbnew, unsigned magic) "Old blob %u bytes, new blob %u bytes, magic 0x%x"
> >>  
> >>  # hw/ppc/spapr_iommu.c
> >>  spapr_iommu_put(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "liobn=0x%"PRIx64" ioba=0x%"PRIx64" tce=0x%"PRIx64" ret=%"PRId64  
> >   
>
Alexey Kardashevskiy Nov. 13, 2018, 5:31 a.m. UTC | #4
On 12/11/2018 20:05, Greg Kurz wrote:
> On Mon, 12 Nov 2018 15:12:26 +1100
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
>> On 12/11/2018 05:10, Greg Kurz wrote:
>>> Hi Alexey,
>>>
>>> Just a few remarks. See below.
>>>
>>> On Thu,  8 Nov 2018 12:44:06 +1100
>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>   
>>>> SLOF receives a device tree and updates it with various properties
>>>> before switching to the guest kernel and QEMU is not aware of any changes
>>>> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
>>>> sense to pass the SLOF final device tree to QEMU to let it implement
>>>> RTAS related tasks better, such as PCI host bus adapter hotplug.
>>>>
>>>> Specifially, now QEMU can find out the actual XICS phandle (for PHB
>>>> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
>>>> assisted NMI - FWNMI).
>>>>
>>>> This stores the initial DT blob in the sPAPR machine and replaces it
>>>> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
>>>>
>>>> This adds an @update_dt_enabled machine property to allow backward
>>>> migration.
>>>>
>>>> SLOF already has a hypercall since
>>>> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>  include/hw/ppc/spapr.h |  7 ++++++-
>>>>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
>>>>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
>>>>  hw/ppc/trace-events    |  2 ++
>>>>  4 files changed, 68 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index ad4d7cfd97..f5dcaf44cb 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
>>>>  
>>>>      /*< public >*/
>>>>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
>>>> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
>>>>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
>>>>      bool pre_2_10_has_unused_icps;
>>>>      bool legacy_irq_allocation;
>>>> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
>>>>      int vrma_adjust;
>>>>      ssize_t rtas_size;
>>>>      void *rtas_blob;
>>>> +    uint32_t fdt_size;
>>>> +    uint32_t fdt_initial_size;  
>>>
>>> I don't quite see the purpose of fdt_initial_size... it seems to be only
>>> used to print a trace.  
>>
>>
>> Ah, lost in rebase. The purpose was to test if the new device tree has
>> not grown too much.
>>
> 
> Ok, makes sense during development.
> 
>>
>>
>>>   
>>>> +    void *fdt_blob;
>>>>      long kernel_size;
>>>>      bool kernel_le;
>>>>      uint32_t initrd_base;
>>>> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
>>>>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
>>>>  /* Client Architecture support */
>>>>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
>>>> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>>>>  
>>>>  typedef struct sPAPRDeviceTreeUpdateHeader {
>>>>      uint32_t version_id;
>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>> index c08130facb..5e2d4d211c 100644
>>>> --- a/hw/ppc/spapr.c
>>>> +++ b/hw/ppc/spapr.c
>>>> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
>>>>      /* Load the fdt */
>>>>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
>>>>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>>>> -    g_free(fdt);
>>>> +    g_free(spapr->fdt_blob);
>>>> +    spapr->fdt_size = fdt_totalsize(fdt);
>>>> +    spapr->fdt_initial_size = spapr->fdt_size;
>>>> +    spapr->fdt_blob = fdt;  
>>>
>>> Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
>>> both fdt_blob and fdt_size here.  
>>
>>
>> The device tree is built from the reset handler and the idea is that we
>> want to always have some tree in the machine.
>>
> 
> Yes of course, I forgot that we need to keep the fdt to be kept
> somewhere so that we can use it :). My remark has more to do
> with migration actually: the fdt built at reset time is supposed
> to derive from the command line and hot-(un)plugged devices, ie,
> identical in source and destination. This isn't state we should
> migrate IIUC.

Having some device tree all the time seems more convenient than managing
the state when we do have one and when we do not.

It is not a big deal though, I'd wait and see what David thinks. Thanks,



> Maybe add a boolean field that tells that the fdt was updated, use
> it in spapr_dtb_needed() and reset it in spapr_machine_reset() ?
> 
>>
>>
>>>   
>>>>  
>>>>      /* Set up the entry state */
>>>>      spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, fdt_addr);
>>>> @@ -1887,6 +1890,27 @@ static const VMStateDescription vmstate_spapr_irq_map = {
>>>>      },
>>>>  };
>>>>  
>>>> +static bool spapr_dtb_needed(void *opaque)
>>>> +{
>>>> +    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
>>>> +
>>>> +    return smc->update_dt_enabled;  
>>>
>>> This means we always migrate the fdt, even if migration occurs before
>>> SLOF could call KVMPPC_H_UPDATE_DT.
>>>
>>> With spapr->fdt_blob set to NULL on reset, a better check would be:
>>>
>>>     sPAPRMachineState *spapr = SPAPR_MACHINE(opaque);
>>>
>>>     return smc->update_dt_enabled && spapr->fdt_blob;
>>>   
>>>> +}
>>>> +
>>>> +static const VMStateDescription vmstate_spapr_dtb = {
>>>> +    .name = "spapr_dtb",
>>>> +    .version_id = 1,
>>>> +    .minimum_version_id = 1,
>>>> +    .needed = spapr_dtb_needed,
>>>> +    .fields = (VMStateField[]) {
>>>> +        VMSTATE_UINT32(fdt_initial_size, sPAPRMachineState),
>>>> +        VMSTATE_UINT32(fdt_size, sPAPRMachineState),
>>>> +        VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, sPAPRMachineState, 0, NULL,
>>>> +                                     fdt_size),
>>>> +        VMSTATE_END_OF_LIST()
>>>> +    },
>>>> +};
>>>> +
>>>>  static const VMStateDescription vmstate_spapr = {
>>>>      .name = "spapr",
>>>>      .version_id = 3,
>>>> @@ -1915,6 +1939,7 @@ static const VMStateDescription vmstate_spapr = {
>>>>          &vmstate_spapr_cap_sbbc,
>>>>          &vmstate_spapr_cap_ibs,
>>>>          &vmstate_spapr_irq_map,
>>>> +        &vmstate_spapr_dtb,  
>>>
>>> This needs to be rebased.
>>>
>>> <<<<<<<
>>>         &vmstate_spapr_cap_nested_kvm_hv,
>>> =======
>>>         &vmstate_spapr_dtb,  
>>>>>>>>>>  
>>>
>>>
>>> I'll try to find some time to respin the PHB hotplug series and I'll happily
>>> give a try to this patch.  
>>
>>
>> Good :)
>>
>>
>>>   
>>>>          NULL
>>>>      }
>>>>  };
>>>> @@ -3849,6 +3874,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>>>>      hc->unplug = spapr_machine_device_unplug;
>>>>  
>>>>      smc->dr_lmb_enabled = true;
>>>> +    smc->update_dt_enabled = true;
>>>>      mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
>>>>      mc->has_hotpluggable_cpus = true;
>>>>      smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
>>>> @@ -3965,6 +3991,7 @@ static void spapr_machine_3_0_class_options(MachineClass *mc)
>>>>  
>>>>      smc->legacy_irq_allocation = true;
>>>>      smc->irq = &spapr_irq_xics_legacy;
>>>> +    smc->update_dt_enabled = false;
>>>>  }
>>>>  
>>>>  DEFINE_SPAPR_MACHINE(3_0, "3.0", false);
>>>> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
>>>> index ae913d070f..d5833f3f8d 100644
>>>> --- a/hw/ppc/spapr_hcall.c
>>>> +++ b/hw/ppc/spapr_hcall.c
>>>> @@ -1717,6 +1717,36 @@ static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
>>>>  
>>>>      args[0] = characteristics;
>>>>      args[1] = behaviour;
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +static target_ulong h_update_dt(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>>> +                                target_ulong opcode, target_ulong *args)
>>>> +{
>>>> +    target_ulong dt = ppc64_phys_to_real(args[0]);
>>>> +    struct fdt_header hdr = { 0 };
>>>> +    unsigned cb;
>>>> +    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
>>>> +
>>>> +    cpu_physical_memory_read(dt, &hdr, sizeof(hdr));
>>>> +    cb = fdt32_to_cpu(hdr.totalsize);
>>>> +
>>>> +    if (fdt_check_full(spapr->fdt_blob, cb)) {
>>>> +        trace_spapr_update_dt_failed(spapr->fdt_initial_size, cb,
>>>> +            fdt32_to_cpu(hdr.magic));
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +
>>>> +    if (!smc->update_dt_enabled) {
>>>> +        return H_SUCCESS;
>>>> +    }
>>>> +
>>>> +    g_free(spapr->fdt_blob);
>>>> +    spapr->fdt_size = cb;
>>>> +    spapr->fdt_blob = g_malloc0(cb);
>>>> +    cpu_physical_memory_read(dt, spapr->fdt_blob, cb);
>>>> +
>>>> +    trace_spapr_update_dt(cb);
>>>>  
>>>>      return H_SUCCESS;
>>>>  }
>>>> @@ -1822,6 +1852,8 @@ static void hypercall_register_types(void)
>>>>  
>>>>      /* ibm,client-architecture-support support */
>>>>      spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
>>>> +
>>>> +    spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt);
>>>>  }
>>>>  
>>>>  type_init(hypercall_register_types)
>>>> diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
>>>> index dc5e65aee9..4432a5ce74 100644
>>>> --- a/hw/ppc/trace-events
>>>> +++ b/hw/ppc/trace-events
>>>> @@ -22,6 +22,8 @@ spapr_cas_pvr_try(uint32_t pvr) "0x%x"
>>>>  spapr_cas_pvr(uint32_t cur_pvr, bool explicit_match, uint32_t new_pvr) "current=0x%x, explicit_match=%u, new=0x%x"
>>>>  spapr_h_resize_hpt_prepare(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
>>>>  spapr_h_resize_hpt_commit(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
>>>> +spapr_update_dt(unsigned cb) "New blob %u bytes"
>>>> +spapr_update_dt_failed(unsigned cbold, unsigned cbnew, unsigned magic) "Old blob %u bytes, new blob %u bytes, magic 0x%x"
>>>>  
>>>>  # hw/ppc/spapr_iommu.c
>>>>  spapr_iommu_put(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "liobn=0x%"PRIx64" ioba=0x%"PRIx64" tce=0x%"PRIx64" ret=%"PRId64  
>>>   
>>
>
David Gibson Dec. 10, 2018, 6:20 a.m. UTC | #5
On Mon, Nov 12, 2018 at 03:12:26PM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 12/11/2018 05:10, Greg Kurz wrote:
> > Hi Alexey,
> > 
> > Just a few remarks. See below.
> > 
> > On Thu,  8 Nov 2018 12:44:06 +1100
> > Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> > 
> >> SLOF receives a device tree and updates it with various properties
> >> before switching to the guest kernel and QEMU is not aware of any changes
> >> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
> >> sense to pass the SLOF final device tree to QEMU to let it implement
> >> RTAS related tasks better, such as PCI host bus adapter hotplug.
> >>
> >> Specifially, now QEMU can find out the actual XICS phandle (for PHB
> >> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
> >> assisted NMI - FWNMI).
> >>
> >> This stores the initial DT blob in the sPAPR machine and replaces it
> >> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
> >>
> >> This adds an @update_dt_enabled machine property to allow backward
> >> migration.
> >>
> >> SLOF already has a hypercall since
> >> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>  include/hw/ppc/spapr.h |  7 ++++++-
> >>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
> >>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
> >>  hw/ppc/trace-events    |  2 ++
> >>  4 files changed, 68 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >> index ad4d7cfd97..f5dcaf44cb 100644
> >> --- a/include/hw/ppc/spapr.h
> >> +++ b/include/hw/ppc/spapr.h
> >> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
> >>  
> >>      /*< public >*/
> >>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
> >> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
> >>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
> >>      bool pre_2_10_has_unused_icps;
> >>      bool legacy_irq_allocation;
> >> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
> >>      int vrma_adjust;
> >>      ssize_t rtas_size;
> >>      void *rtas_blob;
> >> +    uint32_t fdt_size;
> >> +    uint32_t fdt_initial_size;
> > 
> > I don't quite see the purpose of fdt_initial_size... it seems to be only
> > used to print a trace.
> 
> 
> Ah, lost in rebase. The purpose was to test if the new device tree has
> not grown too much.
> 
> 
> 
> > 
> >> +    void *fdt_blob;
> >>      long kernel_size;
> >>      bool kernel_le;
> >>      uint32_t initrd_base;
> >> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
> >>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
> >>  /* Client Architecture support */
> >>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> >> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
> >> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> >> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> >>  
> >>  typedef struct sPAPRDeviceTreeUpdateHeader {
> >>      uint32_t version_id;
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index c08130facb..5e2d4d211c 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
> >>      /* Load the fdt */
> >>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> >>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> >> -    g_free(fdt);
> >> +    g_free(spapr->fdt_blob);
> >> +    spapr->fdt_size = fdt_totalsize(fdt);
> >> +    spapr->fdt_initial_size = spapr->fdt_size;
> >> +    spapr->fdt_blob = fdt;
> > 
> > Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
> > both fdt_blob and fdt_size here.
> 
> The device tree is built from the reset handler and the idea is that we
> want to always have some tree in the machine.

Yes, I think the approach here is fine.  Otherwise when we want to
look up the current fdt state in RTAS calls or whatever we'd always
have to do
	if (fdt_blob)
		look up that
	else
		look up qemu created fdt.

Incidentally 'fdt' and 'fdt_blob' names do a terrible job of
distinguishing what the difference is.  Renaming fdt to fdt_initial
(to match fdt_initial_size) and fdt_blob to fdt should make that
clearer.
Greg Kurz Dec. 10, 2018, 9:30 a.m. UTC | #6
On Mon, 10 Dec 2018 17:20:43 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Mon, Nov 12, 2018 at 03:12:26PM +1100, Alexey Kardashevskiy wrote:
> > 
> > 
> > On 12/11/2018 05:10, Greg Kurz wrote:  
> > > Hi Alexey,
> > > 
> > > Just a few remarks. See below.
> > > 
> > > On Thu,  8 Nov 2018 12:44:06 +1100
> > > Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> > >   
> > >> SLOF receives a device tree and updates it with various properties
> > >> before switching to the guest kernel and QEMU is not aware of any changes
> > >> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
> > >> sense to pass the SLOF final device tree to QEMU to let it implement
> > >> RTAS related tasks better, such as PCI host bus adapter hotplug.
> > >>
> > >> Specifially, now QEMU can find out the actual XICS phandle (for PHB
> > >> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
> > >> assisted NMI - FWNMI).
> > >>
> > >> This stores the initial DT blob in the sPAPR machine and replaces it
> > >> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
> > >>
> > >> This adds an @update_dt_enabled machine property to allow backward
> > >> migration.
> > >>
> > >> SLOF already has a hypercall since
> > >> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
> > >>
> > >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > >> ---
> > >>  include/hw/ppc/spapr.h |  7 ++++++-
> > >>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
> > >>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
> > >>  hw/ppc/trace-events    |  2 ++
> > >>  4 files changed, 68 insertions(+), 2 deletions(-)
> > >>
> > >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > >> index ad4d7cfd97..f5dcaf44cb 100644
> > >> --- a/include/hw/ppc/spapr.h
> > >> +++ b/include/hw/ppc/spapr.h
> > >> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
> > >>  
> > >>      /*< public >*/
> > >>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
> > >> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
> > >>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
> > >>      bool pre_2_10_has_unused_icps;
> > >>      bool legacy_irq_allocation;
> > >> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
> > >>      int vrma_adjust;
> > >>      ssize_t rtas_size;
> > >>      void *rtas_blob;
> > >> +    uint32_t fdt_size;
> > >> +    uint32_t fdt_initial_size;  
> > > 
> > > I don't quite see the purpose of fdt_initial_size... it seems to be only
> > > used to print a trace.  
> > 
> > 
> > Ah, lost in rebase. The purpose was to test if the new device tree has
> > not grown too much.
> > 
> > 
> >   
> > >   
> > >> +    void *fdt_blob;
> > >>      long kernel_size;
> > >>      bool kernel_le;
> > >>      uint32_t initrd_base;
> > >> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
> > >>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
> > >>  /* Client Architecture support */
> > >>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> > >> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
> > >> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> > >> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> > >>  
> > >>  typedef struct sPAPRDeviceTreeUpdateHeader {
> > >>      uint32_t version_id;
> > >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > >> index c08130facb..5e2d4d211c 100644
> > >> --- a/hw/ppc/spapr.c
> > >> +++ b/hw/ppc/spapr.c
> > >> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
> > >>      /* Load the fdt */
> > >>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> > >>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> > >> -    g_free(fdt);
> > >> +    g_free(spapr->fdt_blob);
> > >> +    spapr->fdt_size = fdt_totalsize(fdt);
> > >> +    spapr->fdt_initial_size = spapr->fdt_size;
> > >> +    spapr->fdt_blob = fdt;  
> > > 
> > > Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
> > > both fdt_blob and fdt_size here.  
> > 
> > The device tree is built from the reset handler and the idea is that we
> > want to always have some tree in the machine.  
> 
> Yes, I think the approach here is fine.  Otherwise when we want to
> look up the current fdt state in RTAS calls or whatever we'd always
> have to do
> 	if (fdt_blob)
> 		look up that
> 	else
> 		look up qemu created fdt.
> 

No. We only have one fdt blob: the initial one, I'd rather
call reset time one, or the updated one.

> Incidentally 'fdt' and 'fdt_blob' names do a terrible job of
> distinguishing what the difference is.  Renaming fdt to fdt_initial
> (to match fdt_initial_size) and fdt_blob to fdt should make that
> clearer.
> 

As mentioned earlier in this thread, spapr->fdt_initial_size is only used
for tracing if the received fdt blob fails fdt_check_full()...

$ git grep -H fdt_initial_size
hw/ppc/spapr.c:    spapr->fdt_initial_size = spapr->fdt_size;
hw/ppc/spapr.c:        VMSTATE_UINT32(fdt_initial_size, sPAPRMachineState),
hw/ppc/spapr_hcall.c:        trace_spapr_update_dt_failed(spapr->fdt_initial_size, cb,
include/hw/ppc/spapr.h:    uint32_t fdt_initial_size;

Not sure it is helpful, and anyway, it is expected to be the same in source
and destination, so why put it in the migration stream ?

The only case where we want to migrate something is when h_update_dt() has
succeeded, ie, the guest passed a valid DT blob. This implies that its
size isn't 0, otherwise fdt_check_full() would return -FDT_ERR_TRUNCATED.

I would suggest rather to:

- completely drop spapr->fdt_initial_size
- clear spapr->fdt_size at machine reset
- migrate if spapr->fdt_size is not zero

Also, I've just realized another problem... nothing prevents a malicious
guest to pass an insanely great size to h_update_dt, which would cause
g_malloc0() to abort... The passed size should be checked against
FDT_MAX_SIZE.

Cheers,

--
Greg
Alexey Kardashevskiy Dec. 11, 2018, 3:36 a.m. UTC | #7
On 10/12/2018 17:20, David Gibson wrote:
> On Mon, Nov 12, 2018 at 03:12:26PM +1100, Alexey Kardashevskiy wrote:
>>
>>
>> On 12/11/2018 05:10, Greg Kurz wrote:
>>> Hi Alexey,
>>>
>>> Just a few remarks. See below.
>>>
>>> On Thu,  8 Nov 2018 12:44:06 +1100
>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>
>>>> SLOF receives a device tree and updates it with various properties
>>>> before switching to the guest kernel and QEMU is not aware of any changes
>>>> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
>>>> sense to pass the SLOF final device tree to QEMU to let it implement
>>>> RTAS related tasks better, such as PCI host bus adapter hotplug.
>>>>
>>>> Specifially, now QEMU can find out the actual XICS phandle (for PHB
>>>> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
>>>> assisted NMI - FWNMI).
>>>>
>>>> This stores the initial DT blob in the sPAPR machine and replaces it
>>>> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
>>>>
>>>> This adds an @update_dt_enabled machine property to allow backward
>>>> migration.
>>>>
>>>> SLOF already has a hypercall since
>>>> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>  include/hw/ppc/spapr.h |  7 ++++++-
>>>>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
>>>>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
>>>>  hw/ppc/trace-events    |  2 ++
>>>>  4 files changed, 68 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index ad4d7cfd97..f5dcaf44cb 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
>>>>  
>>>>      /*< public >*/
>>>>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
>>>> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
>>>>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
>>>>      bool pre_2_10_has_unused_icps;
>>>>      bool legacy_irq_allocation;
>>>> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
>>>>      int vrma_adjust;
>>>>      ssize_t rtas_size;
>>>>      void *rtas_blob;
>>>> +    uint32_t fdt_size;
>>>> +    uint32_t fdt_initial_size;
>>>
>>> I don't quite see the purpose of fdt_initial_size... it seems to be only
>>> used to print a trace.
>>
>>
>> Ah, lost in rebase. The purpose was to test if the new device tree has
>> not grown too much.
>>
>>
>>
>>>
>>>> +    void *fdt_blob;
>>>>      long kernel_size;
>>>>      bool kernel_le;
>>>>      uint32_t initrd_base;
>>>> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
>>>>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
>>>>  /* Client Architecture support */
>>>>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
>>>> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>>>>  
>>>>  typedef struct sPAPRDeviceTreeUpdateHeader {
>>>>      uint32_t version_id;
>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>> index c08130facb..5e2d4d211c 100644
>>>> --- a/hw/ppc/spapr.c
>>>> +++ b/hw/ppc/spapr.c
>>>> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
>>>>      /* Load the fdt */
>>>>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
>>>>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>>>> -    g_free(fdt);
>>>> +    g_free(spapr->fdt_blob);
>>>> +    spapr->fdt_size = fdt_totalsize(fdt);
>>>> +    spapr->fdt_initial_size = spapr->fdt_size;
>>>> +    spapr->fdt_blob = fdt;
>>>
>>> Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
>>> both fdt_blob and fdt_size here.
>>
>> The device tree is built from the reset handler and the idea is that we
>> want to always have some tree in the machine.
> 
> Yes, I think the approach here is fine.  Otherwise when we want to
> look up the current fdt state in RTAS calls or whatever we'd always
> have to do
> 	if (fdt_blob)
> 		look up that
> 	else
> 		look up qemu created fdt.
> 
> Incidentally 'fdt' and 'fdt_blob' names do a terrible job of
> distinguishing what the difference is.  Renaming fdt to fdt_initial
> (to match fdt_initial_size) and fdt_blob to fdt should make that
> clearer.

There is just one fdt in sPAPRMachineState - it is fdt_blob as it is
flattened. The "fdt" symbol above is local to spapr_machine_reset() and
when the tree is built - it is stored in fdt_blob.
Alexey Kardashevskiy Dec. 11, 2018, 3:53 a.m. UTC | #8
On 10/12/2018 20:30, Greg Kurz wrote:
> On Mon, 10 Dec 2018 17:20:43 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
>> On Mon, Nov 12, 2018 at 03:12:26PM +1100, Alexey Kardashevskiy wrote:
>>>
>>>
>>> On 12/11/2018 05:10, Greg Kurz wrote:  
>>>> Hi Alexey,
>>>>
>>>> Just a few remarks. See below.
>>>>
>>>> On Thu,  8 Nov 2018 12:44:06 +1100
>>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>>   
>>>>> SLOF receives a device tree and updates it with various properties
>>>>> before switching to the guest kernel and QEMU is not aware of any changes
>>>>> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
>>>>> sense to pass the SLOF final device tree to QEMU to let it implement
>>>>> RTAS related tasks better, such as PCI host bus adapter hotplug.
>>>>>
>>>>> Specifially, now QEMU can find out the actual XICS phandle (for PHB
>>>>> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
>>>>> assisted NMI - FWNMI).
>>>>>
>>>>> This stores the initial DT blob in the sPAPR machine and replaces it
>>>>> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
>>>>>
>>>>> This adds an @update_dt_enabled machine property to allow backward
>>>>> migration.
>>>>>
>>>>> SLOF already has a hypercall since
>>>>> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
>>>>>
>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>> ---
>>>>>  include/hw/ppc/spapr.h |  7 ++++++-
>>>>>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
>>>>>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
>>>>>  hw/ppc/trace-events    |  2 ++
>>>>>  4 files changed, 68 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>>> index ad4d7cfd97..f5dcaf44cb 100644
>>>>> --- a/include/hw/ppc/spapr.h
>>>>> +++ b/include/hw/ppc/spapr.h
>>>>> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
>>>>>  
>>>>>      /*< public >*/
>>>>>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
>>>>> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
>>>>>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
>>>>>      bool pre_2_10_has_unused_icps;
>>>>>      bool legacy_irq_allocation;
>>>>> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
>>>>>      int vrma_adjust;
>>>>>      ssize_t rtas_size;
>>>>>      void *rtas_blob;
>>>>> +    uint32_t fdt_size;
>>>>> +    uint32_t fdt_initial_size;  
>>>>
>>>> I don't quite see the purpose of fdt_initial_size... it seems to be only
>>>> used to print a trace.  
>>>
>>>
>>> Ah, lost in rebase. The purpose was to test if the new device tree has
>>> not grown too much.
>>>
>>>
>>>   
>>>>   
>>>>> +    void *fdt_blob;
>>>>>      long kernel_size;
>>>>>      bool kernel_le;
>>>>>      uint32_t initrd_base;
>>>>> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
>>>>>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
>>>>>  /* Client Architecture support */
>>>>>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>>>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
>>>>> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>>>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>>>>>  
>>>>>  typedef struct sPAPRDeviceTreeUpdateHeader {
>>>>>      uint32_t version_id;
>>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>>> index c08130facb..5e2d4d211c 100644
>>>>> --- a/hw/ppc/spapr.c
>>>>> +++ b/hw/ppc/spapr.c
>>>>> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
>>>>>      /* Load the fdt */
>>>>>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
>>>>>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>>>>> -    g_free(fdt);
>>>>> +    g_free(spapr->fdt_blob);
>>>>> +    spapr->fdt_size = fdt_totalsize(fdt);
>>>>> +    spapr->fdt_initial_size = spapr->fdt_size;
>>>>> +    spapr->fdt_blob = fdt;  
>>>>
>>>> Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
>>>> both fdt_blob and fdt_size here.  
>>>
>>> The device tree is built from the reset handler and the idea is that we
>>> want to always have some tree in the machine.  
>>
>> Yes, I think the approach here is fine.  Otherwise when we want to
>> look up the current fdt state in RTAS calls or whatever we'd always
>> have to do
>> 	if (fdt_blob)
>> 		look up that
>> 	else
>> 		look up qemu created fdt.
>>
> 
> No. We only have one fdt blob: the initial one, I'd rather
> call reset time one, or the updated one.

There is one fdt in the machine, always. Either initial or from cas.



>> Incidentally 'fdt' and 'fdt_blob' names do a terrible job of
>> distinguishing what the difference is.  Renaming fdt to fdt_initial
>> (to match fdt_initial_size) and fdt_blob to fdt should make that
>> clearer.
>>
> 
> As mentioned earlier in this thread, spapr->fdt_initial_size is only used
> for tracing if the received fdt blob fails fdt_check_full()...
> 
> $ git grep -H fdt_initial_size
> hw/ppc/spapr.c:    spapr->fdt_initial_size = spapr->fdt_size;
> hw/ppc/spapr.c:        VMSTATE_UINT32(fdt_initial_size, sPAPRMachineState),
> hw/ppc/spapr_hcall.c:        trace_spapr_update_dt_failed(spapr->fdt_initial_size, cb,
> include/hw/ppc/spapr.h:    uint32_t fdt_initial_size;
> 
> Not sure it is helpful, and anyway, it is expected to be the same in source
> and destination, so why put it in the migration stream ?


Well, we do build the fdt anyway even when receive migration but we do
not have to and yes we can expect the fdt on the destination to be of
the same size since it is the same command line, it is just guessing and
expecting vs. knowing and I prefer the latter as the reset time fdt and
migration source fdt might have different size because of
host-model/host-serial/slot-label/similar properties.


> The only case where we want to migrate something is when h_update_dt() has
> succeeded, ie, the guest passed a valid DT blob. This implies that its
> size isn't 0, otherwise fdt_check_full() would return -FDT_ERR_TRUNCATED.
> 
> I would suggest rather to:
> 
> - completely drop spapr->fdt_initial_size
> - clear spapr->fdt_size at machine reset
> - migrate if spapr->fdt_size is not zero
> 
> Also, I've just realized another problem... nothing prevents a malicious
> guest to pass an insanely great size to h_update_dt, which would cause
> g_malloc0() to abort... The passed size should be checked against
> FDT_MAX_SIZE.

Good point. Just noticed - as posted, the checker actually checks the
reset time tree, not the updated one, my bad :)
Greg Kurz Dec. 11, 2018, 9:55 a.m. UTC | #9
On Tue, 11 Dec 2018 14:53:32 +1100
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> On 10/12/2018 20:30, Greg Kurz wrote:
> > On Mon, 10 Dec 2018 17:20:43 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> >> On Mon, Nov 12, 2018 at 03:12:26PM +1100, Alexey Kardashevskiy wrote:  
> >>>
> >>>
> >>> On 12/11/2018 05:10, Greg Kurz wrote:    
> >>>> Hi Alexey,
> >>>>
> >>>> Just a few remarks. See below.
> >>>>
> >>>> On Thu,  8 Nov 2018 12:44:06 +1100
> >>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >>>>     
> >>>>> SLOF receives a device tree and updates it with various properties
> >>>>> before switching to the guest kernel and QEMU is not aware of any changes
> >>>>> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
> >>>>> sense to pass the SLOF final device tree to QEMU to let it implement
> >>>>> RTAS related tasks better, such as PCI host bus adapter hotplug.
> >>>>>
> >>>>> Specifially, now QEMU can find out the actual XICS phandle (for PHB
> >>>>> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
> >>>>> assisted NMI - FWNMI).
> >>>>>
> >>>>> This stores the initial DT blob in the sPAPR machine and replaces it
> >>>>> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
> >>>>>
> >>>>> This adds an @update_dt_enabled machine property to allow backward
> >>>>> migration.
> >>>>>
> >>>>> SLOF already has a hypercall since
> >>>>> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
> >>>>>
> >>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>> ---
> >>>>>  include/hw/ppc/spapr.h |  7 ++++++-
> >>>>>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
> >>>>>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
> >>>>>  hw/ppc/trace-events    |  2 ++
> >>>>>  4 files changed, 68 insertions(+), 2 deletions(-)
> >>>>>
> >>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >>>>> index ad4d7cfd97..f5dcaf44cb 100644
> >>>>> --- a/include/hw/ppc/spapr.h
> >>>>> +++ b/include/hw/ppc/spapr.h
> >>>>> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
> >>>>>  
> >>>>>      /*< public >*/
> >>>>>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
> >>>>> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
> >>>>>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
> >>>>>      bool pre_2_10_has_unused_icps;
> >>>>>      bool legacy_irq_allocation;
> >>>>> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
> >>>>>      int vrma_adjust;
> >>>>>      ssize_t rtas_size;
> >>>>>      void *rtas_blob;
> >>>>> +    uint32_t fdt_size;
> >>>>> +    uint32_t fdt_initial_size;    
> >>>>
> >>>> I don't quite see the purpose of fdt_initial_size... it seems to be only
> >>>> used to print a trace.    
> >>>
> >>>
> >>> Ah, lost in rebase. The purpose was to test if the new device tree has
> >>> not grown too much.
> >>>
> >>>
> >>>     
> >>>>     
> >>>>> +    void *fdt_blob;
> >>>>>      long kernel_size;
> >>>>>      bool kernel_le;
> >>>>>      uint32_t initrd_base;
> >>>>> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
> >>>>>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
> >>>>>  /* Client Architecture support */
> >>>>>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> >>>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
> >>>>> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> >>>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> >>>>>  
> >>>>>  typedef struct sPAPRDeviceTreeUpdateHeader {
> >>>>>      uint32_t version_id;
> >>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>>>> index c08130facb..5e2d4d211c 100644
> >>>>> --- a/hw/ppc/spapr.c
> >>>>> +++ b/hw/ppc/spapr.c
> >>>>> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
> >>>>>      /* Load the fdt */
> >>>>>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> >>>>>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> >>>>> -    g_free(fdt);
> >>>>> +    g_free(spapr->fdt_blob);
> >>>>> +    spapr->fdt_size = fdt_totalsize(fdt);
> >>>>> +    spapr->fdt_initial_size = spapr->fdt_size;
> >>>>> +    spapr->fdt_blob = fdt;    
> >>>>
> >>>> Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
> >>>> both fdt_blob and fdt_size here.    
> >>>
> >>> The device tree is built from the reset handler and the idea is that we
> >>> want to always have some tree in the machine.    
> >>
> >> Yes, I think the approach here is fine.  Otherwise when we want to
> >> look up the current fdt state in RTAS calls or whatever we'd always
> >> have to do
> >> 	if (fdt_blob)
> >> 		look up that
> >> 	else
> >> 		look up qemu created fdt.
> >>  
> > 
> > No. We only have one fdt blob: the initial one, I'd rather
> > call reset time one, or the updated one.  
> 
> There is one fdt in the machine, always. Either initial or from cas.
> 

Yeah, reset time fdt is either the initial one, either cas... and I'm now
wandering what happens if migration occurs between cas that sets cas_reboot
and the corresponding reset. With the current code base, I have the impression
that the destination will redo the full cas+cas_reboot cycle after restart or
am I missing something ? 

> 
> 
> >> Incidentally 'fdt' and 'fdt_blob' names do a terrible job of
> >> distinguishing what the difference is.  Renaming fdt to fdt_initial
> >> (to match fdt_initial_size) and fdt_blob to fdt should make that
> >> clearer.
> >>  
> > 
> > As mentioned earlier in this thread, spapr->fdt_initial_size is only used
> > for tracing if the received fdt blob fails fdt_check_full()...
> > 
> > $ git grep -H fdt_initial_size
> > hw/ppc/spapr.c:    spapr->fdt_initial_size = spapr->fdt_size;
> > hw/ppc/spapr.c:        VMSTATE_UINT32(fdt_initial_size, sPAPRMachineState),
> > hw/ppc/spapr_hcall.c:        trace_spapr_update_dt_failed(spapr->fdt_initial_size, cb,
> > include/hw/ppc/spapr.h:    uint32_t fdt_initial_size;
> > 
> > Not sure it is helpful, and anyway, it is expected to be the same in source
> > and destination, so why put it in the migration stream ?  
> 
> 
> Well, we do build the fdt anyway even when receive migration but we do
> not have to and yes we can expect the fdt on the destination to be of
> the same size since it is the same command line, it is just guessing and
> expecting vs. knowing and I prefer the latter as the reset time fdt and
> migration source fdt might have different size because of
> host-model/host-serial/slot-label/similar properties.
> 

Right but I still don't see the usefulness of fdt_initial_size...

> 
> > The only case where we want to migrate something is when h_update_dt() has
> > succeeded, ie, the guest passed a valid DT blob. This implies that its
> > size isn't 0, otherwise fdt_check_full() would return -FDT_ERR_TRUNCATED.
> > 
> > I would suggest rather to:
> > 
> > - completely drop spapr->fdt_initial_size
> > - clear spapr->fdt_size at machine reset
> > - migrate if spapr->fdt_size is not zero
> > 
> > Also, I've just realized another problem... nothing prevents a malicious
> > guest to pass an insanely great size to h_update_dt, which would cause
> > g_malloc0() to abort... The passed size should be checked against
> > FDT_MAX_SIZE.  
> 
> Good point. Just noticed - as posted, the checker actually checks the
> reset time tree, not the updated one, my bad :)
> 
> 
>
David Gibson Dec. 12, 2018, 12:20 a.m. UTC | #10
On Tue, Dec 11, 2018 at 02:36:09PM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 10/12/2018 17:20, David Gibson wrote:
> > On Mon, Nov 12, 2018 at 03:12:26PM +1100, Alexey Kardashevskiy wrote:
> >>
> >>
> >> On 12/11/2018 05:10, Greg Kurz wrote:
> >>> Hi Alexey,
> >>>
> >>> Just a few remarks. See below.
> >>>
> >>> On Thu,  8 Nov 2018 12:44:06 +1100
> >>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >>>
> >>>> SLOF receives a device tree and updates it with various properties
> >>>> before switching to the guest kernel and QEMU is not aware of any changes
> >>>> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
> >>>> sense to pass the SLOF final device tree to QEMU to let it implement
> >>>> RTAS related tasks better, such as PCI host bus adapter hotplug.
> >>>>
> >>>> Specifially, now QEMU can find out the actual XICS phandle (for PHB
> >>>> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
> >>>> assisted NMI - FWNMI).
> >>>>
> >>>> This stores the initial DT blob in the sPAPR machine and replaces it
> >>>> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
> >>>>
> >>>> This adds an @update_dt_enabled machine property to allow backward
> >>>> migration.
> >>>>
> >>>> SLOF already has a hypercall since
> >>>> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
> >>>>
> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>> ---
> >>>>  include/hw/ppc/spapr.h |  7 ++++++-
> >>>>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
> >>>>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
> >>>>  hw/ppc/trace-events    |  2 ++
> >>>>  4 files changed, 68 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >>>> index ad4d7cfd97..f5dcaf44cb 100644
> >>>> --- a/include/hw/ppc/spapr.h
> >>>> +++ b/include/hw/ppc/spapr.h
> >>>> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
> >>>>  
> >>>>      /*< public >*/
> >>>>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
> >>>> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
> >>>>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
> >>>>      bool pre_2_10_has_unused_icps;
> >>>>      bool legacy_irq_allocation;
> >>>> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
> >>>>      int vrma_adjust;
> >>>>      ssize_t rtas_size;
> >>>>      void *rtas_blob;
> >>>> +    uint32_t fdt_size;
> >>>> +    uint32_t fdt_initial_size;
> >>>
> >>> I don't quite see the purpose of fdt_initial_size... it seems to be only
> >>> used to print a trace.
> >>
> >>
> >> Ah, lost in rebase. The purpose was to test if the new device tree has
> >> not grown too much.
> >>
> >>
> >>
> >>>
> >>>> +    void *fdt_blob;
> >>>>      long kernel_size;
> >>>>      bool kernel_le;
> >>>>      uint32_t initrd_base;
> >>>> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
> >>>>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
> >>>>  /* Client Architecture support */
> >>>>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> >>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
> >>>> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> >>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> >>>>  
> >>>>  typedef struct sPAPRDeviceTreeUpdateHeader {
> >>>>      uint32_t version_id;
> >>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>>> index c08130facb..5e2d4d211c 100644
> >>>> --- a/hw/ppc/spapr.c
> >>>> +++ b/hw/ppc/spapr.c
> >>>> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
> >>>>      /* Load the fdt */
> >>>>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> >>>>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> >>>> -    g_free(fdt);
> >>>> +    g_free(spapr->fdt_blob);
> >>>> +    spapr->fdt_size = fdt_totalsize(fdt);
> >>>> +    spapr->fdt_initial_size = spapr->fdt_size;
> >>>> +    spapr->fdt_blob = fdt;
> >>>
> >>> Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
> >>> both fdt_blob and fdt_size here.
> >>
> >> The device tree is built from the reset handler and the idea is that we
> >> want to always have some tree in the machine.
> > 
> > Yes, I think the approach here is fine.  Otherwise when we want to
> > look up the current fdt state in RTAS calls or whatever we'd always
> > have to do
> > 	if (fdt_blob)
> > 		look up that
> > 	else
> > 		look up qemu created fdt.
> > 
> > Incidentally 'fdt' and 'fdt_blob' names do a terrible job of
> > distinguishing what the difference is.  Renaming fdt to fdt_initial
> > (to match fdt_initial_size) and fdt_blob to fdt should make that
> > clearer.
> 
> There is just one fdt in sPAPRMachineState - it is fdt_blob as it is
> flattened. The "fdt" symbol above is local to spapr_machine_reset() and
> when the tree is built - it is stored in fdt_blob.

Uh, sorry, I misread.  I'll look more carefully at the next spin.
David Gibson Dec. 12, 2018, 12:29 a.m. UTC | #11
On Tue, Dec 11, 2018 at 10:55:59AM +0100, Greg Kurz wrote:
> On Tue, 11 Dec 2018 14:53:32 +1100
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
> > On 10/12/2018 20:30, Greg Kurz wrote:
> > > On Mon, 10 Dec 2018 17:20:43 +1100
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > >> On Mon, Nov 12, 2018 at 03:12:26PM +1100, Alexey Kardashevskiy wrote:  
> > >>>
> > >>>
> > >>> On 12/11/2018 05:10, Greg Kurz wrote:    
> > >>>> Hi Alexey,
> > >>>>
> > >>>> Just a few remarks. See below.
> > >>>>
> > >>>> On Thu,  8 Nov 2018 12:44:06 +1100
> > >>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> > >>>>     
> > >>>>> SLOF receives a device tree and updates it with various properties
> > >>>>> before switching to the guest kernel and QEMU is not aware of any changes
> > >>>>> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
> > >>>>> sense to pass the SLOF final device tree to QEMU to let it implement
> > >>>>> RTAS related tasks better, such as PCI host bus adapter hotplug.
> > >>>>>
> > >>>>> Specifially, now QEMU can find out the actual XICS phandle (for PHB
> > >>>>> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
> > >>>>> assisted NMI - FWNMI).
> > >>>>>
> > >>>>> This stores the initial DT blob in the sPAPR machine and replaces it
> > >>>>> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
> > >>>>>
> > >>>>> This adds an @update_dt_enabled machine property to allow backward
> > >>>>> migration.
> > >>>>>
> > >>>>> SLOF already has a hypercall since
> > >>>>> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
> > >>>>>
> > >>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > >>>>> ---
> > >>>>>  include/hw/ppc/spapr.h |  7 ++++++-
> > >>>>>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
> > >>>>>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
> > >>>>>  hw/ppc/trace-events    |  2 ++
> > >>>>>  4 files changed, 68 insertions(+), 2 deletions(-)
> > >>>>>
> > >>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > >>>>> index ad4d7cfd97..f5dcaf44cb 100644
> > >>>>> --- a/include/hw/ppc/spapr.h
> > >>>>> +++ b/include/hw/ppc/spapr.h
> > >>>>> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
> > >>>>>  
> > >>>>>      /*< public >*/
> > >>>>>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
> > >>>>> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
> > >>>>>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
> > >>>>>      bool pre_2_10_has_unused_icps;
> > >>>>>      bool legacy_irq_allocation;
> > >>>>> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
> > >>>>>      int vrma_adjust;
> > >>>>>      ssize_t rtas_size;
> > >>>>>      void *rtas_blob;
> > >>>>> +    uint32_t fdt_size;
> > >>>>> +    uint32_t fdt_initial_size;    
> > >>>>
> > >>>> I don't quite see the purpose of fdt_initial_size... it seems to be only
> > >>>> used to print a trace.    
> > >>>
> > >>>
> > >>> Ah, lost in rebase. The purpose was to test if the new device tree has
> > >>> not grown too much.
> > >>>
> > >>>
> > >>>     
> > >>>>     
> > >>>>> +    void *fdt_blob;
> > >>>>>      long kernel_size;
> > >>>>>      bool kernel_le;
> > >>>>>      uint32_t initrd_base;
> > >>>>> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
> > >>>>>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
> > >>>>>  /* Client Architecture support */
> > >>>>>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> > >>>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
> > >>>>> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> > >>>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> > >>>>>  
> > >>>>>  typedef struct sPAPRDeviceTreeUpdateHeader {
> > >>>>>      uint32_t version_id;
> > >>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > >>>>> index c08130facb..5e2d4d211c 100644
> > >>>>> --- a/hw/ppc/spapr.c
> > >>>>> +++ b/hw/ppc/spapr.c
> > >>>>> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
> > >>>>>      /* Load the fdt */
> > >>>>>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> > >>>>>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> > >>>>> -    g_free(fdt);
> > >>>>> +    g_free(spapr->fdt_blob);
> > >>>>> +    spapr->fdt_size = fdt_totalsize(fdt);
> > >>>>> +    spapr->fdt_initial_size = spapr->fdt_size;
> > >>>>> +    spapr->fdt_blob = fdt;    
> > >>>>
> > >>>> Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
> > >>>> both fdt_blob and fdt_size here.    
> > >>>
> > >>> The device tree is built from the reset handler and the idea is that we
> > >>> want to always have some tree in the machine.    
> > >>
> > >> Yes, I think the approach here is fine.  Otherwise when we want to
> > >> look up the current fdt state in RTAS calls or whatever we'd always
> > >> have to do
> > >> 	if (fdt_blob)
> > >> 		look up that
> > >> 	else
> > >> 		look up qemu created fdt.
> > >>  
> > > 
> > > No. We only have one fdt blob: the initial one, I'd rather
> > > call reset time one, or the updated one.  
> > 
> > There is one fdt in the machine, always. Either initial or from cas.
> 
> Yeah, reset time fdt is either the initial one, either cas... and I'm now
> wandering what happens if migration occurs between cas that sets cas_reboot
> and the corresponding reset. With the current code base, I have the impression
> that the destination will redo the full cas+cas_reboot cycle after restart or
> am I missing something ?

Yes, I believe that's correct.  It's kind of an edge case and that CAS
cycle should still complete ok, it'll just take a little longer to
boot, so I thought that was preferable to the complexity of migrating
the CAS state.

> > >> Incidentally 'fdt' and 'fdt_blob' names do a terrible job of
> > >> distinguishing what the difference is.  Renaming fdt to fdt_initial
> > >> (to match fdt_initial_size) and fdt_blob to fdt should make that
> > >> clearer.
> > >>  
> > > 
> > > As mentioned earlier in this thread, spapr->fdt_initial_size is only used
> > > for tracing if the received fdt blob fails fdt_check_full()...
> > > 
> > > $ git grep -H fdt_initial_size
> > > hw/ppc/spapr.c:    spapr->fdt_initial_size = spapr->fdt_size;
> > > hw/ppc/spapr.c:        VMSTATE_UINT32(fdt_initial_size, sPAPRMachineState),
> > > hw/ppc/spapr_hcall.c:        trace_spapr_update_dt_failed(spapr->fdt_initial_size, cb,
> > > include/hw/ppc/spapr.h:    uint32_t fdt_initial_size;
> > > 
> > > Not sure it is helpful, and anyway, it is expected to be the same in source
> > > and destination, so why put it in the migration stream ?  
> > 
> > 
> > Well, we do build the fdt anyway even when receive migration but we do
> > not have to and yes we can expect the fdt on the destination to be of
> > the same size since it is the same command line, it is just guessing and
> > expecting vs. knowing and I prefer the latter as the reset time fdt and
> > migration source fdt might have different size because of
> > host-model/host-serial/slot-label/similar properties.
> 
> Right but I still don't see the usefulness of fdt_initial_size...

So, it's there to address exactly the problem you pointed out elswhere
in the thread: the idea was to disallow the guest resubmitting an fdt
which is "too much" bigger than the original one, thereby consuming a
bunch of qemu memory.  The thought was that this is a bit more robust
that just checking against a fixed max size, especially if we need to
increase that fixed size in future to handle really big partitions.

> > > The only case where we want to migrate something is when h_update_dt() has
> > > succeeded, ie, the guest passed a valid DT blob. This implies that its
> > > size isn't 0, otherwise fdt_check_full() would return -FDT_ERR_TRUNCATED.
> > > 
> > > I would suggest rather to:
> > > 
> > > - completely drop spapr->fdt_initial_size
> > > - clear spapr->fdt_size at machine reset
> > > - migrate if spapr->fdt_size is not zero
> > > 
> > > Also, I've just realized another problem... nothing prevents a malicious
> > > guest to pass an insanely great size to h_update_dt, which would cause
> > > g_malloc0() to abort... The passed size should be checked against
> > > FDT_MAX_SIZE.  
> > 
> > Good point. Just noticed - as posted, the checker actually checks the
> > reset time tree, not the updated one, my bad :)
> > 
> > 
> > 
>
Greg Kurz Dec. 12, 2018, 4:54 p.m. UTC | #12
On Wed, 12 Dec 2018 11:29:55 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Tue, Dec 11, 2018 at 10:55:59AM +0100, Greg Kurz wrote:
> > On Tue, 11 Dec 2018 14:53:32 +1100
> > Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >   
> > > On 10/12/2018 20:30, Greg Kurz wrote:  
> > > > On Mon, 10 Dec 2018 17:20:43 +1100
> > > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > > >     
> > > >> On Mon, Nov 12, 2018 at 03:12:26PM +1100, Alexey Kardashevskiy wrote:    
> > > >>>
> > > >>>
> > > >>> On 12/11/2018 05:10, Greg Kurz wrote:      
> > > >>>> Hi Alexey,
> > > >>>>
> > > >>>> Just a few remarks. See below.
> > > >>>>
> > > >>>> On Thu,  8 Nov 2018 12:44:06 +1100
> > > >>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> > > >>>>       
> > > >>>>> SLOF receives a device tree and updates it with various properties
> > > >>>>> before switching to the guest kernel and QEMU is not aware of any changes
> > > >>>>> made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
> > > >>>>> sense to pass the SLOF final device tree to QEMU to let it implement
> > > >>>>> RTAS related tasks better, such as PCI host bus adapter hotplug.
> > > >>>>>
> > > >>>>> Specifially, now QEMU can find out the actual XICS phandle (for PHB
> > > >>>>> hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
> > > >>>>> assisted NMI - FWNMI).
> > > >>>>>
> > > >>>>> This stores the initial DT blob in the sPAPR machine and replaces it
> > > >>>>> in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.
> > > >>>>>
> > > >>>>> This adds an @update_dt_enabled machine property to allow backward
> > > >>>>> migration.
> > > >>>>>
> > > >>>>> SLOF already has a hypercall since
> > > >>>>> https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183
> > > >>>>>
> > > >>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > >>>>> ---
> > > >>>>>  include/hw/ppc/spapr.h |  7 ++++++-
> > > >>>>>  hw/ppc/spapr.c         | 29 ++++++++++++++++++++++++++++-
> > > >>>>>  hw/ppc/spapr_hcall.c   | 32 ++++++++++++++++++++++++++++++++
> > > >>>>>  hw/ppc/trace-events    |  2 ++
> > > >>>>>  4 files changed, 68 insertions(+), 2 deletions(-)
> > > >>>>>
> > > >>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > > >>>>> index ad4d7cfd97..f5dcaf44cb 100644
> > > >>>>> --- a/include/hw/ppc/spapr.h
> > > >>>>> +++ b/include/hw/ppc/spapr.h
> > > >>>>> @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
> > > >>>>>  
> > > >>>>>      /*< public >*/
> > > >>>>>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
> > > >>>>> +    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
> > > >>>>>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
> > > >>>>>      bool pre_2_10_has_unused_icps;
> > > >>>>>      bool legacy_irq_allocation;
> > > >>>>> @@ -136,6 +137,9 @@ struct sPAPRMachineState {
> > > >>>>>      int vrma_adjust;
> > > >>>>>      ssize_t rtas_size;
> > > >>>>>      void *rtas_blob;
> > > >>>>> +    uint32_t fdt_size;
> > > >>>>> +    uint32_t fdt_initial_size;      
> > > >>>>
> > > >>>> I don't quite see the purpose of fdt_initial_size... it seems to be only
> > > >>>> used to print a trace.      
> > > >>>
> > > >>>
> > > >>> Ah, lost in rebase. The purpose was to test if the new device tree has
> > > >>> not grown too much.
> > > >>>
> > > >>>
> > > >>>       
> > > >>>>       
> > > >>>>> +    void *fdt_blob;
> > > >>>>>      long kernel_size;
> > > >>>>>      bool kernel_le;
> > > >>>>>      uint32_t initrd_base;
> > > >>>>> @@ -462,7 +466,8 @@ struct sPAPRMachineState {
> > > >>>>>  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
> > > >>>>>  /* Client Architecture support */
> > > >>>>>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> > > >>>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
> > > >>>>> +#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> > > >>>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> > > >>>>>  
> > > >>>>>  typedef struct sPAPRDeviceTreeUpdateHeader {
> > > >>>>>      uint32_t version_id;
> > > >>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > > >>>>> index c08130facb..5e2d4d211c 100644
> > > >>>>> --- a/hw/ppc/spapr.c
> > > >>>>> +++ b/hw/ppc/spapr.c
> > > >>>>> @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
> > > >>>>>      /* Load the fdt */
> > > >>>>>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> > > >>>>>      cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> > > >>>>> -    g_free(fdt);
> > > >>>>> +    g_free(spapr->fdt_blob);
> > > >>>>> +    spapr->fdt_size = fdt_totalsize(fdt);
> > > >>>>> +    spapr->fdt_initial_size = spapr->fdt_size;
> > > >>>>> +    spapr->fdt_blob = fdt;      
> > > >>>>
> > > >>>> Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
> > > >>>> both fdt_blob and fdt_size here.      
> > > >>>
> > > >>> The device tree is built from the reset handler and the idea is that we
> > > >>> want to always have some tree in the machine.      
> > > >>
> > > >> Yes, I think the approach here is fine.  Otherwise when we want to
> > > >> look up the current fdt state in RTAS calls or whatever we'd always
> > > >> have to do
> > > >> 	if (fdt_blob)
> > > >> 		look up that
> > > >> 	else
> > > >> 		look up qemu created fdt.
> > > >>    
> > > > 
> > > > No. We only have one fdt blob: the initial one, I'd rather
> > > > call reset time one, or the updated one.    
> > > 
> > > There is one fdt in the machine, always. Either initial or from cas.  
> > 
> > Yeah, reset time fdt is either the initial one, either cas... and I'm now
> > wandering what happens if migration occurs between cas that sets cas_reboot
> > and the corresponding reset. With the current code base, I have the impression
> > that the destination will redo the full cas+cas_reboot cycle after restart or
> > am I missing something ?  
> 
> Yes, I believe that's correct.  It's kind of an edge case and that CAS
> cycle should still complete ok, it'll just take a little longer to
> boot, so I thought that was preferable to the complexity of migrating
> the CAS state.
> 

You're probably right.

> > > >> Incidentally 'fdt' and 'fdt_blob' names do a terrible job of
> > > >> distinguishing what the difference is.  Renaming fdt to fdt_initial
> > > >> (to match fdt_initial_size) and fdt_blob to fdt should make that
> > > >> clearer.
> > > >>    
> > > > 
> > > > As mentioned earlier in this thread, spapr->fdt_initial_size is only used
> > > > for tracing if the received fdt blob fails fdt_check_full()...
> > > > 
> > > > $ git grep -H fdt_initial_size
> > > > hw/ppc/spapr.c:    spapr->fdt_initial_size = spapr->fdt_size;
> > > > hw/ppc/spapr.c:        VMSTATE_UINT32(fdt_initial_size, sPAPRMachineState),
> > > > hw/ppc/spapr_hcall.c:        trace_spapr_update_dt_failed(spapr->fdt_initial_size, cb,
> > > > include/hw/ppc/spapr.h:    uint32_t fdt_initial_size;
> > > > 
> > > > Not sure it is helpful, and anyway, it is expected to be the same in source
> > > > and destination, so why put it in the migration stream ?    
> > > 
> > > 
> > > Well, we do build the fdt anyway even when receive migration but we do
> > > not have to and yes we can expect the fdt on the destination to be of
> > > the same size since it is the same command line, it is just guessing and
> > > expecting vs. knowing and I prefer the latter as the reset time fdt and
> > > migration source fdt might have different size because of
> > > host-model/host-serial/slot-label/similar properties.  
> > 
> > Right but I still don't see the usefulness of fdt_initial_size...  
> 
> So, it's there to address exactly the problem you pointed out elswhere
> in the thread: the idea was to disallow the guest resubmitting an fdt
> which is "too much" bigger than the original one, thereby consuming a
> bunch of qemu memory.  The thought was that this is a bit more robust
> that just checking against a fixed max size, especially if we need to
> increase that fixed size in future to handle really big partitions.
> 

Yeah, I saw that with Alexey's new patch.

Thanks for the detailed clarification !

> > > > The only case where we want to migrate something is when h_update_dt() has
> > > > succeeded, ie, the guest passed a valid DT blob. This implies that its
> > > > size isn't 0, otherwise fdt_check_full() would return -FDT_ERR_TRUNCATED.
> > > > 
> > > > I would suggest rather to:
> > > > 
> > > > - completely drop spapr->fdt_initial_size
> > > > - clear spapr->fdt_size at machine reset
> > > > - migrate if spapr->fdt_size is not zero
> > > > 
> > > > Also, I've just realized another problem... nothing prevents a malicious
> > > > guest to pass an insanely great size to h_update_dt, which would cause
> > > > g_malloc0() to abort... The passed size should be checked against
> > > > FDT_MAX_SIZE.    
> > > 
> > > Good point. Just noticed - as posted, the checker actually checks the
> > > reset time tree, not the updated one, my bad :)
> > > 
> > > 
> > >   
> >   
>
diff mbox series

Patch

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index ad4d7cfd97..f5dcaf44cb 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -100,6 +100,7 @@  struct sPAPRMachineClass {
 
     /*< public >*/
     bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
+    bool update_dt_enabled;    /* enable KVMPPC_H_UPDATE_DT */
     bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
     bool pre_2_10_has_unused_icps;
     bool legacy_irq_allocation;
@@ -136,6 +137,9 @@  struct sPAPRMachineState {
     int vrma_adjust;
     ssize_t rtas_size;
     void *rtas_blob;
+    uint32_t fdt_size;
+    uint32_t fdt_initial_size;
+    void *fdt_blob;
     long kernel_size;
     bool kernel_le;
     uint32_t initrd_base;
@@ -462,7 +466,8 @@  struct sPAPRMachineState {
 #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
 /* Client Architecture support */
 #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
-#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
+#define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
+#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
 
 typedef struct sPAPRDeviceTreeUpdateHeader {
     uint32_t version_id;
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index c08130facb..5e2d4d211c 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1633,7 +1633,10 @@  static void spapr_machine_reset(void)
     /* Load the fdt */
     qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
     cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
-    g_free(fdt);
+    g_free(spapr->fdt_blob);
+    spapr->fdt_size = fdt_totalsize(fdt);
+    spapr->fdt_initial_size = spapr->fdt_size;
+    spapr->fdt_blob = fdt;
 
     /* Set up the entry state */
     spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, fdt_addr);
@@ -1887,6 +1890,27 @@  static const VMStateDescription vmstate_spapr_irq_map = {
     },
 };
 
+static bool spapr_dtb_needed(void *opaque)
+{
+    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
+
+    return smc->update_dt_enabled;
+}
+
+static const VMStateDescription vmstate_spapr_dtb = {
+    .name = "spapr_dtb",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = spapr_dtb_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(fdt_initial_size, sPAPRMachineState),
+        VMSTATE_UINT32(fdt_size, sPAPRMachineState),
+        VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, sPAPRMachineState, 0, NULL,
+                                     fdt_size),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static const VMStateDescription vmstate_spapr = {
     .name = "spapr",
     .version_id = 3,
@@ -1915,6 +1939,7 @@  static const VMStateDescription vmstate_spapr = {
         &vmstate_spapr_cap_sbbc,
         &vmstate_spapr_cap_ibs,
         &vmstate_spapr_irq_map,
+        &vmstate_spapr_dtb,
         NULL
     }
 };
@@ -3849,6 +3874,7 @@  static void spapr_machine_class_init(ObjectClass *oc, void *data)
     hc->unplug = spapr_machine_device_unplug;
 
     smc->dr_lmb_enabled = true;
+    smc->update_dt_enabled = true;
     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
     mc->has_hotpluggable_cpus = true;
     smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
@@ -3965,6 +3991,7 @@  static void spapr_machine_3_0_class_options(MachineClass *mc)
 
     smc->legacy_irq_allocation = true;
     smc->irq = &spapr_irq_xics_legacy;
+    smc->update_dt_enabled = false;
 }
 
 DEFINE_SPAPR_MACHINE(3_0, "3.0", false);
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index ae913d070f..d5833f3f8d 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1717,6 +1717,36 @@  static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
 
     args[0] = characteristics;
     args[1] = behaviour;
+    return H_SUCCESS;
+}
+
+static target_ulong h_update_dt(PowerPCCPU *cpu, sPAPRMachineState *spapr,
+                                target_ulong opcode, target_ulong *args)
+{
+    target_ulong dt = ppc64_phys_to_real(args[0]);
+    struct fdt_header hdr = { 0 };
+    unsigned cb;
+    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
+
+    cpu_physical_memory_read(dt, &hdr, sizeof(hdr));
+    cb = fdt32_to_cpu(hdr.totalsize);
+
+    if (fdt_check_full(spapr->fdt_blob, cb)) {
+        trace_spapr_update_dt_failed(spapr->fdt_initial_size, cb,
+            fdt32_to_cpu(hdr.magic));
+        return H_PARAMETER;
+    }
+
+    if (!smc->update_dt_enabled) {
+        return H_SUCCESS;
+    }
+
+    g_free(spapr->fdt_blob);
+    spapr->fdt_size = cb;
+    spapr->fdt_blob = g_malloc0(cb);
+    cpu_physical_memory_read(dt, spapr->fdt_blob, cb);
+
+    trace_spapr_update_dt(cb);
 
     return H_SUCCESS;
 }
@@ -1822,6 +1852,8 @@  static void hypercall_register_types(void)
 
     /* ibm,client-architecture-support support */
     spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
+
+    spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt);
 }
 
 type_init(hypercall_register_types)
diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
index dc5e65aee9..4432a5ce74 100644
--- a/hw/ppc/trace-events
+++ b/hw/ppc/trace-events
@@ -22,6 +22,8 @@  spapr_cas_pvr_try(uint32_t pvr) "0x%x"
 spapr_cas_pvr(uint32_t cur_pvr, bool explicit_match, uint32_t new_pvr) "current=0x%x, explicit_match=%u, new=0x%x"
 spapr_h_resize_hpt_prepare(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
 spapr_h_resize_hpt_commit(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
+spapr_update_dt(unsigned cb) "New blob %u bytes"
+spapr_update_dt_failed(unsigned cbold, unsigned cbnew, unsigned magic) "Old blob %u bytes, new blob %u bytes, magic 0x%x"
 
 # hw/ppc/spapr_iommu.c
 spapr_iommu_put(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "liobn=0x%"PRIx64" ioba=0x%"PRIx64" tce=0x%"PRIx64" ret=%"PRId64