diff mbox series

[4/7] spapr: Fix record-replay machine reset consuming too many events

Message ID 20230623125707.323517-5-npiggin@gmail.com
State New
Headers show
Series ppc: fix larx migration, fix record-replay | expand

Commit Message

Nicholas Piggin June 23, 2023, 12:57 p.m. UTC
spapr_machine_reset gets a random number to populate the device-tree
rng seed with. When loading a snapshot for record-replay, the machine
is reset again, and that tries to consume the random event record
again, crashing due to inconsistent record

Fix this by saving the seed to populate the device tree with, and
skipping the rng on snapshot load.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 hw/ppc/spapr.c         | 12 +++++++++---
 include/hw/ppc/spapr.h |  1 +
 2 files changed, 10 insertions(+), 3 deletions(-)

Comments

Pavel Dovgalyuk June 26, 2023, 8:07 a.m. UTC | #1
e500 has the same problem, I think, according to this issue: 
https://gitlab.com/qemu-project/qemu/-/issues/1634

Btw, ARM virt platform rebuilds fdt only at initialization phase, not 
when reset.
Isn't this behavior correct? Shouldn't PPC platforms do the similar thing?

On 23.06.2023 15:57, Nicholas Piggin wrote:
> spapr_machine_reset gets a random number to populate the device-tree
> rng seed with. When loading a snapshot for record-replay, the machine
> is reset again, and that tries to consume the random event record
> again, crashing due to inconsistent record
> 
> Fix this by saving the seed to populate the device tree with, and
> skipping the rng on snapshot load.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   hw/ppc/spapr.c         | 12 +++++++++---
>   include/hw/ppc/spapr.h |  1 +
>   2 files changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index d290acfa95..55948f233f 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1017,7 +1017,6 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
>   {
>       MachineState *machine = MACHINE(spapr);
>       SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
> -    uint8_t rng_seed[32];
>       int chosen;
>   
>       _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
> @@ -1095,8 +1094,7 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
>           spapr_dt_ov5_platform_support(spapr, fdt, chosen);
>       }
>   
> -    qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed));
> -    _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed)));
> +    _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32));
>   
>       _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5"));
>   }
> @@ -1649,6 +1647,14 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
>       void *fdt;
>       int rc;
>   
> +    if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) {
> +        /*
> +         * Record-replay snapshot load must not consume random, this was
> +         * already replayed from initial machine reset.
> +         */
> +        qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32);
> +    }
> +
>       pef_kvm_reset(machine->cgs, &error_fatal);
>       spapr_caps_apply(spapr);
>   
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index f47e8419a5..f4bd204d86 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -204,6 +204,7 @@ struct SpaprMachineState {
>       uint32_t fdt_size;
>       uint32_t fdt_initial_size;
>       void *fdt_blob;
> +    uint8_t fdt_rng_seed[32];
>       long kernel_size;
>       bool kernel_le;
>       uint64_t kernel_addr;
Nicholas Piggin June 26, 2023, 10:04 a.m. UTC | #2
On Mon Jun 26, 2023 at 6:07 PM AEST, Pavel Dovgalyuk wrote:
> e500 has the same problem, I think, according to this issue: 
> https://gitlab.com/qemu-project/qemu/-/issues/1634

Same symptoms. e500 looks like it does the dt build in
machine_init_done notifier, though. Maybe I miss something.
I'll take a look.

>
> Btw, ARM virt platform rebuilds fdt only at initialization phase, not 
> when reset.

I was actually wondering about keeping the same rng-seed across resets
to make the code even simpler, but decided to keep behaviour unchanged.
That seems like one downside.

> Isn't this behavior correct? Shouldn't PPC platforms do the similar thing?

I believe spapr does this for an spapr "feature" that rebuilds the fdt
at runtime ("client-architecture-support"), so reset has to build the
original one again.

Thanks,
Nick

>
> On 23.06.2023 15:57, Nicholas Piggin wrote:
> > spapr_machine_reset gets a random number to populate the device-tree
> > rng seed with. When loading a snapshot for record-replay, the machine
> > is reset again, and that tries to consume the random event record
> > again, crashing due to inconsistent record
> > 
> > Fix this by saving the seed to populate the device tree with, and
> > skipping the rng on snapshot load.
> > 
> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> > ---
> >   hw/ppc/spapr.c         | 12 +++++++++---
> >   include/hw/ppc/spapr.h |  1 +
> >   2 files changed, 10 insertions(+), 3 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index d290acfa95..55948f233f 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -1017,7 +1017,6 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
> >   {
> >       MachineState *machine = MACHINE(spapr);
> >       SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
> > -    uint8_t rng_seed[32];
> >       int chosen;
> >   
> >       _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
> > @@ -1095,8 +1094,7 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
> >           spapr_dt_ov5_platform_support(spapr, fdt, chosen);
> >       }
> >   
> > -    qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed));
> > -    _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed)));
> > +    _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32));
> >   
> >       _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5"));
> >   }
> > @@ -1649,6 +1647,14 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
> >       void *fdt;
> >       int rc;
> >   
> > +    if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) {
> > +        /*
> > +         * Record-replay snapshot load must not consume random, this was
> > +         * already replayed from initial machine reset.
> > +         */
> > +        qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32);
> > +    }
> > +
> >       pef_kvm_reset(machine->cgs, &error_fatal);
> >       spapr_caps_apply(spapr);
> >   
> > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > index f47e8419a5..f4bd204d86 100644
> > --- a/include/hw/ppc/spapr.h
> > +++ b/include/hw/ppc/spapr.h
> > @@ -204,6 +204,7 @@ struct SpaprMachineState {
> >       uint32_t fdt_size;
> >       uint32_t fdt_initial_size;
> >       void *fdt_blob;
> > +    uint8_t fdt_rng_seed[32];
> >       long kernel_size;
> >       bool kernel_le;
> >       uint64_t kernel_addr;
diff mbox series

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d290acfa95..55948f233f 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1017,7 +1017,6 @@  static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
 {
     MachineState *machine = MACHINE(spapr);
     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
-    uint8_t rng_seed[32];
     int chosen;
 
     _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
@@ -1095,8 +1094,7 @@  static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
         spapr_dt_ov5_platform_support(spapr, fdt, chosen);
     }
 
-    qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed));
-    _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed)));
+    _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32));
 
     _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5"));
 }
@@ -1649,6 +1647,14 @@  static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
     void *fdt;
     int rc;
 
+    if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) {
+        /*
+         * Record-replay snapshot load must not consume random, this was
+         * already replayed from initial machine reset.
+         */
+        qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32);
+    }
+
     pef_kvm_reset(machine->cgs, &error_fatal);
     spapr_caps_apply(spapr);
 
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index f47e8419a5..f4bd204d86 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -204,6 +204,7 @@  struct SpaprMachineState {
     uint32_t fdt_size;
     uint32_t fdt_initial_size;
     void *fdt_blob;
+    uint8_t fdt_rng_seed[32];
     long kernel_size;
     bool kernel_le;
     uint64_t kernel_addr;