Patchwork [2/2] pseries: Use new hook to correct reset sequence

login
register
mail settings
Submitter David Gibson
Date Aug. 2, 2012, 2:10 a.m.
Message ID <1343873409-8571-3-git-send-email-david@gibson.dropbear.id.au>
Download mbox | patch
Permalink /patch/174656/
State New
Headers show

Comments

David Gibson - Aug. 2, 2012, 2:10 a.m.
A number of things need to occur during reset of the PAPR paravirtualized
platform in a specific order.  For example, the hash table needs to be
cleared before the CPUs are reset, so that they initialize their register
state correctly, and the CPUs need to have their main reset called before
we set up the entry point state on the boot cpu.  We also need to have
the main qdev reset happen before the creation and installation of the
device tree for the new boot, because we need the state of the devices
settled to correctly construct the device tree.

Currently reset of pseries is broken in a number of ways, and in other
cases works largely by accident. This patch uses the new QEMUMachine reset
hook to correct these problems, by replacing the several existing spapr
reset hooks with one new machine hook which ensures that the various stages
happen in the correct order.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 hw/spapr.c |   66 +++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 36 insertions(+), 30 deletions(-)
Andreas Färber - Aug. 2, 2012, 3:44 p.m.
Am 02.08.2012 04:10, schrieb David Gibson:
> A number of things need to occur during reset of the PAPR paravirtualized
> platform in a specific order.  For example, the hash table needs to be
> cleared before the CPUs are reset, so that they initialize their register
> state correctly, and the CPUs need to have their main reset called before
> we set up the entry point state on the boot cpu.  We also need to have
> the main qdev reset happen before the creation and installation of the
> device tree for the new boot, because we need the state of the devices
> settled to correctly construct the device tree.
> 
> Currently reset of pseries is broken in a number of ways, and in other
> cases works largely by accident. This patch uses the new QEMUMachine reset
> hook to correct these problems, by replacing the several existing spapr
> reset hooks with one new machine hook which ensures that the various stages
> happen in the correct order.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  hw/spapr.c |   66 +++++++++++++++++++++++++++++++++---------------------------
>  1 file changed, 36 insertions(+), 30 deletions(-)
> 
> diff --git a/hw/spapr.c b/hw/spapr.c
> index 2453bae..1e60ec1 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -582,29 +582,22 @@ static void spapr_reset_htab(sPAPREnvironment *spapr)
>      }
>  }
>  
> -static void spapr_reset(void *opaque)
> +static void spapr_reset_cpu(CPUPPCState *env)
>  {
> -    sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
> -
> -    /* Reset the hash table & recalc the RMA */
> -    spapr_reset_htab(spapr);
> -
> -    /* Load the fdt */
> -    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
> -                       spapr->rtas_size);
> -}
> -
> -static void spapr_cpu_reset(void *opaque)
> -{
> -    PowerPCCPU *cpu = opaque;
> -    CPUPPCState *env = &cpu->env;
> +    PowerPCCPU *cpu = container_of(env, PowerPCCPU, env);

NACK. Please don't undo the cleanups I have applied! Functions should
take a QOM PowerPCCPU, not its internal CPUPPCState. Fields are
gradually being moved from CPUxxxState into CPUState.

>  
>      cpu_reset(CPU(cpu));

Also note the current discussion about CPU reset and ordering, e.g.:
http://patchwork.ozlabs.org/patch/174602/

Anthony was favoring moving reset code out of machines and expressed
dislike for looping through CPUs, which my above patch took into
account. The ordering issue between CPU and devices is still unsolved there.

Some on-list comments from Anthony would be nice, since we are moving
into opposing directions here - having the sPAPR machine be more in
control vs. moving code away from the PC machine into target-i386 CPU
and/or common CPU code.

Cheers,
Andreas

>  
>      env->external_htab = spapr->htab;
>      env->htab_base = -1;
>      env->htab_mask = HTAB_SIZE(spapr) - 1;
> +    /* CPUs need to start halted at reset, the platform reset code
> +     * will activate CPU0 then the rest are explicitly started by the
> +     * guest using RTAS */
> +    env->halted = 1;
>  
> +    /* Secondary CPUs get the CPU ID in r3 on entry */
> +    env->gpr[3] = env->cpu_index;
>      env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
>          (spapr->htab_shift - 18);
>  
> @@ -612,14 +605,35 @@ static void spapr_cpu_reset(void *opaque)
>          kvmppc_update_sdr1(env);
>      }
>  
> -    /* Set up the entry state */
> -    if (env == first_cpu) {
> -        env->gpr[3] = spapr->fdt_addr;
> -        env->gpr[5] = 0;
> -        env->halted = 0;
> -        env->nip = spapr->entry_point;
> +    tb_flush(env);
> +}
> +
> +static void spapr_reset(bool report)
> +{
> +    CPUPPCState *env = first_cpu;
> +
> +    /* Reset the qdevs */
> +    qemu_default_system_reset(report);
> +
> +    /* Reset the hash table & recalc the RMA */
> +    spapr_reset_htab(spapr);
> +
> +    /* Reset the CPUs */
> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
> +        spapr_reset_cpu(env);
>      }
>  
> +    /* Load the fdt */
> +    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
> +                       spapr->rtas_size);
> +
> +    /* Set up the entry state on CPU0 */
> +    env = first_cpu;
> +
> +    env->gpr[3] = spapr->fdt_addr;
> +    env->gpr[5] = 0;
> +    env->halted = 0;
> +    env->nip = spapr->entry_point;
>      tb_flush(env);
>  }
>  
> @@ -718,8 +732,6 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>      /* FIXME: we should change this default based on RAM size */
>      spapr->htab_shift = 24;
>  
> -    qemu_register_reset(spapr_reset, spapr);
> -
>      /* init CPUs */
>      if (cpu_model == NULL) {
>          cpu_model = kvm_enabled() ? "host" : "POWER7";
> @@ -734,11 +746,9 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>  
>          /* Set time-base frequency to 512 MHz */
>          cpu_ppc_tb_init(env, TIMEBASE_FREQ);
> -        qemu_register_reset(spapr_cpu_reset, cpu);
>  
>          env->hreset_vector = 0x60;
>          env->hreset_excp_prefix = 0;
> -        env->gpr[3] = env->cpu_index;
>      }
>  
>      /* allocate RAM */
> @@ -883,11 +893,6 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>  
>      spapr->entry_point = 0x100;
>  
> -    /* SLOF will startup the secondary CPUs using RTAS */
> -    for (env = first_cpu; env != NULL; env = env->next_cpu) {
> -        env->halted = 1;
> -    }
> -
>      /* Prepare the device tree */
>      spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
>                                              initrd_base, initrd_size,
> @@ -900,6 +905,7 @@ static QEMUMachine spapr_machine = {
>      .name = "pseries",
>      .desc = "pSeries Logical Partition (PAPR compliant)",
>      .init = ppc_spapr_init,
> +    .reset = spapr_reset,
>      .max_cpus = MAX_CPUS,
>      .no_parallel = 1,
>      .use_scsi = 1,
>
Anthony Liguori - Aug. 2, 2012, 6:29 p.m.
Andreas Färber <afaerber@suse.de> writes:

> Am 02.08.2012 04:10, schrieb David Gibson:
>> A number of things need to occur during reset of the PAPR paravirtualized
>> platform in a specific order.  For example, the hash table needs to be
>> cleared before the CPUs are reset, so that they initialize their register
>> state correctly, and the CPUs need to have their main reset called before
>> we set up the entry point state on the boot cpu.  We also need to have
>> the main qdev reset happen before the creation and installation of the
>> device tree for the new boot, because we need the state of the devices
>> settled to correctly construct the device tree.
>> 
>> Currently reset of pseries is broken in a number of ways, and in other
>> cases works largely by accident. This patch uses the new QEMUMachine reset
>> hook to correct these problems, by replacing the several existing spapr
>> reset hooks with one new machine hook which ensures that the various stages
>> happen in the correct order.
>> 
>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>> ---
>>  hw/spapr.c |   66 +++++++++++++++++++++++++++++++++---------------------------
>>  1 file changed, 36 insertions(+), 30 deletions(-)
>> 
>> diff --git a/hw/spapr.c b/hw/spapr.c
>> index 2453bae..1e60ec1 100644
>> --- a/hw/spapr.c
>> +++ b/hw/spapr.c
>> @@ -582,29 +582,22 @@ static void spapr_reset_htab(sPAPREnvironment *spapr)
>>      }
>>  }
>>  
>> -static void spapr_reset(void *opaque)
>> +static void spapr_reset_cpu(CPUPPCState *env)
>>  {
>> -    sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
>> -
>> -    /* Reset the hash table & recalc the RMA */
>> -    spapr_reset_htab(spapr);
>> -
>> -    /* Load the fdt */
>> -    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
>> -                       spapr->rtas_size);
>> -}
>> -
>> -static void spapr_cpu_reset(void *opaque)
>> -{
>> -    PowerPCCPU *cpu = opaque;
>> -    CPUPPCState *env = &cpu->env;
>> +    PowerPCCPU *cpu = container_of(env, PowerPCCPU, env);
>
> NACK. Please don't undo the cleanups I have applied! Functions should
> take a QOM PowerPCCPU, not its internal CPUPPCState. Fields are
> gradually being moved from CPUxxxState into CPUState.
>
>>  
>>      cpu_reset(CPU(cpu));
>
> Also note the current discussion about CPU reset and ordering, e.g.:
> http://patchwork.ozlabs.org/patch/174602/
>
> Anthony was favoring moving reset code out of machines and expressed
> dislike for looping through CPUs, which my above patch took into
> account. The ordering issue between CPU and devices is still unsolved there.
>
> Some on-list comments from Anthony would be nice, since we are moving
> into opposing directions here - having the sPAPR machine be more in
> control vs. moving code away from the PC machine into target-i386 CPU
> and/or common CPU code.

I already commented on the first patch because I had a feeling you'd
post something like this ;-)

Regarding reset:

1) Devices should implement DeviceState::reset()

2) If a device doesn't implement ::reset(), it should call
qemu_register_reset()

3) Reset should propagate through the device model, starting with the
top-level machine which is logically what's plugged into the wall and
is the source of power in the first place.

Regards,

Anthony Liguori

>
> Cheers,
> Andreas
>
>>  
>>      env->external_htab = spapr->htab;
>>      env->htab_base = -1;
>>      env->htab_mask = HTAB_SIZE(spapr) - 1;
>> +    /* CPUs need to start halted at reset, the platform reset code
>> +     * will activate CPU0 then the rest are explicitly started by the
>> +     * guest using RTAS */
>> +    env->halted = 1;
>>  
>> +    /* Secondary CPUs get the CPU ID in r3 on entry */
>> +    env->gpr[3] = env->cpu_index;
>>      env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
>>          (spapr->htab_shift - 18);
>>  
>> @@ -612,14 +605,35 @@ static void spapr_cpu_reset(void *opaque)
>>          kvmppc_update_sdr1(env);
>>      }
>>  
>> -    /* Set up the entry state */
>> -    if (env == first_cpu) {
>> -        env->gpr[3] = spapr->fdt_addr;
>> -        env->gpr[5] = 0;
>> -        env->halted = 0;
>> -        env->nip = spapr->entry_point;
>> +    tb_flush(env);
>> +}
>> +
>> +static void spapr_reset(bool report)
>> +{
>> +    CPUPPCState *env = first_cpu;
>> +
>> +    /* Reset the qdevs */
>> +    qemu_default_system_reset(report);
>> +
>> +    /* Reset the hash table & recalc the RMA */
>> +    spapr_reset_htab(spapr);
>> +
>> +    /* Reset the CPUs */
>> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
>> +        spapr_reset_cpu(env);
>>      }
>>  
>> +    /* Load the fdt */
>> +    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
>> +                       spapr->rtas_size);
>> +
>> +    /* Set up the entry state on CPU0 */
>> +    env = first_cpu;
>> +
>> +    env->gpr[3] = spapr->fdt_addr;
>> +    env->gpr[5] = 0;
>> +    env->halted = 0;
>> +    env->nip = spapr->entry_point;
>>      tb_flush(env);
>>  }
>>  
>> @@ -718,8 +732,6 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>>      /* FIXME: we should change this default based on RAM size */
>>      spapr->htab_shift = 24;
>>  
>> -    qemu_register_reset(spapr_reset, spapr);
>> -
>>      /* init CPUs */
>>      if (cpu_model == NULL) {
>>          cpu_model = kvm_enabled() ? "host" : "POWER7";
>> @@ -734,11 +746,9 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>>  
>>          /* Set time-base frequency to 512 MHz */
>>          cpu_ppc_tb_init(env, TIMEBASE_FREQ);
>> -        qemu_register_reset(spapr_cpu_reset, cpu);
>>  
>>          env->hreset_vector = 0x60;
>>          env->hreset_excp_prefix = 0;
>> -        env->gpr[3] = env->cpu_index;
>>      }
>>  
>>      /* allocate RAM */
>> @@ -883,11 +893,6 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>>  
>>      spapr->entry_point = 0x100;
>>  
>> -    /* SLOF will startup the secondary CPUs using RTAS */
>> -    for (env = first_cpu; env != NULL; env = env->next_cpu) {
>> -        env->halted = 1;
>> -    }
>> -
>>      /* Prepare the device tree */
>>      spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
>>                                              initrd_base, initrd_size,
>> @@ -900,6 +905,7 @@ static QEMUMachine spapr_machine = {
>>      .name = "pseries",
>>      .desc = "pSeries Logical Partition (PAPR compliant)",
>>      .init = ppc_spapr_init,
>> +    .reset = spapr_reset,
>>      .max_cpus = MAX_CPUS,
>>      .no_parallel = 1,
>>      .use_scsi = 1,
>> 
>
>
> -- 
> SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
Andreas Färber - Aug. 2, 2012, 6:38 p.m.
Am 02.08.2012 20:29, schrieb Anthony Liguori:
> Andreas Färber <afaerber@suse.de> writes:
> 
>> Anthony was favoring moving reset code out of machines and expressed
>> dislike for looping through CPUs, which my above patch took into
>> account. The ordering issue between CPU and devices is still unsolved there.
>>
>> Some on-list comments from Anthony would be nice, since we are moving
>> into opposing directions here - having the sPAPR machine be more in
>> control vs. moving code away from the PC machine into target-i386 CPU
>> and/or common CPU code.
> 
> I already commented on the first patch because I had a feeling you'd
> post something like this ;-)

I was not cc'ed. :(

> Regarding reset:
> 
> 1) Devices should implement DeviceState::reset()
> 
> 2) If a device doesn't implement ::reset(), it should call
> qemu_register_reset()
> 
> 3) Reset should propagate through the device model, starting with the
> top-level machine which is logically what's plugged into the wall and
> is the source of power in the first place.

So you changed your opinion over night?

I wanted to keep the reset callbacks in the machine. You applied a patch
breaking that pattern and argued you wanted to move reset code *out* of
the machine. Now you say the machine should *propagate* reset. Sorry,
that's unlogical to me...

If the machine should propagate reset then the disputed i386 patch is
doing The Wrong Thing.

If reset code should vanish from machine code AFAP then this patch is
doing The Wrong Thing.

No?

Regards,
Andreas
Anthony Liguori - Aug. 2, 2012, 7:40 p.m.
Andreas Färber <afaerber@suse.de> writes:

> Am 02.08.2012 20:29, schrieb Anthony Liguori:
>> Andreas Färber <afaerber@suse.de> writes:
>> 
>>> Anthony was favoring moving reset code out of machines and expressed
>>> dislike for looping through CPUs, which my above patch took into
>>> account. The ordering issue between CPU and devices is still unsolved there.
>>>
>>> Some on-list comments from Anthony would be nice, since we are moving
>>> into opposing directions here - having the sPAPR machine be more in
>>> control vs. moving code away from the PC machine into target-i386 CPU
>>> and/or common CPU code.
>> 
>> I already commented on the first patch because I had a feeling you'd
>> post something like this ;-)
>
> I was not cc'ed. :(
>
>> Regarding reset:
>> 
>> 1) Devices should implement DeviceState::reset()
>> 
>> 2) If a device doesn't implement ::reset(), it should call
>> qemu_register_reset()
>> 
>> 3) Reset should propagate through the device model, starting with the
>> top-level machine which is logically what's plugged into the wall and
>> is the source of power in the first place.
>
> So you changed your opinion over night?

No.

> I wanted to keep the reset callbacks in the machine. You applied a patch
> breaking that pattern and argued you wanted to move reset code *out* of
> the machine. Now you say the machine should *propagate* reset. Sorry,
> that's unlogical to me...

You're not listening carefully.  Just a friendly piece of advise--
instead of sending knee-jerk emails, spend some time going back and
re-reading these discussions.

This has been discussed literally to death now for years.

Reset propagates.  There is unanimous consensus that this is the Right
Way to model reset.  There is also wide consensus that reset typically
will propagate through the composition tree although in some cases,
reset actually propagates through the bus (this mostly affects devices
that are children of /peripheral paths though).

The "root" of the composition tree is the machine.  The machine in the
abstract sense, not the QEMUMachine sense.  QEMUMachine::init() should
eventually become trivial--just create a handful of devices that
represent the core components of the machine with everything else being
created through composition.

Open coded logic in QEMUMachine::init is always bad.  Handling reset for
a specific device in QEMUMachine::init is bad.  That goes against the
idea of making QEMUMachine::init trivial.

However, reset does logically start at QEMUMachine.  That doesn't mean
that QEMUMachine should be explicitly resetting devices in a specific
order.  This is why I was quick to comment on David's patch because the
argument about having a controller that determines reset ordering was
silly.  While this does exist on some architectures, it's not at all
typical.  Reset should flow with QEMUMachine::reset just playing the
role of deciding whether it starts propagating from.

The only machines that can have complex reset logic are ones that can
afford to take an extremely long time to startup--typically doing a
tremendous amount of self-checks in the process.  These are not common
among the types of machines QEMU simulates.

Regards,

Anthony Liguori

>
> If the machine should propagate reset then the disputed i386 patch is
> doing The Wrong Thing.
>
> If reset code should vanish from machine code AFAP then this patch is
> doing The Wrong Thing.
>
> No?
>
> Regards,
> Andreas
>
> -- 
> SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
David Gibson - Aug. 3, 2012, 2:31 a.m.
On Thu, Aug 02, 2012 at 05:44:49PM +0200, Andreas Färber wrote:
> Am 02.08.2012 04:10, schrieb David Gibson:
> > A number of things need to occur during reset of the PAPR paravirtualized
> > platform in a specific order.  For example, the hash table needs to be
> > cleared before the CPUs are reset, so that they initialize their register
> > state correctly, and the CPUs need to have their main reset called before
> > we set up the entry point state on the boot cpu.  We also need to have
> > the main qdev reset happen before the creation and installation of the
> > device tree for the new boot, because we need the state of the devices
> > settled to correctly construct the device tree.
> > 
> > Currently reset of pseries is broken in a number of ways, and in other
> > cases works largely by accident. This patch uses the new QEMUMachine reset
> > hook to correct these problems, by replacing the several existing spapr
> > reset hooks with one new machine hook which ensures that the various stages
> > happen in the correct order.
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  hw/spapr.c |   66 +++++++++++++++++++++++++++++++++---------------------------
> >  1 file changed, 36 insertions(+), 30 deletions(-)
> > 
> > diff --git a/hw/spapr.c b/hw/spapr.c
> > index 2453bae..1e60ec1 100644
> > --- a/hw/spapr.c
> > +++ b/hw/spapr.c
> > @@ -582,29 +582,22 @@ static void spapr_reset_htab(sPAPREnvironment *spapr)
> >      }
> >  }
> >  
> > -static void spapr_reset(void *opaque)
> > +static void spapr_reset_cpu(CPUPPCState *env)
> >  {
> > -    sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
> > -
> > -    /* Reset the hash table & recalc the RMA */
> > -    spapr_reset_htab(spapr);
> > -
> > -    /* Load the fdt */
> > -    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
> > -                       spapr->rtas_size);
> > -}
> > -
> > -static void spapr_cpu_reset(void *opaque)
> > -{
> > -    PowerPCCPU *cpu = opaque;
> > -    CPUPPCState *env = &cpu->env;
> > +    PowerPCCPU *cpu = container_of(env, PowerPCCPU, env);
> 
> NACK. Please don't undo the cleanups I have applied! Functions should
> take a QOM PowerPCCPU, not its internal CPUPPCState. Fields are
> gradually being moved from CPUxxxState into CPUState.

Um, ok.  So how do I iterate the PowerPCCPUs instead of the CPUPPCStates?
David Gibson - Aug. 3, 2012, 2:37 a.m.
On Thu, Aug 02, 2012 at 02:40:19PM -0500, Anthony Liguori wrote:
> Andreas Färber <afaerber@suse.de> writes:
> 
> > Am 02.08.2012 20:29, schrieb Anthony Liguori:
> >> Andreas Färber <afaerber@suse.de> writes:
> >> 
> >>> Anthony was favoring moving reset code out of machines and expressed
> >>> dislike for looping through CPUs, which my above patch took into
> >>> account. The ordering issue between CPU and devices is still unsolved there.
> >>>
> >>> Some on-list comments from Anthony would be nice, since we are moving
> >>> into opposing directions here - having the sPAPR machine be more in
> >>> control vs. moving code away from the PC machine into target-i386 CPU
> >>> and/or common CPU code.
> >> 
> >> I already commented on the first patch because I had a feeling you'd
> >> post something like this ;-)
> >
> > I was not cc'ed. :(
> >
> >> Regarding reset:
> >> 
> >> 1) Devices should implement DeviceState::reset()
> >> 
> >> 2) If a device doesn't implement ::reset(), it should call
> >> qemu_register_reset()
> >> 
> >> 3) Reset should propagate through the device model, starting with the
> >> top-level machine which is logically what's plugged into the wall and
> >> is the source of power in the first place.
> >
> > So you changed your opinion over night?
> 
> No.
> 
> > I wanted to keep the reset callbacks in the machine. You applied a patch
> > breaking that pattern and argued you wanted to move reset code *out* of
> > the machine. Now you say the machine should *propagate* reset. Sorry,
> > that's unlogical to me...
> 
> You're not listening carefully.  Just a friendly piece of advise--
> instead of sending knee-jerk emails, spend some time going back and
> re-reading these discussions.
> 
> This has been discussed literally to death now for years.
> 
> Reset propagates.  There is unanimous consensus that this is the Right
> Way to model reset.  There is also wide consensus that reset typically
> will propagate through the composition tree although in some cases,
> reset actually propagates through the bus (this mostly affects devices
> that are children of /peripheral paths though).
> 
> The "root" of the composition tree is the machine.  The machine in the
> abstract sense, not the QEMUMachine sense.  QEMUMachine::init() should
> eventually become trivial--just create a handful of devices that
> represent the core components of the machine with everything else being
> created through composition.

So what code controls the order in which "the machine in the abstract
sense" initiates the reset at the top-leve?

> Open coded logic in QEMUMachine::init is always bad.  Handling reset for
> a specific device in QEMUMachine::init is bad.  That goes against the
> idea of making QEMUMachine::init trivial.
> 
> However, reset does logically start at QEMUMachine.  That doesn't mean
> that QEMUMachine should be explicitly resetting devices in a specific
> order.  This is why I was quick to comment on David's patch because the
> argument about having a controller that determines reset ordering was
> silly.  While this does exist on some architectures,

Some platforms; architecture does not imply a particular platform -
this is one of the more subtle and pervasive x86-isms around.

> it's not at all
> typical.

So?  If it sometimes exists, we need to support that model.  The
argument that "real" hardware never has reset order dependencies is
simply incorrect.

>  Reset should flow with QEMUMachine::reset just playing the
> role of deciding whether it starts propagating from.
> 
> The only machines that can have complex reset logic are ones that can
> afford to take an extremely long time to startup--typically doing a
> tremendous amount of self-checks in the process.  These are not common
> among the types of machines QEMU simulates.

"having at least one order dependency in reset" != "complex and slow
reset logic".
Anthony Liguori - Aug. 3, 2012, 1:50 p.m.
David Gibson <david@gibson.dropbear.id.au> writes:

> On Thu, Aug 02, 2012 at 02:40:19PM -0500, Anthony Liguori wrote:
>> The "root" of the composition tree is the machine.  The machine in the
>> abstract sense, not the QEMUMachine sense.  QEMUMachine::init() should
>> eventually become trivial--just create a handful of devices that
>> represent the core components of the machine with everything else being
>> created through composition.
>
> So what code controls the order in which "the machine in the abstract
> sense" initiates the reset at the top-leve?

There ought to be a hierarchy (based on composition) that reset flows
through.  In the case of the PC, at the top level of the hierarchy are
the CPUs and Northbridge (i440fx).  They either are going to sit on the
same bus or within a container of some kind.  Reset flows through the
bus/container to the CPUs and the Northbridge.  The northbridge then
flows reset through the PCI bus and then to the southbridge and all of
the devices behind it.

>> Open coded logic in QEMUMachine::init is always bad.  Handling reset for
>> a specific device in QEMUMachine::init is bad.  That goes against the
>> idea of making QEMUMachine::init trivial.
>> 
>> However, reset does logically start at QEMUMachine.  That doesn't mean
>> that QEMUMachine should be explicitly resetting devices in a specific
>> order.  This is why I was quick to comment on David's patch because the
>> argument about having a controller that determines reset ordering was
>> silly.  While this does exist on some architectures,
>
> Some platforms; architecture does not imply a particular platform -
> this is one of the more subtle and pervasive x86-isms around.
>
>> it's not at all
>> typical.
>
> So?  If it sometimes exists, we need to support that model.  The
> argument that "real" hardware never has reset order dependencies is
> simply incorrect.

"Support the model" is different from "make it a first class
abstraction".

-M pseries is not a real machine.  The things it has to do--initialize
all devices, build a device tree, then initialize the CPU with where the
device tree lives, is unique to !not a real machine.

This is what I mean by complex reset logic.  It's not just a matter of
an ordering of some kind, it's reset X, do A, reset Y, do B, reset Z, do
C.  The "do [ABC]" is the part that we shouldn't be trying to model as a
general mechanism.

I'm not saying that we shouldn't support being able to do this, but this
is an exception, not the way all other platforms should behave.

>>  Reset should flow with QEMUMachine::reset just playing the
>> role of deciding whether it starts propagating from.
>> 
>> The only machines that can have complex reset logic are ones that can
>> afford to take an extremely long time to startup--typically doing a
>> tremendous amount of self-checks in the process.  These are not common
>> among the types of machines QEMU simulates.
>
> "having at least one order dependency in reset" != "complex and slow
> reset logic".

It's not an issue of dependency.

We're trying to move to a model where everything is a device.
QEMUMachine essentially goes away because a user can create the machine
by just creating individual devices and tying them together.

But this will never be possible with -M pseries because of the "do ABC"
logic that it requires which doesn't fit within the model of everything
is a device.

That's okay.  We have the same problem with Xen and I anticipate we'll
have the same problem with S390.  We should support this model, but that
doesn't mean we shouldn't work toward moving everything else to
"everything is a device".

Regards,

Anthony Liguori

>
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
Peter Maydell - Aug. 3, 2012, 1:57 p.m.
On 3 August 2012 14:50, Anthony Liguori <anthony@codemonkey.ws> wrote:
> There ought to be a hierarchy (based on composition) that reset flows
> through.

I think saying "the reset tree is isomorphic to the composition tree"
is making the same mistake that qbus did with "the bus tree is
isomorphic to the composition tree". The stakes are lower for reset
and we can probably get away with it, but it really isn't how the
hardware works...

-- PMM
Anthony Liguori - Aug. 3, 2012, 2:22 p.m.
Peter Maydell <peter.maydell@linaro.org> writes:

> On 3 August 2012 14:50, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> There ought to be a hierarchy (based on composition) that reset flows
>> through.
>
> I think saying "the reset tree is isomorphic to the composition tree"
> is making the same mistake that qbus did with "the bus tree is
> isomorphic to the composition tree". The stakes are lower for reset
> and we can probably get away with it, but it really isn't how the
> hardware works...

It flows through the composition tree by default, but can be overridden
at any point.

For instance, the i440fx will absolutely want to override this behavior
such that it can flow reset through the PCI bus (which is how the PIIX3
would be reset).  However, the PIIX3 has no need to override this
behavior.

So this model should work very well for most types of virtual hardware.
But it doesn't provide for a mechanism to "after all devices are
initialized, build FDT in guest memory, then set the CPU registers to
point to it".

There's no logical device that has a scope like that that also has the
mechanism to get that type of hook in the reset path.  That's why we
need to have the QEMUMachine::reset() hook.

Regards,

Anthony Liguori

>
> -- PMM
Peter Maydell - Aug. 3, 2012, 2:35 p.m.
On 3 August 2012 15:22, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Peter Maydell <peter.maydell@linaro.org> writes:
>> On 3 August 2012 14:50, Anthony Liguori <anthony@codemonkey.ws> wrote:
>>> There ought to be a hierarchy (based on composition) that reset flows
>>> through.
>>
>> I think saying "the reset tree is isomorphic to the composition tree"
>> is making the same mistake that qbus did with "the bus tree is
>> isomorphic to the composition tree". The stakes are lower for reset
>> and we can probably get away with it, but it really isn't how the
>> hardware works...
>
> It flows through the composition tree by default, but can be overridden
> at any point.

That doesn't let you model situations where reset doesn't start at
the root of the tree, though. (eg, reset controller wants to trigger
a reset of just the CPUs, or of CPUs + board devices).

> So this model should work very well for most types of virtual hardware.
> But it doesn't provide for a mechanism to "after all devices are
> initialized, build FDT in guest memory, then set the CPU registers to
> point to it".
>
> There's no logical device that has a scope like that that also has the
> mechanism to get that type of hook in the reset path.  That's why we
> need to have the QEMUMachine::reset() hook.

Yeah, I see the need, but I wonder if calling it 'reset' is confusing:
maybe it should be 'post-reset', 'post-realize' or something?

The arm_boot code needs to do set up and run at this point too.

The other oddball case for reset is ARM M-profile cores, where the
initial PC is read from a vector table at reset rather than being
a fixed value. At the moment the mechanism we use for this is deeply
hacky: some more generic mechanism for "do this when we come out of
reset but before starting to execute" might be useful there.

-- PMM
Andreas Färber - Aug. 3, 2012, 2:51 p.m.
Am 02.08.2012 21:40, schrieb Anthony Liguori:
> Andreas Färber <afaerber@suse.de> writes:
> 
>> Am 02.08.2012 20:29, schrieb Anthony Liguori:
>>> Andreas Färber <afaerber@suse.de> writes:
>>>
>>>> Anthony was favoring moving reset code out of machines and expressed
>>>> dislike for looping through CPUs, which my above patch took into
>>>> account. The ordering issue between CPU and devices is still unsolved there.
>>>>
>>>> Some on-list comments from Anthony would be nice, since we are moving
>>>> into opposing directions here - having the sPAPR machine be more in
>>>> control vs. moving code away from the PC machine into target-i386 CPU
>>>> and/or common CPU code.
>>>
>>> I already commented on the first patch because I had a feeling you'd
>>> post something like this ;-)
>>
>> I was not cc'ed. :(

I did read the reply wrt reset controller chip btw in case you meant
that one, but it doesn't discuss QEMU API at all, only wording changes
to the commit message.

>>> Regarding reset:
>>>
>>> 1) Devices should implement DeviceState::reset()
>>>
>>> 2) If a device doesn't implement ::reset(), it should call
>>> qemu_register_reset()
>>>
>>> 3) Reset should propagate through the device model, starting with the
>>> top-level machine which is logically what's plugged into the wall and
>>> is the source of power in the first place.
>>
>> So you changed your opinion over night?
> 
> No.

Ben's cover letter indicated "as discussed with Anthony on a call",
suggesting to me that you agree to the solution presented here! Bad
choice of words then.

>> I wanted to keep the reset callbacks in the machine. You applied a patch
>> breaking that pattern and argued you wanted to move reset code *out* of
>> the machine. Now you say the machine should *propagate* reset. Sorry,
>> that's unlogical to me...
> 
> You're not listening carefully.  Just a friendly piece of advise--
> instead of sending knee-jerk emails, spend some time going back and
> re-reading these discussions.
> 
> This has been discussed literally to death now for years.

Mind you, you are communicating with non-native speakers and I had to
look up "knee-jerk". If you have a point to make, do it clearly. Your
replies have been anything but helpful to me.

You find my emails knee-jerked, I find your applying Igor's second patch
just before the 1.2 freeze a knee-jerk reaction. Especially considering
that you apply that series but not his earlier initfn one that did not
get objections any more. Two opinions.

Now, I have close to 20,000 unread qemu-devel mails alone. If you have
time to re-read the discussions from several years then I wonder why you
are not processing more uncontroversial patches and PULLs and replying
to mails. Otherwise don't ask people to do what you don't humanly manage
yourself.

Regards,
Andreas
Andreas Färber - Aug. 3, 2012, 3:01 p.m.
Am 02.08.2012 21:40, schrieb Anthony Liguori:
> Reset propagates.  There is unanimous consensus that this is the Right
> Way to model reset.  There is also wide consensus that reset typically
> will propagate through the composition tree although in some cases,
> reset actually propagates through the bus (this mostly affects devices
> that are children of /peripheral paths though).
> 
> The "root" of the composition tree is the machine.  The machine in the
> abstract sense, not the QEMUMachine sense.  QEMUMachine::init() should
> eventually become trivial--just create a handful of devices that
> represent the core components of the machine with everything else being
> created through composition.
> 
> Open coded logic in QEMUMachine::init is always bad.  Handling reset for
> a specific device in QEMUMachine::init is bad.  That goes against the
> idea of making QEMUMachine::init trivial.

We don't seem in disagreement so far. No one is questioning bus resets.
The issue at hand is specifically CPU reset, for which there is no bus,
no container and thus must happen somehow at machine level.

I have posted a suggestion where CPU reset is triggered by "the machine
as an abstract concept" (needs a bit of tweaking still, but the general
idea is there).
Based on that, shouldn't it be rather easy to add a Notifier similar to
"machine init done" that lets individual machines do post-reset setup?
I.e. not have QEMUMachine trigger and control the reset.

An alternative would be to have a CPUState::reset callback (in addition
to CPUClass::reset) that would by default be NULL but could be used by
the odd machines to piggy-back reset code. I think this is the safest
solution, assuring that on every cpu_reset() the custom reset code is
executed immediately.

The other issue wrt reset callback placement is CPU hotplug, where I
believe we need a callback at machine level in lack of a bus CPUs are
attached to. When the CPU is plugged we need to assure it later gets
reset by someone and added as a QOM child in the proper place. Currently
we don't have that. If we iterate through CPUs as done here we would get
that for free, otherwise we may need to register reset callbacks on
hotplug and unregister on hot-unplug at QEMUMachine level.

I am all ears for practical solutions, but theoretical talk about
containers and reset propagation doesn't seem to get us a solution.
Please say what container you mean and how/where your solutions are
supposed to work in code and how which of the proposals should be improved.

Thanks,
Andreas
Andreas Färber - Aug. 3, 2012, 3:13 p.m.
Am 03.08.2012 04:31, schrieb David Gibson:
> On Thu, Aug 02, 2012 at 05:44:49PM +0200, Andreas Färber wrote:
>> Am 02.08.2012 04:10, schrieb David Gibson:
>>> A number of things need to occur during reset of the PAPR paravirtualized
>>> platform in a specific order.  For example, the hash table needs to be
>>> cleared before the CPUs are reset, so that they initialize their register
>>> state correctly, and the CPUs need to have their main reset called before
>>> we set up the entry point state on the boot cpu.  We also need to have
>>> the main qdev reset happen before the creation and installation of the
>>> device tree for the new boot, because we need the state of the devices
>>> settled to correctly construct the device tree.
>>>
>>> Currently reset of pseries is broken in a number of ways, and in other
>>> cases works largely by accident. This patch uses the new QEMUMachine reset
>>> hook to correct these problems, by replacing the several existing spapr
>>> reset hooks with one new machine hook which ensures that the various stages
>>> happen in the correct order.
>>>
>>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>>> ---
>>>  hw/spapr.c |   66 +++++++++++++++++++++++++++++++++---------------------------
>>>  1 file changed, 36 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/hw/spapr.c b/hw/spapr.c
>>> index 2453bae..1e60ec1 100644
>>> --- a/hw/spapr.c
>>> +++ b/hw/spapr.c
>>> @@ -582,29 +582,22 @@ static void spapr_reset_htab(sPAPREnvironment *spapr)
>>>      }
>>>  }
>>>  
>>> -static void spapr_reset(void *opaque)
>>> +static void spapr_reset_cpu(CPUPPCState *env)
>>>  {
>>> -    sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
>>> -
>>> -    /* Reset the hash table & recalc the RMA */
>>> -    spapr_reset_htab(spapr);
>>> -
>>> -    /* Load the fdt */
>>> -    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
>>> -                       spapr->rtas_size);
>>> -}
>>> -
>>> -static void spapr_cpu_reset(void *opaque)
>>> -{
>>> -    PowerPCCPU *cpu = opaque;
>>> -    CPUPPCState *env = &cpu->env;
>>> +    PowerPCCPU *cpu = container_of(env, PowerPCCPU, env);
>>
>> NACK. Please don't undo the cleanups I have applied! Functions should
>> take a QOM PowerPCCPU, not its internal CPUPPCState. Fields are
>> gradually being moved from CPUxxxState into CPUState.
> 
> Um, ok.  So how do I iterate the PowerPCCPUs instead of the CPUPPCStates?

You can't, yet. The QOM CPUState part 4 series (that got stalled due to
APIC modelling) moved quite some fields to CPUState but not enough to
change the first_cpu type despite the really long (74?) series.

So the solution here is to iterate the CPUPPCState, call
ppc_env_get_cpu() on it and pass that as opaque as before.

I could add a cpu_foreach() function though if that helps in the
meantime? Either way the idea is to limit the number of places to touch
in the upcoming refactorings.

Andreas
Anthony Liguori - Aug. 3, 2012, 4:21 p.m.
Andreas Färber <afaerber@suse.de> writes:

> Am 02.08.2012 21:40, schrieb Anthony Liguori:
>> Reset propagates.  There is unanimous consensus that this is the Right
>> Way to model reset.  There is also wide consensus that reset typically
>> will propagate through the composition tree although in some cases,
>> reset actually propagates through the bus (this mostly affects devices
>> that are children of /peripheral paths though).
>> 
>> The "root" of the composition tree is the machine.  The machine in the
>> abstract sense, not the QEMUMachine sense.  QEMUMachine::init() should
>> eventually become trivial--just create a handful of devices that
>> represent the core components of the machine with everything else being
>> created through composition.
>> 
>> Open coded logic in QEMUMachine::init is always bad.  Handling reset for
>> a specific device in QEMUMachine::init is bad.  That goes against the
>> idea of making QEMUMachine::init trivial.
>
> We don't seem in disagreement so far. No one is questioning bus resets.
> The issue at hand is specifically CPU reset, for which there is no bus,
> no container and thus must happen somehow at machine level.
>
> I have posted a suggestion where CPU reset is triggered by "the machine
> as an abstract concept" (needs a bit of tweaking still, but the general
> idea is there).
> Based on that, shouldn't it be rather easy to add a Notifier similar to
> "machine init done" that lets individual machines do post-reset setup?
> I.e. not have QEMUMachine trigger and control the reset.

This means that the reset logic will be spread out.  A single hook in
QEMUMachine is much nicer.

> An alternative would be to have a CPUState::reset callback (in addition
> to CPUClass::reset) that would by default be NULL but could be used by
> the odd machines to piggy-back reset code. I think this is the safest
> solution, assuring that on every cpu_reset() the custom reset code is
> executed immediately.

I think the right way to handle reset for CPU's is exactly what's done
today.  The CPUState registers a reset handler via
qemu_register_reset().

Eventually, we would model CPUSocket, CPUCore, CPUThread (which is
essentially what is CPUState today).

Reset of CPUState would propagate its CPUCores, which would then
propagate to CPUThread.

The machine/board device would have 1-N link<CPUSocket> properties and
the reset handler for the board would propagate reset to the CPUSocket links.

> The other issue wrt reset callback placement is CPU hotplug, where I
> believe we need a callback at machine level in lack of a bus CPUs are
> attached to. When the CPU is plugged we need to assure it later gets
> reset by someone and added as a QOM child in the proper place.

Reset does two things today: tear down current state and establish
initial state.

The fact that we rely on an external call to reset to establish initial
state after construction is not ideal.  Initial state should be
established during construction either by explicitly calling a function
(it could be reset) or by setting initial state.

> Currently
> we don't have that. If we iterate through CPUs as done here we would get
> that for free, otherwise we may need to register reset callbacks on
> hotplug and unregister on hot-unplug at QEMUMachine level.

If you hotplug a CPUSocket by setting a link<> on the machine device,
then this all Just Works without any special handling.

This is what makes propagation so attractive.  You don't have to
maintain a list of everything that needs to be reset along with data
descriptions of the order.  That gets figured out lazily during reset.

Regards,

Anthony Liguori

>
> I am all ears for practical solutions, but theoretical talk about
> containers and reset propagation doesn't seem to get us a solution.
> Please say what container you mean and how/where your solutions are
> supposed to work in code and how which of the proposals should be improved.
>
> Thanks,
> Andreas
>
> -- 
> SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
David Gibson - Aug. 6, 2012, 12:31 a.m.
On Fri, Aug 03, 2012 at 05:13:48PM +0200, Andreas Färber wrote:
> Am 03.08.2012 04:31, schrieb David Gibson:
> > On Thu, Aug 02, 2012 at 05:44:49PM +0200, Andreas Färber wrote:
> >> Am 02.08.2012 04:10, schrieb David Gibson:
[snip]
> >>> -static void spapr_cpu_reset(void *opaque)
> >>> -{
> >>> -    PowerPCCPU *cpu = opaque;
> >>> -    CPUPPCState *env = &cpu->env;
> >>> +    PowerPCCPU *cpu = container_of(env, PowerPCCPU, env);
> >>
> >> NACK. Please don't undo the cleanups I have applied! Functions should
> >> take a QOM PowerPCCPU, not its internal CPUPPCState. Fields are
> >> gradually being moved from CPUxxxState into CPUState.
> > 
> > Um, ok.  So how do I iterate the PowerPCCPUs instead of the CPUPPCStates?
> 
> You can't, yet. The QOM CPUState part 4 series (that got stalled due to
> APIC modelling) moved quite some fields to CPUState but not enough to
> change the first_cpu type despite the really long (74?) series.
> 
> So the solution here is to iterate the CPUPPCState, call
> ppc_env_get_cpu() on it and pass that as opaque as before.

So, move the container_of to the caller and use the wrapper for it (I
thought one might exist, but I hadn't spotted it).  Ok, I can do that.

> I could add a cpu_foreach() function though if that helps in the
> meantime? Either way the idea is to limit the number of places to touch
> in the upcoming refactorings.

If you like, but I'm not planning to delay these patches to wait for
it.
Benjamin Herrenschmidt - Aug. 7, 2012, 10:02 p.m.
On Fri, 2012-08-03 at 17:01 +0200, Andreas Färber wrote:
> 
> I have posted a suggestion where CPU reset is triggered by "the
> machine
> as an abstract concept" (needs a bit of tweaking still, but the
> general
> idea is there).
> Based on that, shouldn't it be rather easy to add a Notifier similar
> to
> "machine init done" that lets individual machines do post-reset setup?
> I.e. not have QEMUMachine trigger and control the reset.
> 

Note that we really want pre and post reset vs the device reset.

That's why the machine should be the one in charge. The top level of the
reset sequencing is -not- the CPU, it's the machine. All machines (or
SoCs) have some kind of reset controller and provide facilities for
resetting individual devices, busses, processor cores.... the global
"system" reset (when it exists) itself might have interesting ordering
or sequencing requirements.

Now, to fix our immediate problem on ppc for 1.2 the hook proposed by
Anthony for which David sent a patch does the job just fine, it allows
us to clean out all our iommu tables before the device-reset, meaning
that in-flights DMA cannot overwrite the various "files" (SLOF image
etc.... that are auto-loaded via reset handlers implicitely created by
load_image_targphys), and we can then do some post-initializations as
well to get things ready for a restart (rebuild the device-tree, etc...)

Cheers,
Ben.
Andreas Färber - Aug. 7, 2012, 10:32 p.m.
Am 08.08.2012 00:02, schrieb Benjamin Herrenschmidt:
> On Fri, 2012-08-03 at 17:01 +0200, Andreas Färber wrote:
>>
>> I have posted a suggestion where CPU reset is triggered by "the
>> machine
>> as an abstract concept" (needs a bit of tweaking still, but the
>> general
>> idea is there).
>> Based on that, shouldn't it be rather easy to add a Notifier similar
>> to
>> "machine init done" that lets individual machines do post-reset setup?
>> I.e. not have QEMUMachine trigger and control the reset.
>>
> 
> Note that we really want pre and post reset vs the device reset.
> 
> That's why the machine should be the one in charge. The top level of the
> reset sequencing is -not- the CPU, it's the machine. All machines (or
> SoCs) have some kind of reset controller and provide facilities for
> resetting individual devices, busses, processor cores.... the global
> "system" reset (when it exists) itself might have interesting ordering
> or sequencing requirements.
> 
> Now, to fix our immediate problem on ppc for 1.2 the hook proposed by
> Anthony for which David sent a patch does the job just fine, it allows
> us to clean out all our iommu tables before the device-reset, meaning
> that in-flights DMA cannot overwrite the various "files" (SLOF image
> etc.... that are auto-loaded via reset handlers implicitely created by
> load_image_targphys), and we can then do some post-initializations as
> well to get things ready for a restart (rebuild the device-tree, etc...)

That's all good, except for embedded machines without such implicit
reset handling. It does contradict the "a machine is just a config file,
setting up QOM objects" concept, but I was not the one to push that! :)

What I was thinking about however were those mentioned individual cores
being reset using cpu_reset(). If we want to piggy-back some
machine-specific register initialization for individual CPUStates then
QEMUMachine::reset is not going to be enough because it only gets
triggered for complete system reset. My suggestion was thus to just call
cpu_reset() in your QEMUMachine::reset and have cpu_reset() take care of
its initialization wherever called from. Any of these solutions are easy
to implement for 1.2 if agreement is reached what people want.

What I am missing from Anthony's side is some communication to machine
maintainers on the course to adopt before applying random patches. Right
now x86 and ppc are moving into opposite directions and arm, mips, etc.
maintainers may not even be aware of ongoing changes, and there's a
pending uc32 machine that should be reviewed in this light.

Cheers,
Andreas
Anthony Liguori - Aug. 8, 2012, midnight
On Aug 7, 2012 5:32 PM, "Andreas Färber" <afaerber@suse.de> wrote:
>
> Am 08.08.2012 00:02, schrieb Benjamin Herrenschmidt:
> > On Fri, 2012-08-03 at 17:01 +0200, Andreas Färber wrote:
> >>
> >> I have posted a suggestion where CPU reset is triggered by "the
> >> machine
> >> as an abstract concept" (needs a bit of tweaking still, but the
> >> general
> >> idea is there).
> >> Based on that, shouldn't it be rather easy to add a Notifier similar
> >> to
> >> "machine init done" that lets individual machines do post-reset setup?
> >> I.e. not have QEMUMachine trigger and control the reset.
> >>
> >
> > Note that we really want pre and post reset vs the device reset.
> >
> > That's why the machine should be the one in charge. The top level of the
> > reset sequencing is -not- the CPU, it's the machine. All machines (or
> > SoCs) have some kind of reset controller and provide facilities for
> > resetting individual devices, busses, processor cores.... the global
> > "system" reset (when it exists) itself might have interesting ordering
> > or sequencing requirements.
> >
> > Now, to fix our immediate problem on ppc for 1.2 the hook proposed by
> > Anthony for which David sent a patch does the job just fine, it allows
> > us to clean out all our iommu tables before the device-reset, meaning
> > that in-flights DMA cannot overwrite the various "files" (SLOF image
> > etc.... that are auto-loaded via reset handlers implicitely created by
> > load_image_targphys), and we can then do some post-initializations as
> > well to get things ready for a restart (rebuild the device-tree, etc...)
>
> That's all good, except for embedded machines without such implicit
> reset handling. It does contradict the "a machine is just a config file,
> setting up QOM objects" concept, but I was not the one to push that! :)
>

I will be without a proper email client for 24 hours so I'll keep this
brief for now.  What Ben et al are trying to model is something that exists
outside of the model of virtual hardware that QOM is designed for.  Since
they are trying to model something that exists outside the scope of virtual
hardware, they need something that exists at a higher level.

They need a per machine hook before and after devices are created.  This is
okay and it turns out it can be handy for other machines too that do
similiar could not exist outside of a simulator features.

Regards,

Anthony Liguori

> What I was thinking about however were those mentioned individual cores
> being reset using cpu_reset(). If we want to piggy-back some
> machine-specific register initialization for individual CPUStates then
> QEMUMachine::reset is not going to be enough because it only gets
> triggered for complete system reset. My suggestion was thus to just call
> cpu_reset() in your QEMUMachine::reset and have cpu_reset() take care of
> its initialization wherever called from. Any of these solutions are easy
> to implement for 1.2 if agreement is reached what people want.
>
> What I am missing from Anthony's side is some communication to machine
> maintainers on the course to adopt before applying random patches. Right
> now x86 and ppc are moving into opposite directions and arm, mips, etc.
> maintainers may not even be aware of ongoing changes, and there's a
> pending uc32 machine that should be reviewed in this light.
>
> Cheers,
> Andreas
>
> --
> SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
David Gibson - Aug. 8, 2012, 1:45 a.m.
On Wed, Aug 08, 2012 at 12:32:39AM +0200, Andreas Färber wrote:
> Am 08.08.2012 00:02, schrieb Benjamin Herrenschmidt:
> > On Fri, 2012-08-03 at 17:01 +0200, Andreas Färber wrote:
> >>
> >> I have posted a suggestion where CPU reset is triggered by "the
> >> machine
> >> as an abstract concept" (needs a bit of tweaking still, but the
> >> general
> >> idea is there).
> >> Based on that, shouldn't it be rather easy to add a Notifier similar
> >> to
> >> "machine init done" that lets individual machines do post-reset setup?
> >> I.e. not have QEMUMachine trigger and control the reset.
> >>
> > 
> > Note that we really want pre and post reset vs the device reset.
> > 
> > That's why the machine should be the one in charge. The top level of the
> > reset sequencing is -not- the CPU, it's the machine. All machines (or
> > SoCs) have some kind of reset controller and provide facilities for
> > resetting individual devices, busses, processor cores.... the global
> > "system" reset (when it exists) itself might have interesting ordering
> > or sequencing requirements.
> > 
> > Now, to fix our immediate problem on ppc for 1.2 the hook proposed by
> > Anthony for which David sent a patch does the job just fine, it allows
> > us to clean out all our iommu tables before the device-reset, meaning
> > that in-flights DMA cannot overwrite the various "files" (SLOF image
> > etc.... that are auto-loaded via reset handlers implicitely created by
> > load_image_targphys), and we can then do some post-initializations as
> > well to get things ready for a restart (rebuild the device-tree, etc...)
> 
> That's all good, except for embedded machines without such implicit
> reset handling. It does contradict the "a machine is just a config file,
> setting up QOM objects" concept, but I was not the one to push that! :)
> 
> What I was thinking about however were those mentioned individual cores
> being reset using cpu_reset(). If we want to piggy-back some
> machine-specific register initialization for individual CPUStates then
> QEMUMachine::reset is not going to be enough because it only gets
> triggered for complete system reset. My suggestion was thus to just call
> cpu_reset() in your QEMUMachine::reset and have cpu_reset() take care of
> its initialization wherever called from. Any of these solutions are easy
> to implement for 1.2 if agreement is reached what people want.

So, I more or less reaslied that myself and my new version of the
reset patch (which I expect to send out later today) kind of does
that.  I no longer do the machine specific CPU state setup from the
QEMUMachine::reset, it's done from the per-cpu reset handler.  The
QEMUMachine::reset just does the special setup that's only for the
CPU0 entry conditions, which *is* specific to a full system reset (not
that I think we can get an individual CPU reset on pseries, anyway).

> What I am missing from Anthony's side is some communication to machine
> maintainers on the course to adopt before applying random patches. Right
> now x86 and ppc are moving into opposite directions and arm, mips, etc.
> maintainers may not even be aware of ongoing changes, and there's a
> pending uc32 machine that should be reviewed in this light.

So.. having the CPU reset at the top of the tree definitely makes no
sense - if nothing else, *which* cpu when there's more than one.
Peter Maydell - Aug. 8, 2012, 7:58 a.m.
On 8 August 2012 01:00, Anthony Liguori <anthony@codemonkey.ws> wrote:
>
> They need a per machine hook before and after devices are created.  This is
> okay and it turns out it can be handy for other machines too that do
> similiar could not exist outside of a simulator features.

If it's 'before and after device creation' maybe we could call it something
less confusing than "reset" ?

-- PMM
David Gibson - Aug. 8, 2012, 8:44 a.m.
On Wed, Aug 08, 2012 at 08:58:07AM +0100, Peter Maydell wrote:
> On 8 August 2012 01:00, Anthony Liguori <anthony@codemonkey.ws> wrote:
> >
> > They need a per machine hook before and after devices are created.  This is
> > okay and it turns out it can be handy for other machines too that do
> > similiar could not exist outside of a simulator features.
> 
> If it's 'before and after device creation' maybe we could call it something
> less confusing than "reset" ?

It's not actually before and after devices are created.  It's before
and after devices are reset.  At least that's the easiest way to use
it, but it can also be used to take over control of the device reset
order itself.
Andreas Färber - Aug. 8, 2012, 3:22 p.m.
Am 08.08.2012 03:45, schrieb David Gibson:
> On Wed, Aug 08, 2012 at 12:32:39AM +0200, Andreas Färber wrote:
>> Am 08.08.2012 00:02, schrieb Benjamin Herrenschmidt:
>>> On Fri, 2012-08-03 at 17:01 +0200, Andreas Färber wrote:
>>>>
>>>> I have posted a suggestion where CPU reset is triggered by "the
>>>> machine
>>>> as an abstract concept" (needs a bit of tweaking still, but the
>>>> general
>>>> idea is there).
>>>> Based on that, shouldn't it be rather easy to add a Notifier similar
>>>> to
>>>> "machine init done" that lets individual machines do post-reset setup?
>>>> I.e. not have QEMUMachine trigger and control the reset.
>>>>
>>>
>>> Note that we really want pre and post reset vs the device reset.
>>>
>>> That's why the machine should be the one in charge. The top level of the
>>> reset sequencing is -not- the CPU, it's the machine. All machines (or
>>> SoCs) have some kind of reset controller and provide facilities for
>>> resetting individual devices, busses, processor cores.... the global
>>> "system" reset (when it exists) itself might have interesting ordering
>>> or sequencing requirements.
>>>
>>> Now, to fix our immediate problem on ppc for 1.2 the hook proposed by
>>> Anthony for which David sent a patch does the job just fine, it allows
>>> us to clean out all our iommu tables before the device-reset, meaning
>>> that in-flights DMA cannot overwrite the various "files" (SLOF image
>>> etc.... that are auto-loaded via reset handlers implicitely created by
>>> load_image_targphys), and we can then do some post-initializations as
>>> well to get things ready for a restart (rebuild the device-tree, etc...)
>>
>> That's all good, except for embedded machines without such implicit
>> reset handling. It does contradict the "a machine is just a config file,
>> setting up QOM objects" concept, but I was not the one to push that! :)
>>
>> What I was thinking about however were those mentioned individual cores
>> being reset using cpu_reset(). If we want to piggy-back some
>> machine-specific register initialization for individual CPUStates then
>> QEMUMachine::reset is not going to be enough because it only gets
>> triggered for complete system reset. My suggestion was thus to just call
>> cpu_reset() in your QEMUMachine::reset and have cpu_reset() take care of
>> its initialization wherever called from. Any of these solutions are easy
>> to implement for 1.2 if agreement is reached what people want.
> 
> So, I more or less reaslied that myself and my new version of the
> reset patch (which I expect to send out later today) kind of does
> that.  I no longer do the machine specific CPU state setup from the
> QEMUMachine::reset, it's done from the per-cpu reset handler.  The
> QEMUMachine::reset just does the special setup that's only for the
> CPU0 entry conditions, which *is* specific to a full system reset (not
> that I think we can get an individual CPU reset on pseries, anyway).
> 
>> What I am missing from Anthony's side is some communication to machine
>> maintainers on the course to adopt before applying random patches. Right
>> now x86 and ppc are moving into opposite directions and arm, mips, etc.
>> maintainers may not even be aware of ongoing changes, and there's a
>> pending uc32 machine that should be reviewed in this light.
> 
> So.. having the CPU reset at the top of the tree definitely makes no
> sense - if nothing else, *which* cpu when there's more than one.

Maybe let me restate clearly what I am looking for in this discussion:

I would like a clear definition of
* what is the "normal" case, and
* what is the special case.

The special case sPAPR seems uncontroversial.

So, a bonus would be if we can have a default implementation (of
QEMUMachine::reset or whatever we end up doing) so that the average
machine does not need to fiddle with reset callbacks in
QEMUMachine::init. For example, have a machine_default_reset() as
fallback for QEMUMachine::reset == NULL that resets all CPUs (in order
of the singly linked list) and then does qemu_devices_reset()? sPAPR
would then override that default implementation by specifying its own
implementation and we could get rid of reset callbacks in an estimated
70% of QEMUMachine::init. (The less people fiddle at that level the
easier to refactor for me.) That could well be a later follow-up to your
v2, which looked okay on brief sight.

Cheers,
Andreas
David Gibson - Aug. 9, 2012, 12:12 a.m.
On Wed, Aug 08, 2012 at 05:22:11PM +0200, Andreas Färber wrote:
> Am 08.08.2012 03:45, schrieb David Gibson:
> > On Wed, Aug 08, 2012 at 12:32:39AM +0200, Andreas Färber wrote:
> >> Am 08.08.2012 00:02, schrieb Benjamin Herrenschmidt:
> >>> On Fri, 2012-08-03 at 17:01 +0200, Andreas Färber wrote:
> >>>>
> >>>> I have posted a suggestion where CPU reset is triggered by "the
> >>>> machine
> >>>> as an abstract concept" (needs a bit of tweaking still, but the
> >>>> general
> >>>> idea is there).
> >>>> Based on that, shouldn't it be rather easy to add a Notifier similar
> >>>> to
> >>>> "machine init done" that lets individual machines do post-reset setup?
> >>>> I.e. not have QEMUMachine trigger and control the reset.
> >>>>
> >>>
> >>> Note that we really want pre and post reset vs the device reset.
> >>>
> >>> That's why the machine should be the one in charge. The top level of the
> >>> reset sequencing is -not- the CPU, it's the machine. All machines (or
> >>> SoCs) have some kind of reset controller and provide facilities for
> >>> resetting individual devices, busses, processor cores.... the global
> >>> "system" reset (when it exists) itself might have interesting ordering
> >>> or sequencing requirements.
> >>>
> >>> Now, to fix our immediate problem on ppc for 1.2 the hook proposed by
> >>> Anthony for which David sent a patch does the job just fine, it allows
> >>> us to clean out all our iommu tables before the device-reset, meaning
> >>> that in-flights DMA cannot overwrite the various "files" (SLOF image
> >>> etc.... that are auto-loaded via reset handlers implicitely created by
> >>> load_image_targphys), and we can then do some post-initializations as
> >>> well to get things ready for a restart (rebuild the device-tree, etc...)
> >>
> >> That's all good, except for embedded machines without such implicit
> >> reset handling. It does contradict the "a machine is just a config file,
> >> setting up QOM objects" concept, but I was not the one to push that! :)
> >>
> >> What I was thinking about however were those mentioned individual cores
> >> being reset using cpu_reset(). If we want to piggy-back some
> >> machine-specific register initialization for individual CPUStates then
> >> QEMUMachine::reset is not going to be enough because it only gets
> >> triggered for complete system reset. My suggestion was thus to just call
> >> cpu_reset() in your QEMUMachine::reset and have cpu_reset() take care of
> >> its initialization wherever called from. Any of these solutions are easy
> >> to implement for 1.2 if agreement is reached what people want.
> > 
> > So, I more or less reaslied that myself and my new version of the
> > reset patch (which I expect to send out later today) kind of does
> > that.  I no longer do the machine specific CPU state setup from the
> > QEMUMachine::reset, it's done from the per-cpu reset handler.  The
> > QEMUMachine::reset just does the special setup that's only for the
> > CPU0 entry conditions, which *is* specific to a full system reset (not
> > that I think we can get an individual CPU reset on pseries, anyway).
> > 
> >> What I am missing from Anthony's side is some communication to machine
> >> maintainers on the course to adopt before applying random patches. Right
> >> now x86 and ppc are moving into opposite directions and arm, mips, etc.
> >> maintainers may not even be aware of ongoing changes, and there's a
> >> pending uc32 machine that should be reviewed in this light.
> > 
> > So.. having the CPU reset at the top of the tree definitely makes no
> > sense - if nothing else, *which* cpu when there's more than one.
> 
> Maybe let me restate clearly what I am looking for in this discussion:
> 
> I would like a clear definition of
> * what is the "normal" case, and
> * what is the special case.
> 
> The special case sPAPR seems uncontroversial.
> 
> So, a bonus would be if we can have a default implementation (of
> QEMUMachine::reset or whatever we end up doing) so that the average
> machine does not need to fiddle with reset callbacks in
> QEMUMachine::init. For example, have a machine_default_reset() as
> fallback for QEMUMachine::reset == NULL that resets all CPUs (in order
> of the singly linked list) and then does qemu_devices_reset()? sPAPR
> would then override that default implementation by specifying its own
> implementation and we could get rid of reset callbacks in an estimated
> 70% of QEMUMachine::init. (The less people fiddle at that level the
> easier to refactor for me.) That could well be a later follow-up to your
> v2, which looked okay on brief sight.

We already have that.  If QEMUMachine::reset is NULL,
qemu_system_reset() does qemu_devices_reset() which is exactly the
same as what it did before.  qemu_devices_reset() calls all the reset
callback handlers, so it will also reset the CPUs if a suitable CPU
reset handler has been registered.

Patch

diff --git a/hw/spapr.c b/hw/spapr.c
index 2453bae..1e60ec1 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -582,29 +582,22 @@  static void spapr_reset_htab(sPAPREnvironment *spapr)
     }
 }
 
-static void spapr_reset(void *opaque)
+static void spapr_reset_cpu(CPUPPCState *env)
 {
-    sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
-
-    /* Reset the hash table & recalc the RMA */
-    spapr_reset_htab(spapr);
-
-    /* Load the fdt */
-    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
-                       spapr->rtas_size);
-}
-
-static void spapr_cpu_reset(void *opaque)
-{
-    PowerPCCPU *cpu = opaque;
-    CPUPPCState *env = &cpu->env;
+    PowerPCCPU *cpu = container_of(env, PowerPCCPU, env);
 
     cpu_reset(CPU(cpu));
 
     env->external_htab = spapr->htab;
     env->htab_base = -1;
     env->htab_mask = HTAB_SIZE(spapr) - 1;
+    /* CPUs need to start halted at reset, the platform reset code
+     * will activate CPU0 then the rest are explicitly started by the
+     * guest using RTAS */
+    env->halted = 1;
 
+    /* Secondary CPUs get the CPU ID in r3 on entry */
+    env->gpr[3] = env->cpu_index;
     env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
         (spapr->htab_shift - 18);
 
@@ -612,14 +605,35 @@  static void spapr_cpu_reset(void *opaque)
         kvmppc_update_sdr1(env);
     }
 
-    /* Set up the entry state */
-    if (env == first_cpu) {
-        env->gpr[3] = spapr->fdt_addr;
-        env->gpr[5] = 0;
-        env->halted = 0;
-        env->nip = spapr->entry_point;
+    tb_flush(env);
+}
+
+static void spapr_reset(bool report)
+{
+    CPUPPCState *env = first_cpu;
+
+    /* Reset the qdevs */
+    qemu_default_system_reset(report);
+
+    /* Reset the hash table & recalc the RMA */
+    spapr_reset_htab(spapr);
+
+    /* Reset the CPUs */
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        spapr_reset_cpu(env);
     }
 
+    /* Load the fdt */
+    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
+                       spapr->rtas_size);
+
+    /* Set up the entry state on CPU0 */
+    env = first_cpu;
+
+    env->gpr[3] = spapr->fdt_addr;
+    env->gpr[5] = 0;
+    env->halted = 0;
+    env->nip = spapr->entry_point;
     tb_flush(env);
 }
 
@@ -718,8 +732,6 @@  static void ppc_spapr_init(ram_addr_t ram_size,
     /* FIXME: we should change this default based on RAM size */
     spapr->htab_shift = 24;
 
-    qemu_register_reset(spapr_reset, spapr);
-
     /* init CPUs */
     if (cpu_model == NULL) {
         cpu_model = kvm_enabled() ? "host" : "POWER7";
@@ -734,11 +746,9 @@  static void ppc_spapr_init(ram_addr_t ram_size,
 
         /* Set time-base frequency to 512 MHz */
         cpu_ppc_tb_init(env, TIMEBASE_FREQ);
-        qemu_register_reset(spapr_cpu_reset, cpu);
 
         env->hreset_vector = 0x60;
         env->hreset_excp_prefix = 0;
-        env->gpr[3] = env->cpu_index;
     }
 
     /* allocate RAM */
@@ -883,11 +893,6 @@  static void ppc_spapr_init(ram_addr_t ram_size,
 
     spapr->entry_point = 0x100;
 
-    /* SLOF will startup the secondary CPUs using RTAS */
-    for (env = first_cpu; env != NULL; env = env->next_cpu) {
-        env->halted = 1;
-    }
-
     /* Prepare the device tree */
     spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
                                             initrd_base, initrd_size,
@@ -900,6 +905,7 @@  static QEMUMachine spapr_machine = {
     .name = "pseries",
     .desc = "pSeries Logical Partition (PAPR compliant)",
     .init = ppc_spapr_init,
+    .reset = spapr_reset,
     .max_cpus = MAX_CPUS,
     .no_parallel = 1,
     .use_scsi = 1,