diff mbox series

[v6,3/3] target/ppc: support single stepping with KVM HV

Message ID 20200110151344.278471-4-farosas@linux.ibm.com
State New
Headers show
Series [v6,1/3] target/ppc: Clarify the meaning of return values in kvm_handle_debug | expand

Commit Message

Fabiano Rosas Jan. 10, 2020, 3:13 p.m. UTC
The hardware singlestep mechanism in POWER works via a Trace Interrupt
(0xd00) that happens after any instruction executes, whenever MSR_SE =
1 (PowerISA Section 6.5.15 - Trace Interrupt).

However, with kvm_hv, the Trace Interrupt happens inside the guest and
KVM has no visibility of it. Therefore, when the gdbstub uses the
KVM_SET_GUEST_DEBUG ioctl to enable singlestep, KVM simply ignores it.

This patch takes advantage of the Trace Interrupt to perform the step
inside the guest, but uses a breakpoint at the Trace Interrupt handler
to return control to KVM. The exit is treated by KVM as a regular
breakpoint and it returns to the host (and QEMU eventually).

Before signalling GDB, QEMU sets the Next Instruction Pointer to the
instruction following the one being stepped and restores the MSR,
SRR0, SRR1 values from before the step, effectively skipping the
interrupt handler execution and hiding the trace interrupt breakpoint
from GDB.

This approach works with both of GDB's 'scheduler-locking' options
(off, step).

Note:

- kvm_arch_emulate_singlestep happens after GDB asks for a single step,
  while the vcpus are stopped.

- kvm_handle_singlestep executes after the step, during the handling
  of the Emulation Assist Interrupt (breakpoint).

Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
---
 hw/ppc/spapr_hcall.c     |   5 +
 target/ppc/cpu.h         |  21 +++
 target/ppc/excp_helper.c |  15 ++
 target/ppc/kvm.c         | 330 ++++++++++++++++++++++++++++++++++++++-
 target/ppc/kvm_ppc.h     |   6 +-
 5 files changed, 369 insertions(+), 8 deletions(-)

Comments

David Gibson Jan. 20, 2020, 2:35 a.m. UTC | #1
On Fri, Jan 10, 2020 at 12:13:44PM -0300, Fabiano Rosas wrote:
> The hardware singlestep mechanism in POWER works via a Trace Interrupt
> (0xd00) that happens after any instruction executes, whenever MSR_SE =
> 1 (PowerISA Section 6.5.15 - Trace Interrupt).
> 
> However, with kvm_hv, the Trace Interrupt happens inside the guest and
> KVM has no visibility of it. Therefore, when the gdbstub uses the
> KVM_SET_GUEST_DEBUG ioctl to enable singlestep, KVM simply ignores it.
> 
> This patch takes advantage of the Trace Interrupt to perform the step
> inside the guest, but uses a breakpoint at the Trace Interrupt handler
> to return control to KVM. The exit is treated by KVM as a regular
> breakpoint and it returns to the host (and QEMU eventually).
> 
> Before signalling GDB, QEMU sets the Next Instruction Pointer to the
> instruction following the one being stepped and restores the MSR,
> SRR0, SRR1 values from before the step, effectively skipping the
> interrupt handler execution and hiding the trace interrupt breakpoint
> from GDB.
> 
> This approach works with both of GDB's 'scheduler-locking' options
> (off, step).
> 
> Note:
> 
> - kvm_arch_emulate_singlestep happens after GDB asks for a single step,
>   while the vcpus are stopped.
> 
> - kvm_handle_singlestep executes after the step, during the handling
>   of the Emulation Assist Interrupt (breakpoint).
> 
> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>


> ---
>  hw/ppc/spapr_hcall.c     |   5 +
>  target/ppc/cpu.h         |  21 +++
>  target/ppc/excp_helper.c |  15 ++
>  target/ppc/kvm.c         | 330 ++++++++++++++++++++++++++++++++++++++-
>  target/ppc/kvm_ppc.h     |   6 +-
>  5 files changed, 369 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index f1799b1b70..194b68ed22 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -1382,6 +1382,7 @@ static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
>                                                          target_ulong value1,
>                                                          target_ulong value2)
>  {
> +    CPUState *cs = CPU(cpu);
>      PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
>  
>      if (!(pcc->insns_flags2 & PPC2_ISA207S)) {
> @@ -1400,6 +1401,10 @@ static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
>  
>      spapr_set_all_lpcrs(mflags << LPCR_AIL_SHIFT, LPCR_AIL);
>  
> +    if (cs->singlestep_enabled && kvm_enabled()) {
> +        kvm_singlestep_ail_change(cs);
> +    }
> +
>      return H_SUCCESS;
>  }
>  
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index 103bfe9dc2..b69f8565aa 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -440,6 +440,10 @@ typedef struct ppc_v3_pate_t {
>  #define msr_ts   ((env->msr >> MSR_TS1)  & 3)
>  #define msr_tm   ((env->msr >> MSR_TM)   & 1)
>  
> +#define srr1_ir ((env->spr[SPR_SRR1] >> MSR_IR) & 1)
> +#define srr1_dr ((env->spr[SPR_SRR1] >> MSR_DR) & 1)
> +#define srr1_se ((env->spr[SPR_SRR1] >> MSR_SE) & 1)

I'd prefer not to introduce these.  The msr_xx macros are already kind
of dubious because they assume the meaning of 'env' in the context
they're used.  I'm ok to use them because they're so useful, so
often.  These srr1 variants however are used in far fewer situations.
So, I'd prefer to just open code these as (env->spr[SPR_SRR1] &
MSR_IR) in the relatively few places they're used for now.

>  #define DBCR0_ICMP (1 << 27)
>  #define DBCR0_BRT (1 << 26)
>  #define DBSR_ICMP (1 << 27)
> @@ -1158,6 +1162,16 @@ struct CPUPPCState {
>      uint32_t tm_vscr;
>      uint64_t tm_dscr;
>      uint64_t tm_tar;
> +
> +    /* Used for software single step */
> +    target_ulong sstep_srr0;
> +    target_ulong sstep_srr1;
> +    target_ulong sstep_insn;
> +    target_ulong trace_handler_addr;
> +    int sstep_kind;

Won't you need to migrate this extra state, at least some of the time?

> +#define SSTEP_REGULAR 0
> +#define SSTEP_PENDING 1
> +#define SSTEP_GUEST   2

Some comments on what these modes mean would be useful.

>  };
>  
>  #define SET_FIT_PERIOD(a_, b_, c_, d_)          \
> @@ -1251,6 +1265,7 @@ struct PPCVirtualHypervisorClass {
>      OBJECT_GET_CLASS(PPCVirtualHypervisorClass, (obj), \
>                       TYPE_PPC_VIRTUAL_HYPERVISOR)
>  
> +target_ulong ppc_get_trace_int_handler_addr(CPUState *cs, bool mmu_on);
>  void ppc_cpu_do_interrupt(CPUState *cpu);
>  bool ppc_cpu_exec_interrupt(CPUState *cpu, int int_req);
>  void ppc_cpu_dump_state(CPUState *cpu, FILE *f, int flags);
> @@ -2220,6 +2235,12 @@ enum {
>                          PPC2_ISA300)
>  };
>  
> +#define OP_RFID 19
> +#define XOP_RFID 18
> +#define OP_MOV 31
> +#define XOP_MFMSR 83
> +#define XOP_MTSPR 467
> +
>  /*****************************************************************************/
>  /*
>   * Memory access type :
> diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
> index 5752ed4a4d..87230f871a 100644
> --- a/target/ppc/excp_helper.c
> +++ b/target/ppc/excp_helper.c
> @@ -784,6 +784,21 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int excp_model, int excp)
>      check_tlb_flush(env, false);
>  }
>  
> +target_ulong ppc_get_trace_int_handler_addr(CPUState *cs, bool mmu_on)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    CPUPPCState *env = &cpu->env;
> +    int ail = AIL_NONE;
> +
> +    /* AIL is only used if translation is enabled */
> +    if (mmu_on) {
> +        ail = (env->spr[SPR_LPCR] & LPCR_AIL) >> LPCR_AIL_SHIFT;
> +    }
> +
> +    return env->excp_vectors[POWERPC_EXCP_TRACE] |
> +        ppc_excp_vector_offset(cs, ail);
> +}
> +
>  void ppc_cpu_do_interrupt(CPUState *cs)
>  {
>      PowerPCCPU *cpu = POWERPC_CPU(cs);
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index 6fb8687126..2b6cba59c8 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -1546,6 +1546,202 @@ void kvm_arch_remove_all_hw_breakpoints(void)
>      nb_hw_breakpoint = nb_hw_watchpoint = 0;
>  }
>  
> +static uint32_t ppc_gdb_read_insn(CPUState *cs, target_ulong addr)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    CPUPPCState *env = &cpu->env;
> +    uint32_t insn;
> +
> +    cpu_memory_rw_debug(cs, addr, (uint8_t *)&insn, sizeof(insn), 0);
> +
> +    if (msr_le) {
> +        return ldl_le_p(&insn);
> +    } else {
> +        return ldl_be_p(&insn);
> +    }

I think you can just use cpu_ldl_code() for this.

> +}
> +
> +static uint32_t ppc_gdb_get_op(uint32_t insn)
> +{
> +    return extract32(insn, 26, 6);
> +}
> +
> +static uint32_t ppc_gdb_get_xop(uint32_t insn)
> +{
> +    return extract32(insn, 1, 10);
> +}
> +
> +static uint32_t ppc_gdb_get_spr(uint32_t insn)
> +{
> +    return extract32(insn, 11, 5) << 5 | extract32(insn, 16, 5);
> +}
> +
> +static uint32_t ppc_gdb_get_rt(uint32_t insn)
> +{
> +    return extract32(insn, 21, 5);
> +}
> +
> +static void kvm_insert_singlestep_breakpoint(CPUState *cs, bool mmu_on)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    CPUPPCState *env = &cpu->env;
> +    target_ulong bp_addr;
> +    target_ulong saved_msr = env->msr;
> +
> +    bp_addr = ppc_get_trace_int_handler_addr(cs, mmu_on);
> +    if (env->nip == bp_addr) {
> +        /*
> +         * We are trying to step the interrupt handler address itself;
> +         * move the breakpoint to the next instruction.
> +         */
> +        bp_addr += 4;

What if the first instruction of the interrupt handler is a branch?

> +    }
> +
> +    /*
> +     * The actual access by the guest might be made in a mode
> +     * different than we are now (due to rfid) so temporarily set the
> +     * MSR to reflect that during the breakpoint placement.
> +     *
> +     * Note: we need this because breakpoints currently use
> +     * cpu_memory_rw_debug, which does the memory accesses from the
> +     * guest point of view.
> +     */
> +    if ((msr_ir & msr_dr) != mmu_on) {

Should be msr_ir && msr_dr - you only get away with bitwise and by
accident.

> +        if (mmu_on) {
> +            env->msr |= (1ULL << MSR_IR | 1ULL << MSR_DR);
> +        } else {
> +            env->msr &= ~(1ULL << MSR_IR | 1ULL << MSR_DR);
> +        }
> +    }

Wouldn't it be simpler to unconditionally set env->msr based on
mmu_on.

> +
> +    kvm_insert_breakpoint(cs, bp_addr, 4, GDB_BREAKPOINT_SW);

Hrm.... I don't actually see how changing env->msr helps you here.
AFAICT if kvm_insert_breakpoint() resolves to kvm_arch_sw_breakpoint()
it doesn't rely on the MSR value at all.  If it resolves to
kvm_arch_hw_breakpoint(), then it looks like it just stashes
information to be pushed into KVM when we re-enter the guest.  None of
the information stashed appears to depend on the current MSR, and once
we re-enter the MSR will already have been restored.

> +    env->trace_handler_addr = bp_addr;
> +    env->msr = saved_msr;
> +}
> +
> +/*
> + * The emulated singlestep works by forcing a Trace Interrupt inside
> + * the guest by setting its MSR_SE bit.  When the guest executes the
> + * instruction, the Trace Interrupt triggers and the step is done. We
> + * then need to hide the fact that the interrupt ever happened before
> + * returning into the guest.
> + *
> + * Since the Trace Interrupt does not set MSR_HV (the whole point of
> + * having to emulate the step), we set a breakpoint at the interrupt
> + * handler address (0xd00) so that it reaches KVM (this is treated by
> + * KVM like an ordinary breakpoint) and control is returned to QEMU.
> + *
> + * There are things to consider before the step (this function):
> + *
> + * - The breakpoint location depends on where the interrupt vectors are,
> + *   which in turn depends on the LPCR_AIL and MSR_IR|DR bits;
> + *
> + * - If the stepped instruction changes the LPCR_AIL, the breakpoint
> + *   location needs to be updated mid-step;
> + *
> + * - If the stepped instruction is rfid, the MSR bits after the step
> + *   will come from SRR1.
> + *
> + *
> + * There are some fixups needed after the step, before returning to
> + * the guest (kvm_handle_singlestep):
> + *
> + * - The interrupt will set SRR0 and SRR1, so we need to restore them
> + *   to what they were before the interrupt;
> + *
> + * - If the stepped instruction wrote to the SRRs, we need to avoid
> + *   undoing that;
> + *
> + * - We set the MSR_SE bit, so it needs to be cleared.
> + *
> + */
> +void kvm_arch_emulate_singlestep(CPUState *cs, int enabled)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    CPUPPCState *env = &cpu->env;
> +    uint32_t insn;
> +    bool rfid, mmu_on;
> +
> +    if (!enabled) {
> +        return;
> +    }
> +
> +    if (env->sstep_kind != SSTEP_PENDING) {
> +        env->sstep_kind = SSTEP_REGULAR;
> +    }
> +
> +    cpu_synchronize_state(cs);
> +    insn = ppc_gdb_read_insn(cs, env->nip);
> +
> +    /*
> +     * rfid needs special handling because it:
> +     *   - overwrites NIP with SRR0;
> +     *   - overwrites MSR with SRR1;
> +     *   - cannot be single stepped.
> +     */
> +    rfid = ppc_gdb_get_op(insn) == OP_RFID && ppc_gdb_get_xop(insn) == XOP_RFID;
> +
> +    if (rfid && (kvm_find_sw_breakpoint(cs, env->spr[SPR_SRR0]) ||
> +                 env->sstep_kind == SSTEP_PENDING)) {
> +        /*
> +         * There is a breakpoint at the next instruction address or a
> +         * pending step. It will already cause the vm exit we need for
> +         * the single step, so there's nothing to be done.
> +         */
> +        env->sstep_kind = SSTEP_REGULAR;
> +        return;
> +    }
> +
> +    /*
> +     * Save the registers that will be affected by the single step
> +     * mechanism. These will be used after the step.
> +     */
> +    env->sstep_srr0 = env->spr[SPR_SRR0];
> +    env->sstep_srr1 = env->spr[SPR_SRR1];
> +    env->sstep_insn = insn;
> +
> +    /*
> +     * MSR_SE = 1 will cause a Trace Interrupt in the guest after the
> +     * next instruction executes. If this is a rfid, use SRR1 instead
> +     * of MSR.
> +     */
> +    if (rfid) {
> +        if ((env->spr[SPR_SRR1] >> MSR_SE) & 1) {
> +            /*
> +             * The guest is doing a single step itself. Make sure we
> +             * restore it later.
> +             */
> +            env->sstep_kind = SSTEP_GUEST;
> +        }
> +
> +        env->spr[SPR_SRR1] |= (1ULL << MSR_SE);
> +        mmu_on = srr1_ir & srr1_dr;

s/&/&&/

> +    } else {
> +        env->msr |= (1ULL << MSR_SE);
> +        mmu_on = msr_ir & msr_dr;

s/&/&&/

Also, what happens if the guest is using MSR[DR] != MSR[IR]?  It's
rare, but it is occasionally useful.

> +    }
> +
> +    kvm_insert_singlestep_breakpoint(cs, mmu_on);
> +}
> +
> +void kvm_singlestep_ail_change(CPUState *cs)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    CPUPPCState *env = &cpu->env;
> +
> +    if (kvm_arch_can_singlestep(cs)) {
> +        return;
> +    }
> +
> +    /*
> +     * The instruction being stepped altered the interrupt vectors
> +     * location (AIL). Re-insert the single step breakpoint at the new
> +     * location
> +     */
> +    kvm_remove_breakpoint(cs, env->trace_handler_addr, 4, GDB_BREAKPOINT_SW);
> +    kvm_insert_singlestep_breakpoint(cs, (msr_ir & msr_dr));

s/&/&&/

> +}
> +
>  void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
>  {
>      int n;
> @@ -1585,6 +1781,98 @@ void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
>      }
>  }
>  
> +/* Revert any side-effects caused during single step */
> +static void restore_singlestep_env(CPUState *cs)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    CPUPPCState *env = &cpu->env;
> +    uint32_t insn = env->sstep_insn;
> +    int reg;
> +    int spr;
> +
> +    env->spr[SPR_SRR0] = env->sstep_srr0;
> +    env->spr[SPR_SRR1] = env->sstep_srr1;
> +
> +    if (ppc_gdb_get_op(insn) != OP_MOV) {
> +        return;
> +    }
> +
> +    reg = ppc_gdb_get_rt(insn);
> +
> +    switch (ppc_gdb_get_xop(insn)) {
> +    case XOP_MTSPR:
> +        /*
> +         * mtspr: the guest altered the SRR, so do not use the
> +         *        pre-step value.
> +         */
> +        spr = ppc_gdb_get_spr(insn);
> +        if (spr == SPR_SRR0 || spr == SPR_SRR1) {
> +            env->spr[spr] = env->gpr[reg];
> +        }
> +        break;
> +    case XOP_MFMSR:
> +        /*
> +         * mfmsr: clear MSR_SE bit to avoid the guest knowing
> +         *         that it is being single-stepped.
> +         */
> +        env->gpr[reg] &= ~(1ULL << MSR_SE);

Don't you need to check for the case where the guest also thinks it is
single stepping here?

> +        break;
> +    }
> +}
> +
> +static int kvm_handle_singlestep(CPUState *cs, target_ulong address)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    CPUPPCState *env = &cpu->env;
> +
> +    if (kvm_arch_can_singlestep(cs)) {
> +        return DEBUG_RETURN_GDB;
> +    }
> +
> +    cpu_synchronize_state(cs);
> +
> +    if (address == env->trace_handler_addr) {
> +        kvm_remove_breakpoint(cs, env->trace_handler_addr, 4,
> +                              GDB_BREAKPOINT_SW);
> +
> +        if (env->sstep_kind == SSTEP_GUEST) {
> +            /*
> +             * The guest expects the last instruction to have caused a
> +             * single step, go back into the interrupt handler.
> +             */
> +            env->sstep_kind = SSTEP_REGULAR;
> +            return DEBUG_RETURN_GDB;
> +        }
> +
> +        env->nip = env->spr[SPR_SRR0];
> +        /* Bits 33-36, 43-47 are set by the interrupt */
> +        env->msr = env->spr[SPR_SRR1] & ~(1ULL << MSR_SE |
> +                                          PPC_BITMASK(33, 36) |
> +                                          PPC_BITMASK(43, 47));
> +        restore_singlestep_env(cs);
> +
> +    } else if (address == env->trace_handler_addr + 4) {
> +        /*
> +         * A step at trace_handler_addr would interfere with the
> +         * single step mechanism itself, so we have previously
> +         * displaced the breakpoint to the next instruction.
> +         */
> +        kvm_remove_breakpoint(cs, env->trace_handler_addr + 4, 4,
> +                              GDB_BREAKPOINT_SW);
> +        restore_singlestep_env(cs);
> +    } else {
> +        /*
> +         * Another interrupt (e.g. system call, program interrupt)
> +         * took us somewhere else in the code and we hit a breakpoint
> +         * there. Mark the step as pending.
> +         */
> +        env->sstep_kind = SSTEP_PENDING;
> +    }
> +
> +    cs->singlestep_enabled = false;
> +    return DEBUG_RETURN_GDB;
> +}
> +
>  static int kvm_handle_hw_breakpoint(CPUState *cs,
>                                      struct kvm_debug_exit_arch *arch_info)
>  {
> @@ -1612,13 +1900,41 @@ static int kvm_handle_hw_breakpoint(CPUState *cs,
>      return handle;
>  }
>  
> -static int kvm_handle_singlestep(void)
> +static int kvm_handle_sw_breakpoint(CPUState *cs, target_ulong address)
>  {
> -    return DEBUG_RETURN_GDB;
> -}
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    CPUPPCState *env = &cpu->env;
> +
> +    if (kvm_arch_can_singlestep(cs)) {
> +        return DEBUG_RETURN_GDB;
> +    }
> +
> +    cpu_synchronize_state(cs);
> +
> +    if (address == env->trace_handler_addr) {
> +        if (env->sstep_kind == SSTEP_PENDING) {
> +            /*
> +             * Although we have singlestep_enabled == false by now,
> +             * the original step still wasn't handled (is pending).
> +             */
> +            cs->singlestep_enabled = true;
> +            env->sstep_kind = SSTEP_REGULAR;
> +
> +            return kvm_handle_singlestep(cs, address);
> +        }
> +
> +        CPU_FOREACH(cs) {
> +            if (cs->singlestep_enabled) {
> +                /*
> +                 * We hit this breakpoint while another cpu is doing a
> +                 * software single step. Go back into the guest to
> +                 * give chance for the single step to finish.
> +                 */
> +                return DEBUG_RETURN_GUEST;
> +            }
> +        }
> +    }
>  
> -static int kvm_handle_sw_breakpoint(void)
> -{
>      return DEBUG_RETURN_GDB;
>  }
>  
> @@ -1629,7 +1945,7 @@ static int kvm_handle_debug(PowerPCCPU *cpu, struct kvm_run *run)
>      struct kvm_debug_exit_arch *arch_info = &run->debug.arch;
>  
>      if (cs->singlestep_enabled) {
> -        return kvm_handle_singlestep();
> +        return kvm_handle_singlestep(cs, arch_info->address);
>      }
>  
>      if (arch_info->status) {
> @@ -1637,7 +1953,7 @@ static int kvm_handle_debug(PowerPCCPU *cpu, struct kvm_run *run)
>      }
>  
>      if (kvm_find_sw_breakpoint(cs, arch_info->address)) {
> -        return kvm_handle_sw_breakpoint();
> +        return kvm_handle_sw_breakpoint(cs, arch_info->address);
>      }
>  
>      /*
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index f22daabf51..dc1b6df422 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -81,7 +81,7 @@ bool kvmppc_hpt_needs_host_contiguous_pages(void);
>  void kvm_check_mmu(PowerPCCPU *cpu, Error **errp);
>  void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online);
>  void kvmppc_set_reg_tb_offset(PowerPCCPU *cpu, int64_t tb_offset);
> -
> +void kvm_singlestep_ail_change(CPUState *cs);
>  #else
>  
>  static inline uint32_t kvmppc_get_tbfreq(void)
> @@ -211,6 +211,10 @@ static inline void kvmppc_set_reg_tb_offset(PowerPCCPU *cpu, int64_t tb_offset)
>  {
>  }
>  
> +static inline void kvm_singlestep_ail_change(CPUState *cs)
> +{
> +}
> +
>  #ifndef CONFIG_USER_ONLY
>  static inline bool kvmppc_spapr_use_multitce(void)
>  {
Fabiano Rosas Jan. 20, 2020, 8:11 p.m. UTC | #2
David Gibson <david@gibson.dropbear.id.au> writes:

>> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
>> index 103bfe9dc2..b69f8565aa 100644
>> --- a/target/ppc/cpu.h
>> +++ b/target/ppc/cpu.h
>> @@ -440,6 +440,10 @@ typedef struct ppc_v3_pate_t {
>>  #define msr_ts   ((env->msr >> MSR_TS1)  & 3)
>>  #define msr_tm   ((env->msr >> MSR_TM)   & 1)
>>  
>> +#define srr1_ir ((env->spr[SPR_SRR1] >> MSR_IR) & 1)
>> +#define srr1_dr ((env->spr[SPR_SRR1] >> MSR_DR) & 1)
>> +#define srr1_se ((env->spr[SPR_SRR1] >> MSR_SE) & 1)
>
> I'd prefer not to introduce these.  The msr_xx macros are already kind
> of dubious because they assume the meaning of 'env' in the context
> they're used.  I'm ok to use them because they're so useful, so
> often.  These srr1 variants however are used in far fewer situations.
> So, I'd prefer to just open code these as (env->spr[SPR_SRR1] &
> MSR_IR) in the relatively few places they're used for now.
>

Ok. No problem.

>>  #define DBCR0_ICMP (1 << 27)
>>  #define DBCR0_BRT (1 << 26)
>>  #define DBSR_ICMP (1 << 27)
>> @@ -1158,6 +1162,16 @@ struct CPUPPCState {
>>      uint32_t tm_vscr;
>>      uint64_t tm_dscr;
>>      uint64_t tm_tar;
>> +
>> +    /* Used for software single step */
>> +    target_ulong sstep_srr0;
>> +    target_ulong sstep_srr1;
>> +    target_ulong sstep_insn;
>> +    target_ulong trace_handler_addr;
>> +    int sstep_kind;
>
> Won't you need to migrate this extra state, at least some of the time?
>

Ah. I haven't looked into this yet. Will do that for the next
version. Need to learn a bit about migration first.

>> +#define SSTEP_REGULAR 0
>> +#define SSTEP_PENDING 1
>> +#define SSTEP_GUEST   2
>
> Some comments on what these modes mean would be useful.
>

Ok.

>> +static uint32_t ppc_gdb_read_insn(CPUState *cs, target_ulong addr)
>> +{
>> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +    CPUPPCState *env = &cpu->env;
>> +    uint32_t insn;
>> +
>> +    cpu_memory_rw_debug(cs, addr, (uint8_t *)&insn, sizeof(insn), 0);
>> +
>> +    if (msr_le) {
>> +        return ldl_le_p(&insn);
>> +    } else {
>> +        return ldl_be_p(&insn);
>> +    }
>
> I think you can just use cpu_ldl_code() for this.
>

I'll look into it.

>> +static void kvm_insert_singlestep_breakpoint(CPUState *cs, bool mmu_on)
>> +{
>> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +    CPUPPCState *env = &cpu->env;
>> +    target_ulong bp_addr;
>> +    target_ulong saved_msr = env->msr;
>> +
>> +    bp_addr = ppc_get_trace_int_handler_addr(cs, mmu_on);
>> +    if (env->nip == bp_addr) {
>> +        /*
>> +         * We are trying to step the interrupt handler address itself;
>> +         * move the breakpoint to the next instruction.
>> +         */
>> +        bp_addr += 4;
>
> What if the first instruction of the interrupt handler is a branch?
>

Well, I need to displace the breakpoint somehow. I don't think I can
handle this particular case without having _some_ knowledge of the
handler's code. The ones I've seen so far don't have a branch as first
instruction.

>> +    }
>> +
>> +    /*
>> +     * The actual access by the guest might be made in a mode
>> +     * different than we are now (due to rfid) so temporarily set the
>> +     * MSR to reflect that during the breakpoint placement.
>> +     *
>> +     * Note: we need this because breakpoints currently use
>> +     * cpu_memory_rw_debug, which does the memory accesses from the
>> +     * guest point of view.
>> +     */
>> +    if ((msr_ir & msr_dr) != mmu_on) {
>
> Should be msr_ir && msr_dr - you only get away with bitwise and by
> accident.
>

Ack.

>> +        if (mmu_on) {
>> +            env->msr |= (1ULL << MSR_IR | 1ULL << MSR_DR);
>> +        } else {
>> +            env->msr &= ~(1ULL << MSR_IR | 1ULL << MSR_DR);
>> +        }
>> +    }
>
> Wouldn't it be simpler to unconditionally set env->msr based on
> mmu_on.
>

Yes.

>> +
>> +    kvm_insert_breakpoint(cs, bp_addr, 4, GDB_BREAKPOINT_SW);
>
> Hrm.... I don't actually see how changing env->msr helps you here.
> AFAICT if kvm_insert_breakpoint() resolves to kvm_arch_sw_breakpoint()
> it doesn't rely on the MSR value at all.  If it resolves to
> kvm_arch_hw_breakpoint(), then it looks like it just stashes
> information to be pushed into KVM when we re-enter the guest.  None of
> the information stashed appears to depend on the current MSR, and once
> we re-enter the MSR will already have been restored.
>

This is the call chain:

kvm_arch_insert_sw_breakpoint -> cpu_memory_rw_debug ->
cpu_get_phys_page_attrs_debug -> ppc_cpu_get_phys_page_debug ->
ppc64_v3_get_phys_page_debug -> ppc_radix64_get_phys_page_debug:

    /* Handle Real Mode */
    if (msr_dr == 0) {
        /* In real mode top 4 effective addr bits (mostly) ignored */
        return eaddr & 0x0FFFFFFFFFFFFFFFULL;
    }


Actually, I think there is a bug after ppc_cpu_get_phys_page_debug
somewhere. There are some cases where GDB wants to read/write to some
memory, but it gets denied access. Presumably because of one such
discrepancy as the one above. I need to spend more time looking at this
to define the problem properly, though.

>> +    /*
>> +     * MSR_SE = 1 will cause a Trace Interrupt in the guest after the
>> +     * next instruction executes. If this is a rfid, use SRR1 instead
>> +     * of MSR.
>> +     */
>> +    if (rfid) {
>> +        if ((env->spr[SPR_SRR1] >> MSR_SE) & 1) {
>> +            /*
>> +             * The guest is doing a single step itself. Make sure we
>> +             * restore it later.
>> +             */
>> +            env->sstep_kind = SSTEP_GUEST;
>> +        }
>> +
>> +        env->spr[SPR_SRR1] |= (1ULL << MSR_SE);
>> +        mmu_on = srr1_ir & srr1_dr;
>
> s/&/&&/
>

Ack.

>> +    } else {
>> +        env->msr |= (1ULL << MSR_SE);
>> +        mmu_on = msr_ir & msr_dr;
>
> s/&/&&/
>

Ack.

> Also, what happens if the guest is using MSR[DR] != MSR[IR]?  It's
> rare, but it is occasionally useful.
>

I understand from the ISA that for the purposes of AIL, both bits need
to be set. So mmu_on = 0 is correct here.

>> +    }
>> +
>> +    kvm_insert_singlestep_breakpoint(cs, mmu_on);
>> +}
>> +
>> +void kvm_singlestep_ail_change(CPUState *cs)
>> +{
>> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +    CPUPPCState *env = &cpu->env;
>> +
>> +    if (kvm_arch_can_singlestep(cs)) {
>> +        return;
>> +    }
>> +
>> +    /*
>> +     * The instruction being stepped altered the interrupt vectors
>> +     * location (AIL). Re-insert the single step breakpoint at the new
>> +     * location
>> +     */
>> +    kvm_remove_breakpoint(cs, env->trace_handler_addr, 4, GDB_BREAKPOINT_SW);
>> +    kvm_insert_singlestep_breakpoint(cs, (msr_ir & msr_dr));
>
> s/&/&&/
>

Ack.

>> +}
>> +
>>  void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
>>  {
>>      int n;
>> @@ -1585,6 +1781,98 @@ void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
>>      }
>>  }
>>  
>> +/* Revert any side-effects caused during single step */
>> +static void restore_singlestep_env(CPUState *cs)
>> +{
>> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +    CPUPPCState *env = &cpu->env;
>> +    uint32_t insn = env->sstep_insn;
>> +    int reg;
>> +    int spr;
>> +
>> +    env->spr[SPR_SRR0] = env->sstep_srr0;
>> +    env->spr[SPR_SRR1] = env->sstep_srr1;
>> +
>> +    if (ppc_gdb_get_op(insn) != OP_MOV) {
>> +        return;
>> +    }
>> +
>> +    reg = ppc_gdb_get_rt(insn);
>> +
>> +    switch (ppc_gdb_get_xop(insn)) {
>> +    case XOP_MTSPR:
>> +        /*
>> +         * mtspr: the guest altered the SRR, so do not use the
>> +         *        pre-step value.
>> +         */
>> +        spr = ppc_gdb_get_spr(insn);
>> +        if (spr == SPR_SRR0 || spr == SPR_SRR1) {
>> +            env->spr[spr] = env->gpr[reg];
>> +        }
>> +        break;
>> +    case XOP_MFMSR:
>> +        /*
>> +         * mfmsr: clear MSR_SE bit to avoid the guest knowing
>> +         *         that it is being single-stepped.
>> +         */
>> +        env->gpr[reg] &= ~(1ULL << MSR_SE);
>
> Don't you need to check for the case where the guest also thinks it is
> single stepping here?
>

Hm. I had this in some previous version but removed it for some
reason. I'll review it.


Thanks
David Gibson Jan. 21, 2020, 3:32 a.m. UTC | #3
On Mon, Jan 20, 2020 at 05:11:50PM -0300, Fabiano Rosas wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> >> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> >> index 103bfe9dc2..b69f8565aa 100644
> >> --- a/target/ppc/cpu.h
> >> +++ b/target/ppc/cpu.h
> >> @@ -440,6 +440,10 @@ typedef struct ppc_v3_pate_t {
> >>  #define msr_ts   ((env->msr >> MSR_TS1)  & 3)
> >>  #define msr_tm   ((env->msr >> MSR_TM)   & 1)
> >>  
> >> +#define srr1_ir ((env->spr[SPR_SRR1] >> MSR_IR) & 1)
> >> +#define srr1_dr ((env->spr[SPR_SRR1] >> MSR_DR) & 1)
> >> +#define srr1_se ((env->spr[SPR_SRR1] >> MSR_SE) & 1)
> >
> > I'd prefer not to introduce these.  The msr_xx macros are already kind
> > of dubious because they assume the meaning of 'env' in the context
> > they're used.  I'm ok to use them because they're so useful, so
> > often.  These srr1 variants however are used in far fewer situations.
> > So, I'd prefer to just open code these as (env->spr[SPR_SRR1] &
> > MSR_IR) in the relatively few places they're used for now.
> >
> 
> Ok. No problem.
> 
> >>  #define DBCR0_ICMP (1 << 27)
> >>  #define DBCR0_BRT (1 << 26)
> >>  #define DBSR_ICMP (1 << 27)
> >> @@ -1158,6 +1162,16 @@ struct CPUPPCState {
> >>      uint32_t tm_vscr;
> >>      uint64_t tm_dscr;
> >>      uint64_t tm_tar;
> >> +
> >> +    /* Used for software single step */
> >> +    target_ulong sstep_srr0;
> >> +    target_ulong sstep_srr1;
> >> +    target_ulong sstep_insn;
> >> +    target_ulong trace_handler_addr;
> >> +    int sstep_kind;
> >
> > Won't you need to migrate this extra state, at least some of the time?
> >
> 
> Ah. I haven't looked into this yet. Will do that for the next
> version. Need to learn a bit about migration first.
> 
> >> +#define SSTEP_REGULAR 0
> >> +#define SSTEP_PENDING 1
> >> +#define SSTEP_GUEST   2
> >
> > Some comments on what these modes mean would be useful.
> >
> 
> Ok.
> 
> >> +static uint32_t ppc_gdb_read_insn(CPUState *cs, target_ulong addr)
> >> +{
> >> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> +    CPUPPCState *env = &cpu->env;
> >> +    uint32_t insn;
> >> +
> >> +    cpu_memory_rw_debug(cs, addr, (uint8_t *)&insn, sizeof(insn), 0);
> >> +
> >> +    if (msr_le) {
> >> +        return ldl_le_p(&insn);
> >> +    } else {
> >> +        return ldl_be_p(&insn);
> >> +    }
> >
> > I think you can just use cpu_ldl_code() for this.
> >
> 
> I'll look into it.
> 
> >> +static void kvm_insert_singlestep_breakpoint(CPUState *cs, bool mmu_on)
> >> +{
> >> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> +    CPUPPCState *env = &cpu->env;
> >> +    target_ulong bp_addr;
> >> +    target_ulong saved_msr = env->msr;
> >> +
> >> +    bp_addr = ppc_get_trace_int_handler_addr(cs, mmu_on);
> >> +    if (env->nip == bp_addr) {
> >> +        /*
> >> +         * We are trying to step the interrupt handler address itself;
> >> +         * move the breakpoint to the next instruction.
> >> +         */
> >> +        bp_addr += 4;
> >
> > What if the first instruction of the interrupt handler is a branch?
> >
> 
> Well, I need to displace the breakpoint somehow. I don't think I can
> handle this particular case without having _some_ knowledge of the
> handler's code. The ones I've seen so far don't have a branch as first
> instruction.

So, at least for unconditional branches it's possible - you could
parse the instruction and place your breakpoint on the target.  For
conditional branches it would be much harder, but they are much less
likely as the very first instruction on the interrupt vector.

I do think we should at least detect and flag some sort of error in
this situation though.

> >> +    }
> >> +
> >> +    /*
> >> +     * The actual access by the guest might be made in a mode
> >> +     * different than we are now (due to rfid) so temporarily set the
> >> +     * MSR to reflect that during the breakpoint placement.
> >> +     *
> >> +     * Note: we need this because breakpoints currently use
> >> +     * cpu_memory_rw_debug, which does the memory accesses from the
> >> +     * guest point of view.
> >> +     */
> >> +    if ((msr_ir & msr_dr) != mmu_on) {
> >
> > Should be msr_ir && msr_dr - you only get away with bitwise and by
> > accident.
> >
> 
> Ack.
> 
> >> +        if (mmu_on) {
> >> +            env->msr |= (1ULL << MSR_IR | 1ULL << MSR_DR);
> >> +        } else {
> >> +            env->msr &= ~(1ULL << MSR_IR | 1ULL << MSR_DR);
> >> +        }
> >> +    }
> >
> > Wouldn't it be simpler to unconditionally set env->msr based on
> > mmu_on.
> >
> 
> Yes.
> 
> >> +
> >> +    kvm_insert_breakpoint(cs, bp_addr, 4, GDB_BREAKPOINT_SW);
> >
> > Hrm.... I don't actually see how changing env->msr helps you here.
> > AFAICT if kvm_insert_breakpoint() resolves to kvm_arch_sw_breakpoint()
> > it doesn't rely on the MSR value at all.  If it resolves to
> > kvm_arch_hw_breakpoint(), then it looks like it just stashes
> > information to be pushed into KVM when we re-enter the guest.  None of
> > the information stashed appears to depend on the current MSR, and once
> > we re-enter the MSR will already have been restored.
> >
>
> This is the call chain:
> 
> kvm_arch_insert_sw_breakpoint -> cpu_memory_rw_debug ->
> cpu_get_phys_page_attrs_debug -> ppc_cpu_get_phys_page_debug ->
> ppc64_v3_get_phys_page_debug -> ppc_radix64_get_phys_page_debug:
> 
>     /* Handle Real Mode */
>     if (msr_dr == 0) {
>         /* In real mode top 4 effective addr bits (mostly) ignored */
>         return eaddr & 0x0FFFFFFFFFFFFFFFULL;
>     }

Ah, right.  Basically the issue is that kvm_insert_breakpoint() takes
an effective address, not a real address, but it might be happening in
a different context than we're executing right now.

Ok, that makes sense.  Though.. aren't you always inserting the
breakpoint into an interrupt vector?  So wouldn't it always be MMU
off?  Under what circumstances would this get called with mmu_on =
true?

> Actually, I think there is a bug after ppc_cpu_get_phys_page_debug
> somewhere. There are some cases where GDB wants to read/write to some
> memory, but it gets denied access. Presumably because of one such
> discrepancy as the one above. I need to spend more time looking at this
> to define the problem properly, though.

Hm, ok.

> >> +    /*
> >> +     * MSR_SE = 1 will cause a Trace Interrupt in the guest after the
> >> +     * next instruction executes. If this is a rfid, use SRR1 instead
> >> +     * of MSR.
> >> +     */
> >> +    if (rfid) {
> >> +        if ((env->spr[SPR_SRR1] >> MSR_SE) & 1) {
> >> +            /*
> >> +             * The guest is doing a single step itself. Make sure we
> >> +             * restore it later.
> >> +             */
> >> +            env->sstep_kind = SSTEP_GUEST;
> >> +        }
> >> +
> >> +        env->spr[SPR_SRR1] |= (1ULL << MSR_SE);
> >> +        mmu_on = srr1_ir & srr1_dr;
> >
> > s/&/&&/
> >
> 
> Ack.
> 
> >> +    } else {
> >> +        env->msr |= (1ULL << MSR_SE);
> >> +        mmu_on = msr_ir & msr_dr;
> >
> > s/&/&&/
> >
> 
> Ack.
> 
> > Also, what happens if the guest is using MSR[DR] != MSR[IR]?  It's
> > rare, but it is occasionally useful.
> 
> I understand from the ISA that for the purposes of AIL, both bits need
> to be set. So mmu_on = 0 is correct here.

I'm not sure what you mean by "for the purposes of AIL".

> 
> >> +    }
> >> +
> >> +    kvm_insert_singlestep_breakpoint(cs, mmu_on);
> >> +}
> >> +
> >> +void kvm_singlestep_ail_change(CPUState *cs)
> >> +{
> >> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> +    CPUPPCState *env = &cpu->env;
> >> +
> >> +    if (kvm_arch_can_singlestep(cs)) {
> >> +        return;
> >> +    }
> >> +
> >> +    /*
> >> +     * The instruction being stepped altered the interrupt vectors
> >> +     * location (AIL). Re-insert the single step breakpoint at the new
> >> +     * location
> >> +     */
> >> +    kvm_remove_breakpoint(cs, env->trace_handler_addr, 4, GDB_BREAKPOINT_SW);
> >> +    kvm_insert_singlestep_breakpoint(cs, (msr_ir & msr_dr));
> >
> > s/&/&&/
> >
> 
> Ack.
> 
> >> +}
> >> +
> >>  void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
> >>  {
> >>      int n;
> >> @@ -1585,6 +1781,98 @@ void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
> >>      }
> >>  }
> >>  
> >> +/* Revert any side-effects caused during single step */
> >> +static void restore_singlestep_env(CPUState *cs)
> >> +{
> >> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> +    CPUPPCState *env = &cpu->env;
> >> +    uint32_t insn = env->sstep_insn;
> >> +    int reg;
> >> +    int spr;
> >> +
> >> +    env->spr[SPR_SRR0] = env->sstep_srr0;
> >> +    env->spr[SPR_SRR1] = env->sstep_srr1;
> >> +
> >> +    if (ppc_gdb_get_op(insn) != OP_MOV) {
> >> +        return;
> >> +    }
> >> +
> >> +    reg = ppc_gdb_get_rt(insn);
> >> +
> >> +    switch (ppc_gdb_get_xop(insn)) {
> >> +    case XOP_MTSPR:
> >> +        /*
> >> +         * mtspr: the guest altered the SRR, so do not use the
> >> +         *        pre-step value.
> >> +         */
> >> +        spr = ppc_gdb_get_spr(insn);
> >> +        if (spr == SPR_SRR0 || spr == SPR_SRR1) {
> >> +            env->spr[spr] = env->gpr[reg];
> >> +        }
> >> +        break;
> >> +    case XOP_MFMSR:
> >> +        /*
> >> +         * mfmsr: clear MSR_SE bit to avoid the guest knowing
> >> +         *         that it is being single-stepped.
> >> +         */
> >> +        env->gpr[reg] &= ~(1ULL << MSR_SE);
> >
> > Don't you need to check for the case where the guest also thinks it is
> > single stepping here?
> >
> 
> Hm. I had this in some previous version but removed it for some
> reason. I'll review it.
> 
> 
> Thanks
>
Fabiano Rosas Jan. 21, 2020, 8:23 p.m. UTC | #4
David Gibson <david@gibson.dropbear.id.au> writes:

(...)
>> > Hrm.... I don't actually see how changing env->msr helps you here.
>> > AFAICT if kvm_insert_breakpoint() resolves to kvm_arch_sw_breakpoint()
>> > it doesn't rely on the MSR value at all.  If it resolves to
>> > kvm_arch_hw_breakpoint(), then it looks like it just stashes
>> > information to be pushed into KVM when we re-enter the guest.  None of
>> > the information stashed appears to depend on the current MSR, and once
>> > we re-enter the MSR will already have been restored.
>> >
>>
>> This is the call chain:
>> 
>> kvm_arch_insert_sw_breakpoint -> cpu_memory_rw_debug ->
>> cpu_get_phys_page_attrs_debug -> ppc_cpu_get_phys_page_debug ->
>> ppc64_v3_get_phys_page_debug -> ppc_radix64_get_phys_page_debug:
>> 
>>     /* Handle Real Mode */
>>     if (msr_dr == 0) {
>>         /* In real mode top 4 effective addr bits (mostly) ignored */
>>         return eaddr & 0x0FFFFFFFFFFFFFFFULL;
>>     }
>
> Ah, right.  Basically the issue is that kvm_insert_breakpoint() takes
> an effective address, not a real address, but it might be happening in
> a different context than we're executing right now.
>
> Ok, that makes sense.  Though.. aren't you always inserting the
> breakpoint into an interrupt vector?  So wouldn't it always be MMU
> off?  Under what circumstances would this get called with mmu_on =
> true?
>

Well, the MSR state at the moment of the breakpoint is that of the
currently executing instruction. So this gets called with mmu_on = true
very often because we're often debugging code than runs with IR|DR=1.

However, we could be at a point when IR|DR=1, but the next traced
instruction will execute with IR|DR=0. This happens at the rfid at the
end of __enter_rtas, for instance.

So ppc_radix64_get_phys_page_debug will check the MSR, see that we are
(now) not in real mode and proceed with the page table walk, which could
fail.

In the particular case of the __enter_rtas rfid, we have PIDR=1 [1] so
if we don't exit ppc_radix64_get_phys_page_debug at the msr_dr == 0
check, it will fail to translate the address.

1 - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=eeb715c3e995fbdda0cc05e61216c6c5609bce66

>> Actually, I think there is a bug after ppc_cpu_get_phys_page_debug
>> somewhere. There are some cases where GDB wants to read/write to some
>> memory, but it gets denied access. Presumably because of one such
>> discrepancy as the one above. I need to spend more time looking at this
>> to define the problem properly, though.
>
> Hm, ok.
>
>> >> +    /*
>> >> +     * MSR_SE = 1 will cause a Trace Interrupt in the guest after the
>> >> +     * next instruction executes. If this is a rfid, use SRR1 instead
>> >> +     * of MSR.
>> >> +     */
>> >> +    if (rfid) {
>> >> +        if ((env->spr[SPR_SRR1] >> MSR_SE) & 1) {
>> >> +            /*
>> >> +             * The guest is doing a single step itself. Make sure we
>> >> +             * restore it later.
>> >> +             */
>> >> +            env->sstep_kind = SSTEP_GUEST;
>> >> +        }
>> >> +
>> >> +        env->spr[SPR_SRR1] |= (1ULL << MSR_SE);
>> >> +        mmu_on = srr1_ir & srr1_dr;
>> >
>> > s/&/&&/
>> >
>> 
>> Ack.
>> 
>> >> +    } else {
>> >> +        env->msr |= (1ULL << MSR_SE);
>> >> +        mmu_on = msr_ir & msr_dr;
>> >
>> > s/&/&&/
>> >
>> 
>> Ack.
>> 
>> > Also, what happens if the guest is using MSR[DR] != MSR[IR]?  It's
>> > rare, but it is occasionally useful.
>> 
>> I understand from the ISA that for the purposes of AIL, both bits need
>> to be set. So mmu_on = 0 is correct here.
>
> I'm not sure what you mean by "for the purposes of AIL".
>

The reason I'm tracking the translation state here is to be able to tell
what will be the value of AIL, since an Alternate Interrupt Location is
not used when translation is disabled. In the ISA, under "Alternate
Interrupt Location" the only mention of MSR_IR != MSR_DR is:

"Other interrupts that occur when MSR IR=0 or MSR DR=0, are taken as
if LPCR AIL=0."

and my interpretation of that text is that AIL value is 0 when IR DR are
either 0b00, 0b01 or 0b10.

So "for the purposes of AIL", I'm considering either IR=0 or DR=0 as
meaning MMU off.

>> 
>> >> +    }
>> >> +
>> >> +    kvm_insert_singlestep_breakpoint(cs, mmu_on);
>> >> +}
>> >> +
>> >> +void kvm_singlestep_ail_change(CPUState *cs)
>> >> +{
>> >> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
>> >> +    CPUPPCState *env = &cpu->env;
>> >> +
>> >> +    if (kvm_arch_can_singlestep(cs)) {
>> >> +        return;
>> >> +    }
>> >> +
>> >> +    /*
>> >> +     * The instruction being stepped altered the interrupt vectors
>> >> +     * location (AIL). Re-insert the single step breakpoint at the new
>> >> +     * location
>> >> +     */
>> >> +    kvm_remove_breakpoint(cs, env->trace_handler_addr, 4, GDB_BREAKPOINT_SW);
>> >> +    kvm_insert_singlestep_breakpoint(cs, (msr_ir & msr_dr));
>> >
>> > s/&/&&/
>> >
>> 
>> Ack.
>> 
>> >> +}
>> >> +
>> >>  void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
>> >>  {
>> >>      int n;
>> >> @@ -1585,6 +1781,98 @@ void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
>> >>      }
>> >>  }
>> >>  
>> >> +/* Revert any side-effects caused during single step */
>> >> +static void restore_singlestep_env(CPUState *cs)
>> >> +{
>> >> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
>> >> +    CPUPPCState *env = &cpu->env;
>> >> +    uint32_t insn = env->sstep_insn;
>> >> +    int reg;
>> >> +    int spr;
>> >> +
>> >> +    env->spr[SPR_SRR0] = env->sstep_srr0;
>> >> +    env->spr[SPR_SRR1] = env->sstep_srr1;
>> >> +
>> >> +    if (ppc_gdb_get_op(insn) != OP_MOV) {
>> >> +        return;
>> >> +    }
>> >> +
>> >> +    reg = ppc_gdb_get_rt(insn);
>> >> +
>> >> +    switch (ppc_gdb_get_xop(insn)) {
>> >> +    case XOP_MTSPR:
>> >> +        /*
>> >> +         * mtspr: the guest altered the SRR, so do not use the
>> >> +         *        pre-step value.
>> >> +         */
>> >> +        spr = ppc_gdb_get_spr(insn);
>> >> +        if (spr == SPR_SRR0 || spr == SPR_SRR1) {
>> >> +            env->spr[spr] = env->gpr[reg];
>> >> +        }
>> >> +        break;
>> >> +    case XOP_MFMSR:
>> >> +        /*
>> >> +         * mfmsr: clear MSR_SE bit to avoid the guest knowing
>> >> +         *         that it is being single-stepped.
>> >> +         */
>> >> +        env->gpr[reg] &= ~(1ULL << MSR_SE);
>> >
>> > Don't you need to check for the case where the guest also thinks it is
>> > single stepping here?
>> >
>> 
>> Hm. I had this in some previous version but removed it for some
>> reason. I'll review it.
>> 
>> 
>> Thanks
>>
David Gibson Jan. 22, 2020, 3:11 a.m. UTC | #5
On Tue, Jan 21, 2020 at 05:23:03PM -0300, Fabiano Rosas wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> (...)
> >> > Hrm.... I don't actually see how changing env->msr helps you here.
> >> > AFAICT if kvm_insert_breakpoint() resolves to kvm_arch_sw_breakpoint()
> >> > it doesn't rely on the MSR value at all.  If it resolves to
> >> > kvm_arch_hw_breakpoint(), then it looks like it just stashes
> >> > information to be pushed into KVM when we re-enter the guest.  None of
> >> > the information stashed appears to depend on the current MSR, and once
> >> > we re-enter the MSR will already have been restored.
> >> >
> >>
> >> This is the call chain:
> >> 
> >> kvm_arch_insert_sw_breakpoint -> cpu_memory_rw_debug ->
> >> cpu_get_phys_page_attrs_debug -> ppc_cpu_get_phys_page_debug ->
> >> ppc64_v3_get_phys_page_debug -> ppc_radix64_get_phys_page_debug:
> >> 
> >>     /* Handle Real Mode */
> >>     if (msr_dr == 0) {
> >>         /* In real mode top 4 effective addr bits (mostly) ignored */
> >>         return eaddr & 0x0FFFFFFFFFFFFFFFULL;
> >>     }
> >
> > Ah, right.  Basically the issue is that kvm_insert_breakpoint() takes
> > an effective address, not a real address, but it might be happening in
> > a different context than we're executing right now.
> >
> > Ok, that makes sense.  Though.. aren't you always inserting the
> > breakpoint into an interrupt vector?  So wouldn't it always be MMU
> > off?  Under what circumstances would this get called with mmu_on =
> > true?
> 
> Well, the MSR state at the moment of the breakpoint is that of the
> currently executing instruction.

Uh... at the moment of setting the breakpoint, or the moment of
hitting the breakpoint.

> So this gets called with mmu_on = true
> very often because we're often debugging code than runs with
> IR|DR=1.

Uh... but isn't the whole point here that the state of mmu_on might
not match the MSR state.  So the two sentences above don't seem to
mesh together.

What I think I'm understanding from the code is that in order to *set*
the breakpoint, you need to set up the MSR to match what you expect it
will be when you *hit* the breakpoint.  Yes?

But since the breakpoint is always placed in an interrupt vector,
won't that always be real mode?  Or is this one of the vectors that
can be entered in virtual mode on recent chips?

> However, we could be at a point when IR|DR=1, but the next traced
> instruction will execute with IR|DR=0. This happens at the rfid at the
> end of __enter_rtas, for instance.
> 
> So ppc_radix64_get_phys_page_debug will check the MSR, see that we are
> (now) not in real mode and proceed with the page table walk, which could
> fail.
> 
> In the particular case of the __enter_rtas rfid, we have PIDR=1 [1] so
> if we don't exit ppc_radix64_get_phys_page_debug at the msr_dr == 0
> check, it will fail to translate the address.
> 
> 1 - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=eeb715c3e995fbdda0cc05e61216c6c5609bce66
> 
> >> Actually, I think there is a bug after ppc_cpu_get_phys_page_debug
> >> somewhere. There are some cases where GDB wants to read/write to some
> >> memory, but it gets denied access. Presumably because of one such
> >> discrepancy as the one above. I need to spend more time looking at this
> >> to define the problem properly, though.
> >
> > Hm, ok.
> >
> >> >> +    /*
> >> >> +     * MSR_SE = 1 will cause a Trace Interrupt in the guest after the
> >> >> +     * next instruction executes. If this is a rfid, use SRR1 instead
> >> >> +     * of MSR.
> >> >> +     */
> >> >> +    if (rfid) {
> >> >> +        if ((env->spr[SPR_SRR1] >> MSR_SE) & 1) {
> >> >> +            /*
> >> >> +             * The guest is doing a single step itself. Make sure we
> >> >> +             * restore it later.
> >> >> +             */
> >> >> +            env->sstep_kind = SSTEP_GUEST;
> >> >> +        }
> >> >> +
> >> >> +        env->spr[SPR_SRR1] |= (1ULL << MSR_SE);
> >> >> +        mmu_on = srr1_ir & srr1_dr;
> >> >
> >> > s/&/&&/
> >> >
> >> 
> >> Ack.
> >> 
> >> >> +    } else {
> >> >> +        env->msr |= (1ULL << MSR_SE);
> >> >> +        mmu_on = msr_ir & msr_dr;
> >> >
> >> > s/&/&&/
> >> >
> >> 
> >> Ack.
> >> 
> >> > Also, what happens if the guest is using MSR[DR] != MSR[IR]?  It's
> >> > rare, but it is occasionally useful.
> >> 
> >> I understand from the ISA that for the purposes of AIL, both bits need
> >> to be set. So mmu_on = 0 is correct here.
> >
> > I'm not sure what you mean by "for the purposes of AIL".
> >
> 
> The reason I'm tracking the translation state here is to be able to tell
> what will be the value of AIL, since an Alternate Interrupt Location is
> not used when translation is disabled. In the ISA, under "Alternate
> Interrupt Location" the only mention of MSR_IR != MSR_DR is:
> 
> "Other interrupts that occur when MSR IR=0 or MSR DR=0, are taken as
> if LPCR AIL=0."
> 
> and my interpretation of that text is that AIL value is 0 when IR DR are
> either 0b00, 0b01 or 0b10.
> 
> So "for the purposes of AIL", I'm considering either IR=0 or DR=0 as
> meaning MMU off.

But AFAICT the mmu_on flag you're setting here has influences other
than tracking AIL.
Fabiano Rosas Jan. 22, 2020, 7:34 p.m. UTC | #6
David Gibson <david@gibson.dropbear.id.au> writes:

> On Tue, Jan 21, 2020 at 05:23:03PM -0300, Fabiano Rosas wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>> 
>> (...)
>> >> > Hrm.... I don't actually see how changing env->msr helps you here.
>> >> > AFAICT if kvm_insert_breakpoint() resolves to kvm_arch_sw_breakpoint()
>> >> > it doesn't rely on the MSR value at all.  If it resolves to
>> >> > kvm_arch_hw_breakpoint(), then it looks like it just stashes
>> >> > information to be pushed into KVM when we re-enter the guest.  None of
>> >> > the information stashed appears to depend on the current MSR, and once
>> >> > we re-enter the MSR will already have been restored.
>> >> >
>> >>
>> >> This is the call chain:
>> >> 
>> >> kvm_arch_insert_sw_breakpoint -> cpu_memory_rw_debug ->
>> >> cpu_get_phys_page_attrs_debug -> ppc_cpu_get_phys_page_debug ->
>> >> ppc64_v3_get_phys_page_debug -> ppc_radix64_get_phys_page_debug:
>> >> 
>> >>     /* Handle Real Mode */
>> >>     if (msr_dr == 0) {
>> >>         /* In real mode top 4 effective addr bits (mostly) ignored */
>> >>         return eaddr & 0x0FFFFFFFFFFFFFFFULL;
>> >>     }
>> >
>> > Ah, right.  Basically the issue is that kvm_insert_breakpoint() takes
>> > an effective address, not a real address, but it might be happening in
>> > a different context than we're executing right now.
>> >
>> > Ok, that makes sense.  Though.. aren't you always inserting the
>> > breakpoint into an interrupt vector?  So wouldn't it always be MMU
>> > off?  Under what circumstances would this get called with mmu_on =
>> > true?
>> 
>> Well, the MSR state at the moment of the breakpoint is that of the
>> currently executing instruction.
>
> Uh... at the moment of setting the breakpoint, or the moment of
> hitting the breakpoint.
>

Setting. I reworded that sentence so many times it actually got worse.

>> So this gets called with mmu_on = true
>> very often because we're often debugging code than runs with
>> IR|DR=1.
>
> Uh... but isn't the whole point here that the state of mmu_on might
> not match the MSR state.  So the two sentences above don't seem to
> mesh together.
>

I meant that they match most of the time but when they don't we hit the
issue we are discussing right now.

> What I think I'm understanding from the code is that in order to *set*
> the breakpoint, you need to set up the MSR to match what you expect it
> will be when you *hit* the breakpoint.  Yes?
>

Yes.

> But since the breakpoint is always placed in an interrupt vector,
> won't that always be real mode?  Or is this one of the vectors that
> can be entered in virtual mode on recent chips?
>

The interrupt sets IR DR according to the AIL value, so it may be
handled in virtual mode as well.

>> However, we could be at a point when IR|DR=1, but the next traced
>> instruction will execute with IR|DR=0. This happens at the rfid at the
>> end of __enter_rtas, for instance.
>> 
>> So ppc_radix64_get_phys_page_debug will check the MSR, see that we are
>> (now) not in real mode and proceed with the page table walk, which could
>> fail.
>> 
>> In the particular case of the __enter_rtas rfid, we have PIDR=1 [1] so
>> if we don't exit ppc_radix64_get_phys_page_debug at the msr_dr == 0
>> check, it will fail to translate the address.
>> 
>> 1 - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=eeb715c3e995fbdda0cc05e61216c6c5609bce66
>> 
>> >> Actually, I think there is a bug after ppc_cpu_get_phys_page_debug
>> >> somewhere. There are some cases where GDB wants to read/write to some
>> >> memory, but it gets denied access. Presumably because of one such
>> >> discrepancy as the one above. I need to spend more time looking at this
>> >> to define the problem properly, though.
>> >
>> > Hm, ok.
>> >
>> >> >> +    /*
>> >> >> +     * MSR_SE = 1 will cause a Trace Interrupt in the guest after the
>> >> >> +     * next instruction executes. If this is a rfid, use SRR1 instead
>> >> >> +     * of MSR.
>> >> >> +     */
>> >> >> +    if (rfid) {
>> >> >> +        if ((env->spr[SPR_SRR1] >> MSR_SE) & 1) {
>> >> >> +            /*
>> >> >> +             * The guest is doing a single step itself. Make sure we
>> >> >> +             * restore it later.
>> >> >> +             */
>> >> >> +            env->sstep_kind = SSTEP_GUEST;
>> >> >> +        }
>> >> >> +
>> >> >> +        env->spr[SPR_SRR1] |= (1ULL << MSR_SE);
>> >> >> +        mmu_on = srr1_ir & srr1_dr;
>> >> >
>> >> > s/&/&&/
>> >> >
>> >> 
>> >> Ack.
>> >> 
>> >> >> +    } else {
>> >> >> +        env->msr |= (1ULL << MSR_SE);
>> >> >> +        mmu_on = msr_ir & msr_dr;
>> >> >
>> >> > s/&/&&/
>> >> >
>> >> 
>> >> Ack.
>> >> 
>> >> > Also, what happens if the guest is using MSR[DR] != MSR[IR]?  It's
>> >> > rare, but it is occasionally useful.
>> >> 
>> >> I understand from the ISA that for the purposes of AIL, both bits need
>> >> to be set. So mmu_on = 0 is correct here.
>> >
>> > I'm not sure what you mean by "for the purposes of AIL".
>> >
>> 
>> The reason I'm tracking the translation state here is to be able to tell
>> what will be the value of AIL, since an Alternate Interrupt Location is
>> not used when translation is disabled. In the ISA, under "Alternate
>> Interrupt Location" the only mention of MSR_IR != MSR_DR is:
>> 
>> "Other interrupts that occur when MSR IR=0 or MSR DR=0, are taken as
>> if LPCR AIL=0."
>> 
>> and my interpretation of that text is that AIL value is 0 when IR DR are
>> either 0b00, 0b01 or 0b10.
>> 
>> So "for the purposes of AIL", I'm considering either IR=0 or DR=0 as
>> meaning MMU off.
>
> But AFAICT the mmu_on flag you're setting here has influences other
> than tracking AIL.

The purpose of the mmu_on flag is to know what the AIL value will be at
the time the interrupt happens and therefore know in which mode the
interrupt will be handled, i.e where to put the breakpoint. Remember:

MMU off or AIL = 0 => 0xd00
AIL = 3 => 0xc000000000004d00

With this recent change, I also use it to know what I should temporarily
put in MSR_DR|IR so that the address translation doesn't
fail. Ultimately, the mmu_on flag only influences the placement of the
breakpoint at the Trace Interrupt handler.
diff mbox series

Patch

diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index f1799b1b70..194b68ed22 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1382,6 +1382,7 @@  static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
                                                         target_ulong value1,
                                                         target_ulong value2)
 {
+    CPUState *cs = CPU(cpu);
     PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
 
     if (!(pcc->insns_flags2 & PPC2_ISA207S)) {
@@ -1400,6 +1401,10 @@  static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
 
     spapr_set_all_lpcrs(mflags << LPCR_AIL_SHIFT, LPCR_AIL);
 
+    if (cs->singlestep_enabled && kvm_enabled()) {
+        kvm_singlestep_ail_change(cs);
+    }
+
     return H_SUCCESS;
 }
 
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 103bfe9dc2..b69f8565aa 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -440,6 +440,10 @@  typedef struct ppc_v3_pate_t {
 #define msr_ts   ((env->msr >> MSR_TS1)  & 3)
 #define msr_tm   ((env->msr >> MSR_TM)   & 1)
 
+#define srr1_ir ((env->spr[SPR_SRR1] >> MSR_IR) & 1)
+#define srr1_dr ((env->spr[SPR_SRR1] >> MSR_DR) & 1)
+#define srr1_se ((env->spr[SPR_SRR1] >> MSR_SE) & 1)
+
 #define DBCR0_ICMP (1 << 27)
 #define DBCR0_BRT (1 << 26)
 #define DBSR_ICMP (1 << 27)
@@ -1158,6 +1162,16 @@  struct CPUPPCState {
     uint32_t tm_vscr;
     uint64_t tm_dscr;
     uint64_t tm_tar;
+
+    /* Used for software single step */
+    target_ulong sstep_srr0;
+    target_ulong sstep_srr1;
+    target_ulong sstep_insn;
+    target_ulong trace_handler_addr;
+    int sstep_kind;
+#define SSTEP_REGULAR 0
+#define SSTEP_PENDING 1
+#define SSTEP_GUEST   2
 };
 
 #define SET_FIT_PERIOD(a_, b_, c_, d_)          \
@@ -1251,6 +1265,7 @@  struct PPCVirtualHypervisorClass {
     OBJECT_GET_CLASS(PPCVirtualHypervisorClass, (obj), \
                      TYPE_PPC_VIRTUAL_HYPERVISOR)
 
+target_ulong ppc_get_trace_int_handler_addr(CPUState *cs, bool mmu_on);
 void ppc_cpu_do_interrupt(CPUState *cpu);
 bool ppc_cpu_exec_interrupt(CPUState *cpu, int int_req);
 void ppc_cpu_dump_state(CPUState *cpu, FILE *f, int flags);
@@ -2220,6 +2235,12 @@  enum {
                         PPC2_ISA300)
 };
 
+#define OP_RFID 19
+#define XOP_RFID 18
+#define OP_MOV 31
+#define XOP_MFMSR 83
+#define XOP_MTSPR 467
+
 /*****************************************************************************/
 /*
  * Memory access type :
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 5752ed4a4d..87230f871a 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -784,6 +784,21 @@  static inline void powerpc_excp(PowerPCCPU *cpu, int excp_model, int excp)
     check_tlb_flush(env, false);
 }
 
+target_ulong ppc_get_trace_int_handler_addr(CPUState *cs, bool mmu_on)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    CPUPPCState *env = &cpu->env;
+    int ail = AIL_NONE;
+
+    /* AIL is only used if translation is enabled */
+    if (mmu_on) {
+        ail = (env->spr[SPR_LPCR] & LPCR_AIL) >> LPCR_AIL_SHIFT;
+    }
+
+    return env->excp_vectors[POWERPC_EXCP_TRACE] |
+        ppc_excp_vector_offset(cs, ail);
+}
+
 void ppc_cpu_do_interrupt(CPUState *cs)
 {
     PowerPCCPU *cpu = POWERPC_CPU(cs);
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 6fb8687126..2b6cba59c8 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -1546,6 +1546,202 @@  void kvm_arch_remove_all_hw_breakpoints(void)
     nb_hw_breakpoint = nb_hw_watchpoint = 0;
 }
 
+static uint32_t ppc_gdb_read_insn(CPUState *cs, target_ulong addr)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    CPUPPCState *env = &cpu->env;
+    uint32_t insn;
+
+    cpu_memory_rw_debug(cs, addr, (uint8_t *)&insn, sizeof(insn), 0);
+
+    if (msr_le) {
+        return ldl_le_p(&insn);
+    } else {
+        return ldl_be_p(&insn);
+    }
+}
+
+static uint32_t ppc_gdb_get_op(uint32_t insn)
+{
+    return extract32(insn, 26, 6);
+}
+
+static uint32_t ppc_gdb_get_xop(uint32_t insn)
+{
+    return extract32(insn, 1, 10);
+}
+
+static uint32_t ppc_gdb_get_spr(uint32_t insn)
+{
+    return extract32(insn, 11, 5) << 5 | extract32(insn, 16, 5);
+}
+
+static uint32_t ppc_gdb_get_rt(uint32_t insn)
+{
+    return extract32(insn, 21, 5);
+}
+
+static void kvm_insert_singlestep_breakpoint(CPUState *cs, bool mmu_on)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    CPUPPCState *env = &cpu->env;
+    target_ulong bp_addr;
+    target_ulong saved_msr = env->msr;
+
+    bp_addr = ppc_get_trace_int_handler_addr(cs, mmu_on);
+    if (env->nip == bp_addr) {
+        /*
+         * We are trying to step the interrupt handler address itself;
+         * move the breakpoint to the next instruction.
+         */
+        bp_addr += 4;
+    }
+
+    /*
+     * The actual access by the guest might be made in a mode
+     * different than we are now (due to rfid) so temporarily set the
+     * MSR to reflect that during the breakpoint placement.
+     *
+     * Note: we need this because breakpoints currently use
+     * cpu_memory_rw_debug, which does the memory accesses from the
+     * guest point of view.
+     */
+    if ((msr_ir & msr_dr) != mmu_on) {
+        if (mmu_on) {
+            env->msr |= (1ULL << MSR_IR | 1ULL << MSR_DR);
+        } else {
+            env->msr &= ~(1ULL << MSR_IR | 1ULL << MSR_DR);
+        }
+    }
+
+    kvm_insert_breakpoint(cs, bp_addr, 4, GDB_BREAKPOINT_SW);
+    env->trace_handler_addr = bp_addr;
+    env->msr = saved_msr;
+}
+
+/*
+ * The emulated singlestep works by forcing a Trace Interrupt inside
+ * the guest by setting its MSR_SE bit.  When the guest executes the
+ * instruction, the Trace Interrupt triggers and the step is done. We
+ * then need to hide the fact that the interrupt ever happened before
+ * returning into the guest.
+ *
+ * Since the Trace Interrupt does not set MSR_HV (the whole point of
+ * having to emulate the step), we set a breakpoint at the interrupt
+ * handler address (0xd00) so that it reaches KVM (this is treated by
+ * KVM like an ordinary breakpoint) and control is returned to QEMU.
+ *
+ * There are things to consider before the step (this function):
+ *
+ * - The breakpoint location depends on where the interrupt vectors are,
+ *   which in turn depends on the LPCR_AIL and MSR_IR|DR bits;
+ *
+ * - If the stepped instruction changes the LPCR_AIL, the breakpoint
+ *   location needs to be updated mid-step;
+ *
+ * - If the stepped instruction is rfid, the MSR bits after the step
+ *   will come from SRR1.
+ *
+ *
+ * There are some fixups needed after the step, before returning to
+ * the guest (kvm_handle_singlestep):
+ *
+ * - The interrupt will set SRR0 and SRR1, so we need to restore them
+ *   to what they were before the interrupt;
+ *
+ * - If the stepped instruction wrote to the SRRs, we need to avoid
+ *   undoing that;
+ *
+ * - We set the MSR_SE bit, so it needs to be cleared.
+ *
+ */
+void kvm_arch_emulate_singlestep(CPUState *cs, int enabled)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    CPUPPCState *env = &cpu->env;
+    uint32_t insn;
+    bool rfid, mmu_on;
+
+    if (!enabled) {
+        return;
+    }
+
+    if (env->sstep_kind != SSTEP_PENDING) {
+        env->sstep_kind = SSTEP_REGULAR;
+    }
+
+    cpu_synchronize_state(cs);
+    insn = ppc_gdb_read_insn(cs, env->nip);
+
+    /*
+     * rfid needs special handling because it:
+     *   - overwrites NIP with SRR0;
+     *   - overwrites MSR with SRR1;
+     *   - cannot be single stepped.
+     */
+    rfid = ppc_gdb_get_op(insn) == OP_RFID && ppc_gdb_get_xop(insn) == XOP_RFID;
+
+    if (rfid && (kvm_find_sw_breakpoint(cs, env->spr[SPR_SRR0]) ||
+                 env->sstep_kind == SSTEP_PENDING)) {
+        /*
+         * There is a breakpoint at the next instruction address or a
+         * pending step. It will already cause the vm exit we need for
+         * the single step, so there's nothing to be done.
+         */
+        env->sstep_kind = SSTEP_REGULAR;
+        return;
+    }
+
+    /*
+     * Save the registers that will be affected by the single step
+     * mechanism. These will be used after the step.
+     */
+    env->sstep_srr0 = env->spr[SPR_SRR0];
+    env->sstep_srr1 = env->spr[SPR_SRR1];
+    env->sstep_insn = insn;
+
+    /*
+     * MSR_SE = 1 will cause a Trace Interrupt in the guest after the
+     * next instruction executes. If this is a rfid, use SRR1 instead
+     * of MSR.
+     */
+    if (rfid) {
+        if ((env->spr[SPR_SRR1] >> MSR_SE) & 1) {
+            /*
+             * The guest is doing a single step itself. Make sure we
+             * restore it later.
+             */
+            env->sstep_kind = SSTEP_GUEST;
+        }
+
+        env->spr[SPR_SRR1] |= (1ULL << MSR_SE);
+        mmu_on = srr1_ir & srr1_dr;
+    } else {
+        env->msr |= (1ULL << MSR_SE);
+        mmu_on = msr_ir & msr_dr;
+    }
+
+    kvm_insert_singlestep_breakpoint(cs, mmu_on);
+}
+
+void kvm_singlestep_ail_change(CPUState *cs)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    CPUPPCState *env = &cpu->env;
+
+    if (kvm_arch_can_singlestep(cs)) {
+        return;
+    }
+
+    /*
+     * The instruction being stepped altered the interrupt vectors
+     * location (AIL). Re-insert the single step breakpoint at the new
+     * location
+     */
+    kvm_remove_breakpoint(cs, env->trace_handler_addr, 4, GDB_BREAKPOINT_SW);
+    kvm_insert_singlestep_breakpoint(cs, (msr_ir & msr_dr));
+}
+
 void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
 {
     int n;
@@ -1585,6 +1781,98 @@  void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
     }
 }
 
+/* Revert any side-effects caused during single step */
+static void restore_singlestep_env(CPUState *cs)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    CPUPPCState *env = &cpu->env;
+    uint32_t insn = env->sstep_insn;
+    int reg;
+    int spr;
+
+    env->spr[SPR_SRR0] = env->sstep_srr0;
+    env->spr[SPR_SRR1] = env->sstep_srr1;
+
+    if (ppc_gdb_get_op(insn) != OP_MOV) {
+        return;
+    }
+
+    reg = ppc_gdb_get_rt(insn);
+
+    switch (ppc_gdb_get_xop(insn)) {
+    case XOP_MTSPR:
+        /*
+         * mtspr: the guest altered the SRR, so do not use the
+         *        pre-step value.
+         */
+        spr = ppc_gdb_get_spr(insn);
+        if (spr == SPR_SRR0 || spr == SPR_SRR1) {
+            env->spr[spr] = env->gpr[reg];
+        }
+        break;
+    case XOP_MFMSR:
+        /*
+         * mfmsr: clear MSR_SE bit to avoid the guest knowing
+         *         that it is being single-stepped.
+         */
+        env->gpr[reg] &= ~(1ULL << MSR_SE);
+        break;
+    }
+}
+
+static int kvm_handle_singlestep(CPUState *cs, target_ulong address)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    CPUPPCState *env = &cpu->env;
+
+    if (kvm_arch_can_singlestep(cs)) {
+        return DEBUG_RETURN_GDB;
+    }
+
+    cpu_synchronize_state(cs);
+
+    if (address == env->trace_handler_addr) {
+        kvm_remove_breakpoint(cs, env->trace_handler_addr, 4,
+                              GDB_BREAKPOINT_SW);
+
+        if (env->sstep_kind == SSTEP_GUEST) {
+            /*
+             * The guest expects the last instruction to have caused a
+             * single step, go back into the interrupt handler.
+             */
+            env->sstep_kind = SSTEP_REGULAR;
+            return DEBUG_RETURN_GDB;
+        }
+
+        env->nip = env->spr[SPR_SRR0];
+        /* Bits 33-36, 43-47 are set by the interrupt */
+        env->msr = env->spr[SPR_SRR1] & ~(1ULL << MSR_SE |
+                                          PPC_BITMASK(33, 36) |
+                                          PPC_BITMASK(43, 47));
+        restore_singlestep_env(cs);
+
+    } else if (address == env->trace_handler_addr + 4) {
+        /*
+         * A step at trace_handler_addr would interfere with the
+         * single step mechanism itself, so we have previously
+         * displaced the breakpoint to the next instruction.
+         */
+        kvm_remove_breakpoint(cs, env->trace_handler_addr + 4, 4,
+                              GDB_BREAKPOINT_SW);
+        restore_singlestep_env(cs);
+    } else {
+        /*
+         * Another interrupt (e.g. system call, program interrupt)
+         * took us somewhere else in the code and we hit a breakpoint
+         * there. Mark the step as pending.
+         */
+        env->sstep_kind = SSTEP_PENDING;
+    }
+
+    cs->singlestep_enabled = false;
+    return DEBUG_RETURN_GDB;
+}
+
 static int kvm_handle_hw_breakpoint(CPUState *cs,
                                     struct kvm_debug_exit_arch *arch_info)
 {
@@ -1612,13 +1900,41 @@  static int kvm_handle_hw_breakpoint(CPUState *cs,
     return handle;
 }
 
-static int kvm_handle_singlestep(void)
+static int kvm_handle_sw_breakpoint(CPUState *cs, target_ulong address)
 {
-    return DEBUG_RETURN_GDB;
-}
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    CPUPPCState *env = &cpu->env;
+
+    if (kvm_arch_can_singlestep(cs)) {
+        return DEBUG_RETURN_GDB;
+    }
+
+    cpu_synchronize_state(cs);
+
+    if (address == env->trace_handler_addr) {
+        if (env->sstep_kind == SSTEP_PENDING) {
+            /*
+             * Although we have singlestep_enabled == false by now,
+             * the original step still wasn't handled (is pending).
+             */
+            cs->singlestep_enabled = true;
+            env->sstep_kind = SSTEP_REGULAR;
+
+            return kvm_handle_singlestep(cs, address);
+        }
+
+        CPU_FOREACH(cs) {
+            if (cs->singlestep_enabled) {
+                /*
+                 * We hit this breakpoint while another cpu is doing a
+                 * software single step. Go back into the guest to
+                 * give chance for the single step to finish.
+                 */
+                return DEBUG_RETURN_GUEST;
+            }
+        }
+    }
 
-static int kvm_handle_sw_breakpoint(void)
-{
     return DEBUG_RETURN_GDB;
 }
 
@@ -1629,7 +1945,7 @@  static int kvm_handle_debug(PowerPCCPU *cpu, struct kvm_run *run)
     struct kvm_debug_exit_arch *arch_info = &run->debug.arch;
 
     if (cs->singlestep_enabled) {
-        return kvm_handle_singlestep();
+        return kvm_handle_singlestep(cs, arch_info->address);
     }
 
     if (arch_info->status) {
@@ -1637,7 +1953,7 @@  static int kvm_handle_debug(PowerPCCPU *cpu, struct kvm_run *run)
     }
 
     if (kvm_find_sw_breakpoint(cs, arch_info->address)) {
-        return kvm_handle_sw_breakpoint();
+        return kvm_handle_sw_breakpoint(cs, arch_info->address);
     }
 
     /*
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index f22daabf51..dc1b6df422 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -81,7 +81,7 @@  bool kvmppc_hpt_needs_host_contiguous_pages(void);
 void kvm_check_mmu(PowerPCCPU *cpu, Error **errp);
 void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online);
 void kvmppc_set_reg_tb_offset(PowerPCCPU *cpu, int64_t tb_offset);
-
+void kvm_singlestep_ail_change(CPUState *cs);
 #else
 
 static inline uint32_t kvmppc_get_tbfreq(void)
@@ -211,6 +211,10 @@  static inline void kvmppc_set_reg_tb_offset(PowerPCCPU *cpu, int64_t tb_offset)
 {
 }
 
+static inline void kvm_singlestep_ail_change(CPUState *cs)
+{
+}
+
 #ifndef CONFIG_USER_ONLY
 static inline bool kvmppc_spapr_use_multitce(void)
 {