diff mbox

[2/2,v2] KVM: PPC: booke: Add watchdog emulation

Message ID 1341830087-12728-1-git-send-email-Bharat.Bhushan@freescale.com
State New, archived
Headers show

Commit Message

Bharat Bhushan July 9, 2012, 10:34 a.m. UTC
This patch adds the watchdog emulation in KVM. The watchdog
emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
The kernel timer are used for watchdog emulation and emulates
h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
it is configured.

Signed-off-by: Liu Yu <yu.liu@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
 -v2:
  - Addressed the review comments 

 arch/powerpc/include/asm/kvm_book3s.h |    5 ++
 arch/powerpc/include/asm/kvm_booke.h  |    5 ++
 arch/powerpc/include/asm/kvm_host.h   |    3 +
 arch/powerpc/include/asm/kvm_ppc.h    |    3 +
 arch/powerpc/include/asm/reg_booke.h  |    7 ++
 arch/powerpc/kvm/44x.c                |    1 +
 arch/powerpc/kvm/booke.c              |  128 +++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/booke_emulate.c      |    8 ++
 arch/powerpc/kvm/powerpc.c            |   31 +++++++-
 include/linux/kvm.h                   |    2 +
 10 files changed, 189 insertions(+), 4 deletions(-)

Comments

Alexander Graf July 16, 2012, 5:18 p.m. UTC | #1
On 07/09/2012 12:34 PM, Bharat Bhushan wrote:
 > This patch adds the watchdog emulation in KVM. The watchdog
 > emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
 > The kernel timer are used for watchdog emulation and emulates
 > h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
 > if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
 > it is configured.
 >
 > Signed-off-by: Liu Yu <yu.liu@freescale.com>
 > Signed-off-by: Scott Wood <scottwood@freescale.com>
 > Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
 > ---
 >  -v2:
 >   - Addressed the review comments
 >
 >  arch/powerpc/include/asm/kvm_book3s.h |    5 ++
 >  arch/powerpc/include/asm/kvm_booke.h  |    5 ++
 >  arch/powerpc/include/asm/kvm_host.h   |    3 +
 >  arch/powerpc/include/asm/kvm_ppc.h    |    3 +
 >  arch/powerpc/include/asm/reg_booke.h  |    7 ++
 >  arch/powerpc/kvm/44x.c                |    1 +
 >  arch/powerpc/kvm/booke.c              |  128 
+++++++++++++++++++++++++++++++++
 >  arch/powerpc/kvm/booke_emulate.c      |    8 ++
 >  arch/powerpc/kvm/powerpc.c            |   31 +++++++-
 >  include/linux/kvm.h                   |    2 +
 >  10 files changed, 189 insertions(+), 4 deletions(-)
 >
 > diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
 > index f0e0c6a..7bbc6cd 100644
 > --- a/arch/powerpc/include/asm/kvm_book3s.h
 > +++ b/arch/powerpc/include/asm/kvm_book3s.h
 > @@ -446,6 +446,11 @@ static inline bool 
kvmppc_critical_section(struct kvm_vcpu *vcpu)
 >  }
 >  #endif
 >
 > +static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
 > +{
 > +    return 0;
 > +}
 > +
 >  /* Magic register values loaded into r3 and r4 before the 'sc' assembly
 >   * instruction for the OSI hypercalls */
 >  #define OSI_SC_MAGIC_R3            0x113724FA
 > diff --git a/arch/powerpc/include/asm/kvm_booke.h 
b/arch/powerpc/include/asm/kvm_booke.h
 > index b7cd335..e5b86c1 100644
 > --- a/arch/powerpc/include/asm/kvm_booke.h
 > +++ b/arch/powerpc/include/asm/kvm_booke.h
 > @@ -100,4 +100,9 @@ static inline ulong kvmppc_get_msr(struct 
kvm_vcpu *vcpu)
 >  {
 >      return vcpu->arch.shared->msr;
 >  }
 > +
 > +static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
 > +{
 > +    return vcpu->arch.tsr & TCR_WRC_MASK;
 > +}
 >  #endif /* __ASM_KVM_BOOKE_H__ */
 > diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
 > index 50ea12f..01047f4 100644
 > --- a/arch/powerpc/include/asm/kvm_host.h
 > +++ b/arch/powerpc/include/asm/kvm_host.h
 > @@ -467,6 +467,8 @@ struct kvm_vcpu_arch {
 >      ulong fault_esr;
 >      ulong queued_dear;
 >      ulong queued_esr;
 > +    spinlock_t wdt_lock;
 > +    struct timer_list wdt_timer;

These should be booke specific. If you can't fit them anywhere else, at 
least do an #ifdef for booke kvm around them, so we know where they belong.

 >      u32 tlbcfg[4];
 >      u32 mmucfg;
 >      u32 epr;
 > @@ -482,6 +484,7 @@ struct kvm_vcpu_arch {
 >      u8 osi_needed;
 >      u8 osi_enabled;
 >      u8 papr_enabled;
 > +    u8 watchdog_enable;
 >      u8 sane;
 >      u8 cpu_type;
 >      u8 hcall_needed;
 > diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
 > index 0124937..e5cf4b9 100644
 > --- a/arch/powerpc/include/asm/kvm_ppc.h
 > +++ b/arch/powerpc/include/asm/kvm_ppc.h
 > @@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, 
struct kvm_vcpu *vcpu);
 >  extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 >  extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 >  extern void kvmppc_decrementer_func(unsigned long data);
 > +extern void kvmppc_watchdog_func(unsigned long data);
 >  extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 >
 >  /* Core-specific hooks */
 > @@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct 
kvm_vcpu *vcpu,
 >                                         struct kvm_interrupt *irq);
 >  extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 >                                           struct kvm_interrupt *irq);
 > +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
 > +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
 >
 >  extern int kvmppc_core_emulate_op(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
 >                                    unsigned int op, int *advance);
 > diff --git a/arch/powerpc/include/asm/reg_booke.h 
b/arch/powerpc/include/asm/reg_booke.h
 > index 2d916c4..e07e6af 100644
 > --- a/arch/powerpc/include/asm/reg_booke.h
 > +++ b/arch/powerpc/include/asm/reg_booke.h
 > @@ -539,6 +539,13 @@
 >  #define TCR_FIE        0x00800000    /* FIT Interrupt Enable */
 >  #define TCR_ARE        0x00400000    /* Auto Reload Enable */
 >
 > +#ifdef CONFIG_E500
 > +#define TCR_GET_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
 > +                  (((tcr) & 0x1E0000) >> 15))
 > +#else
 > +#define TCR_GET_WP(tcr)  (((tcr) & 0xC0000000) >> 30)
 > +#endif

Ben and/or Kumar, mind to ack this bit?

 > +
 >  /* Bit definitions for the TSR. */
 >  #define TSR_ENW        0x80000000    /* Enable Next Watchdog */
 >  #define TSR_WIS        0x40000000    /* WDT Interrupt Status */
 > diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
 > index 50e7dbc..a213aba 100644
 > --- a/arch/powerpc/kvm/44x.c
 > +++ b/arch/powerpc/kvm/44x.c
 > @@ -28,6 +28,7 @@
 >  #include <asm/kvm_44x.h>
 >  #include <asm/kvm_ppc.h>
 >
 > +#include "booke.h"

Look 2 lines below :)

 >  #include "44x_tlb.h"
 >  #include "booke.h"
 >
 > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
 > index d25a097..97f54ea 100644
 > --- a/arch/powerpc/kvm/booke.c
 > +++ b/arch/powerpc/kvm/booke.c
 > @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct 
kvm_vcpu *vcpu,
 >      clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, 
&vcpu->arch.pending_exceptions);
 >  }
 >
 > +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
 > +{
 > +    kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
 > +}
 > +
 > +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
 > +{
 > +    clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
 > +}
 > +
 >  static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, 
u32 srr1)
 >  {
 >  #ifdef CONFIG_KVM_BOOKE_HV
 > @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct 
kvm_vcpu *vcpu,
 >          msr_mask = MSR_CE | MSR_ME | MSR_DE;
 >          int_class = INT_CLASS_NONCRIT;
 >          break;
 > +    case BOOKE_IRQPRIO_WATCHDOG:
 >      case BOOKE_IRQPRIO_CRITICAL:
 >      case BOOKE_IRQPRIO_DBELL_CRIT:
 >          allowed = vcpu->arch.shared->msr & MSR_CE;
 > @@ -404,12 +415,101 @@ static int kvmppc_booke_irqprio_deliver(struct 
kvm_vcpu *vcpu,
 >      return allowed;
 >  }
 >
 > +/*
 > + * Return the number of jiffies until the next timeout.  If the 
timeout is
 > + * longer than the NEXT_TIMER_MAX_DELTA, that

then?

 >  return NEXT_TIMER_MAX_DELTA
 > + * instead.

I can read code. The important piece of information in the comment is 
missing: The reason.

 > + */
 > +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
 > +{
 > +    u64 tb, wdt_tb, wdt_ticks = 0;
 > +    u64 nr_jiffies = 0;
 > +    u32 period = TCR_GET_WP(vcpu->arch.tcr);
 > +
 > +    wdt_tb = 1ULL << (63 - period);
 > +    tb = get_tb();
 > +    /*
 > +     * The watchdog timeout will hapeen when TB bit corresponding
 > +     * to watchdog will toggle from 0 to 1.
 > +     */
 > +    if (tb & wdt_tb)
 > +        wdt_ticks = wdt_tb;
 > +
 > +    wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
 > +
 > +    /* Convert timebase ticks to jiffies */
 > +    nr_jiffies = wdt_ticks;
 > +
 > +    if (do_div(nr_jiffies, tb_ticks_per_jiffy))
 > +        nr_jiffies++;
 > +
 > +    return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
 > +}
 > +
 > +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
 > +{
 > +    unsigned long nr_jiffies;
 > +
 > +    nr_jiffies = watchdog_next_timeout(vcpu);
 > +    spin_lock(&vcpu->arch.wdt_lock);
 > +    if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
 > +        mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
 > +    else
 > +        del_timer(&vcpu->arch.wdt_timer);
 > +    spin_unlock(&vcpu->arch.wdt_lock);
 > +}
 > +
 > +void kvmppc_watchdog_func(unsigned long data)
 > +{
 > +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
 > +    u32 tsr, new_tsr;
 > +    int final;
 > +
 > +    do {
 > +        new_tsr = tsr = vcpu->arch.tsr;
 > +        final = 0;
 > +
 > +        /* Time out event */
 > +        if (tsr & TSR_ENW) {
 > +            if (tsr & TSR_WIS) {
 > +                new_tsr = (tsr & ~TCR_WRC_MASK) |
 > +                      (vcpu->arch.tcr & TCR_WRC_MASK);
 > +                vcpu->arch.tcr &= ~TCR_WRC_MASK;

Can't we just poke the vcpu to exit the VM and do the above on its own? 
This is the watdog expired case, right? I'd also prefer to have an 
explicit event for the expiry than a special TSR check in the main loop.

Also call me sceptic on the reset of tcr. If our user space watchdog 
event is "write a message", then we essentially want to hide the fact 
that the watchdog expired from the guest, right? In that case, the 
second time-out wouldn't do anything guest visible.

 > +                final = 1;
 > +            } else {
 > +                new_tsr = tsr | TSR_WIS;
 > +            }
 > +        } else {
 > +            new_tsr = tsr | TSR_ENW;
 > +        }
 > +    } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
 > +
 > +    if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
 > +        smp_wmb();
 > +        kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
 > +        kvm_vcpu_kick(vcpu);
 > +    }
 > +
 > +    /*
 > +     * Stop running the watchdog timer after final expiration to
 > +     * prevent the host from being flooded with timers if the
 > +     * guest sets a short period.
 > +     * Timers will resume when TSR/TCR is updated next time.
 > +     */
 > +    if (!final)
 > +        arm_next_watchdog(vcpu);
 > +}
 >  static void update_timer_ints(struct kvm_vcpu *vcpu)
 >  {
 >      if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
 >          kvmppc_core_queue_dec(vcpu);
 >      else
 >          kvmppc_core_dequeue_dec(vcpu);
 > +
 > +    if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
 > +        kvmppc_core_queue_watchdog(vcpu);
 > +    else
 > +        kvmppc_core_dequeue_watchdog(vcpu);
 >  }
 >
 >  static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
 > @@ -516,6 +616,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, 
struct kvm_vcpu *vcpu)
 >          goto out;
 >      }
 >
 > +    if (vcpu->arch.tsr & TCR_WRC_MASK) {
 > +        kvm_run->exit_reason = KVM_EXIT_WDT;
 > +        ret = 0;
 > +        goto out;
 > +    }
 > +
 >      kvm_guest_enter();
 >
 >  #ifdef CONFIG_PPC_FPU
 > @@ -977,6 +1083,10 @@ int kvmppc_handle_exit(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
 >              kvmppc_account_exit(vcpu, SIGNAL_EXITS);
 >          }
 >      }
 > +    if (vcpu->arch.tsr & TCR_WRC_MASK) {
 > +        run->exit_reason = KVM_EXIT_WDT;
 > +        r = RESUME_HOST | (r & RESUME_FLAG_NV);
 > +    }
 >
 >      return r;
 >  }
 > @@ -1106,7 +1216,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
 >      }
 >
 >      if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
 > +        u32 old_tsr = vcpu->arch.tsr;
 > +
 >          vcpu->arch.tsr = sregs->u.e.tsr;
 > +
 > +        if (vcpu->arch.watchdog_enable && ((old_tsr ^ vcpu->arch.tsr) &
 > +            (TSR_ENW | TSR_WIS | TCR_WRC_MASK)))
 > +            arm_next_watchdog(vcpu);

Can you please move all required checks into arm_next_watchdog and call 
it unconditionally? Especially the watchdog_enable. Unless you want to 
follow Scott's scheme and merely make the user space delivery depend on 
watchdog_enable, which also works.

 > +
 >          update_timer_ints(vcpu);
 >      }
 >
 > @@ -1267,6 +1384,8 @@ void kvmppc_core_commit_memory_region(struct 
kvm *kvm,
 >  void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
 >  {
 >      vcpu->arch.tcr = new_tcr;
 > +    if (vcpu->arch.watchdog_enable)
 > +        arm_next_watchdog(vcpu);
 >      update_timer_ints(vcpu);
 >  }
 >
 > @@ -1281,6 +1400,15 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu 
*vcpu, u32 tsr_bits)
 >  void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 >  {
 >      clear_bits(tsr_bits, &vcpu->arch.tsr);
 > +
 > +    /*
 > +     * We may have stopped the watchdog due to
 > +     * being stuck on final expiration.
 > +     */
 > +    if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW |
 > +            TSR_WIS | TCR_WRC_MASK)))
 > +        arm_next_watchdog(vcpu);
 > +
 >      update_timer_ints(vcpu);
 >  }
 >
 > diff --git a/arch/powerpc/kvm/booke_emulate.c 
b/arch/powerpc/kvm/booke_emulate.c
 > index 12834bb..5a66ade 100644
 > --- a/arch/powerpc/kvm/booke_emulate.c
 > +++ b/arch/powerpc/kvm/booke_emulate.c
 > @@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu 
*vcpu, int sprn, ulong spr_val)
 >          kvmppc_clr_tsr_bits(vcpu, spr_val);
 >          break;
 >      case SPRN_TCR:
 > +        /*
 > +         * WRC is a 2-bit field that is supposed to preserve its
 > +         * value once written to non-zero.
 > +         */
 > +        if (vcpu->arch.tcr & TCR_WRC_MASK) {
 > +            spr_val &= ~TCR_WRC_MASK;
 > +            spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
 > +        }
 >          kvmppc_set_tcr(vcpu, spr_val);
 >          break;
 >
 > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
 > index 87f4dc8..646c4da 100644
 > --- a/arch/powerpc/kvm/powerpc.c
 > +++ b/arch/powerpc/kvm/powerpc.c
 > @@ -38,9 +38,13 @@
 >
 >  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 >  {
 > -    return !(v->arch.shared->msr & MSR_WE) ||
 > -           !!(v->arch.pending_exceptions) ||
 > -           v->requests;
 > +    bool ret = !(v->arch.shared->msr & MSR_WE) ||
 > +           !!(v->arch.pending_exceptions) ||
 > +           v->requests;
 > +
 > +    ret = ret || kvmppc_get_tsr_wrc(v);

Why do you need to declare the cpu as non-runnable when a watchdog event 
occured?

 > +
 > +    return ret;
 >  }
 >
 >  int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 > @@ -237,6 +241,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 >      case KVM_CAP_PPC_GET_PVINFO:
 >  #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 >      case KVM_CAP_SW_TLB:
 > +    case KVM_CAP_PPC_WDT:
 >  #endif
 >          r = 1;
 >          break;
 > @@ -386,13 +391,21 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 >  #ifdef CONFIG_KVM_EXIT_TIMING
 >      mutex_init(&vcpu->arch.exit_timing_lock);
 >  #endif
 > -
 > +#ifdef CONFIG_BOOKE
 > +    spin_lock_init(&vcpu->arch.wdt_lock);
 > +#endif
 >      return 0;
 >  }
 >
 >  void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 >  {
 >      kvmppc_mmu_destroy(vcpu);
 > +#ifdef CONFIG_BOOKE
 > +    spin_lock(&vcpu->arch.wdt_lock);
 > +    if (vcpu->arch.watchdog_enable)
 > +        del_timer(&vcpu->arch.wdt_timer);
 > +    spin_unlock(&vcpu->arch.wdt_lock);
 > +#endif
 >  }
 >
 >  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 > @@ -637,6 +650,16 @@ static int kvm_vcpu_ioctl_enable_cap(struct 
kvm_vcpu *vcpu,
 >          r = 0;
 >          vcpu->arch.papr_enabled = true;
 >          break;
 > +#ifdef CONFIG_BOOKE
 > +    case KVM_CAP_PPC_WDT:
 > +        r = 0;
 > +        /* setup watchdog timer once */
 > +        if (!vcpu->arch.watchdog_enable)
 > +            setup_timer(&vcpu->arch.wdt_timer,
 > +                    kvmppc_watchdog_func, (unsigned long)vcpu);
 > +        vcpu->arch.watchdog_enable = true;
 > +        break;
 > +#endif
 >  #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 >      case KVM_CAP_SW_TLB: {
 >          struct kvm_config_tlb cfg;
 > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
 > index 2ce09aa..b9fdb52 100644
 > --- a/include/linux/kvm.h
 > +++ b/include/linux/kvm.h
 > @@ -163,6 +163,7 @@ struct kvm_pit_config {
 >  #define KVM_EXIT_OSI              18
 >  #define KVM_EXIT_PAPR_HCALL      19
 >  #define KVM_EXIT_S390_UCONTROL      20
 > +#define KVM_EXIT_WDT              21

Please make this a more generic KVM_EXIT_WATCHDOG so that other archs 
may have the chance to reuse it.


Alex

 >
 >  /* For KVM_EXIT_INTERNAL_ERROR */
 >  #define KVM_INTERNAL_ERROR_EMULATION 1
 > @@ -618,6 +619,7 @@ struct kvm_ppc_smmu_info {
 >  #define KVM_CAP_PPC_GET_SMMU_INFO 78
 >  #define KVM_CAP_S390_COW 79
 >  #define KVM_CAP_PPC_ALLOC_HTAB 80
 > +#define KVM_CAP_PPC_WDT    81
 >
 >  #ifdef KVM_CAP_IRQ_ROUTING
 >

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood July 17, 2012, 1:02 a.m. UTC | #2
On 07/16/2012 12:18 PM, Alexander Graf wrote:
>> +/*
>> + * Return the number of jiffies until the next timeout.  If the
> timeout is
>> + * longer than the NEXT_TIMER_MAX_DELTA, that
> 
> then?
> 
>>  return NEXT_TIMER_MAX_DELTA
>> + * instead.
> 
> I can read code.

Come on, it's not exactly x++; /* add one to x */

It's faster to read code (as well as know the constraints within which
you can modify it without having to spend a lot of time digesting all
the callers' use cases) when you have a high level description of its
interface contract, and can be selective about when to zoom in to the
details.  Linux kernel code tends to be bad about this.

> The important piece of information in the comment is
> missing: The reason.

The reason for what?  Why you want to know the next timeout?  That's the
caller's business.  Or why we use NEXT_TIMER_MAX_DELTA as the limit?

>> +void kvmppc_watchdog_func(unsigned long data)
>> +{
>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>> +    u32 tsr, new_tsr;
>> +    int final;
>> +
>> +    do {
>> +        new_tsr = tsr = vcpu->arch.tsr;
>> +        final = 0;
>> +
>> +        /* Time out event */
>> +        if (tsr & TSR_ENW) {
>> +            if (tsr & TSR_WIS) {
>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
> 
> Can't we just poke the vcpu to exit the VM and do the above on its own?

We've discussed this before.  TSR updates are done via atomics, and we
send a request for the vcpu to act on the result.  This is how the
decrementer works.

http://www.spinics.net/lists/kvm-ppc/msg03169.html

> This is the watdog expired case, right?

Final expiration, yes.

> I'd also prefer to have an
> explicit event for the expiry than a special TSR check in the main loop.

So check TSR[WRS] in update_timer_ints(), and have it queue a
pseudoexception?  That would eliminate the need to change the runnable
function.
	
> Also call me sceptic on the reset of tcr. If our user space watchdog
> event is "write a message", then we essentially want to hide the fact
> that the watchdog expired from the guest, right? In that case, the
> second time-out wouldn't do anything guest visible.

This was probably copied straight out of the hardware documentation,
which explicitly says TCR[WRC] gets set to zero on final expiration (as
part of reset).  We should leave that part up to userspace.  It
definitely shouldn't be done inside the cmpxchg loop (or from interrupt
context -- only TSR gets the atomic treatment).  I don't think the read
of TCR outside vcpu context is a problem, though.

>>  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
>>  {
>> -    return !(v->arch.shared->msr & MSR_WE) ||
>> -           !!(v->arch.pending_exceptions) ||
>> -           v->requests;
>> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||
>> +           !!(v->arch.pending_exceptions) ||
>> +           v->requests;
>> +
>> +    ret = ret || kvmppc_get_tsr_wrc(v);
> 
> Why do you need to declare the cpu as non-runnable when a watchdog event
> occured?

It's the other way around -- it's always runnable when a watchdog exit
is pending.  It's like a pending exception.

>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>> index 2ce09aa..b9fdb52 100644
>> --- a/include/linux/kvm.h
>> +++ b/include/linux/kvm.h
>> @@ -163,6 +163,7 @@ struct kvm_pit_config {
>>  #define KVM_EXIT_OSI              18
>>  #define KVM_EXIT_PAPR_HCALL      19
>>  #define KVM_EXIT_S390_UCONTROL      20
>> +#define KVM_EXIT_WDT              21
> 
> Please make this a more generic KVM_EXIT_WATCHDOG so that other archs
> may have the chance to reuse it.

WDT is generic (85 of 115 files in drivers/watchdog/ contain "wdt" in
their names), just overly abbreviated.  KVM_EXIT_WATCHDOG is more readable.

-Scott

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf July 17, 2012, 7:20 a.m. UTC | #3
On 17.07.2012, at 03:02, Scott Wood <scottwood@freescale.com> wrote:

> On 07/16/2012 12:18 PM, Alexander Graf wrote:
>>> +/*
>>> + * Return the number of jiffies until the next timeout.  If the
>> timeout is
>>> + * longer than the NEXT_TIMER_MAX_DELTA, that
>> 
>> then?
>> 
>>> return NEXT_TIMER_MAX_DELTA
>>> + * instead.
>> 
>> I can read code.
> 
> Come on, it's not exactly x++; /* add one to x */
> 
> It's faster to read code (as well as know the constraints within which
> you can modify it without having to spend a lot of time digesting all
> the callers' use cases) when you have a high level description of its
> interface contract, and can be selective about when to zoom in to the
> details.  Linux kernel code tends to be bad about this.

Yeah, not opposed to leave that part in :).

> 
>> The important piece of information in the comment is
>> missing: The reason.
> 
> The reason for what?  Why you want to know the next timeout?  That's the
> caller's business.  Or why we use NEXT_TIMER_MAX_DELTA as the limit?

Why we use the limit. IIRC it was explained in the last thread, just didn't make its way into the comment.

> 
>>> +void kvmppc_watchdog_func(unsigned long data)
>>> +{
>>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>> +    u32 tsr, new_tsr;
>>> +    int final;
>>> +
>>> +    do {
>>> +        new_tsr = tsr = vcpu->arch.tsr;
>>> +        final = 0;
>>> +
>>> +        /* Time out event */
>>> +        if (tsr & TSR_ENW) {
>>> +            if (tsr & TSR_WIS) {
>>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
>>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
>>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
>> 
>> Can't we just poke the vcpu to exit the VM and do the above on its own?
> 
> We've discussed this before.  TSR updates are done via atomics, and we
> send a request for the vcpu to act on the result.  This is how the
> decrementer works.
> 
> http://www.spinics.net/lists/kvm-ppc/msg03169.html

Yeah, the major difference to the dec is the atomicity of the whole thing. Dec changes one bit to enable the interrupt line. The final expiration is more complex.

> 
>> This is the watdog expired case, right?
> 
> Final expiration, yes.
> 
>> I'd also prefer to have an
>> explicit event for the expiry than a special TSR check in the main loop.
> 
> So check TSR[WRS] in update_timer_ints(), and have it queue a
> pseudoexception?  

Or here.

> That would eliminate the need to change the runnable
> function.
>    
>> Also call me sceptic on the reset of tcr. If our user space watchdog
>> event is "write a message", then we essentially want to hide the fact
>> that the watchdog expired from the guest, right? In that case, the
>> second time-out wouldn't do anything guest visible.
> 
> This was probably copied straight out of the hardware documentation,
> which explicitly says TCR[WRC] gets set to zero on final expiration (as
> part of reset).  We should leave that part up to userspace.  It
> definitely shouldn't be done inside the cmpxchg loop (or from interrupt
> context -- only TSR gets the atomic treatment).  I don't think the read
> of TCR outside vcpu context is a problem, though.

Yeah, but it'd just make me less wary if only the vcpu thread itself accesses vcpu internal registers that aren't irq state and thus designed for it (TSR).

But yes, the most flexible way would probably be to do it from user space. Since it'd happen from within the vcpu context of user space, we can also guarantee that the TCR access is atomic.

> 
>>> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
>>> {
>>> -    return !(v->arch.shared->msr & MSR_WE) ||
>>> -           !!(v->arch.pending_exceptions) ||
>>> -           v->requests;
>>> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||
>>> +           !!(v->arch.pending_exceptions) ||
>>> +           v->requests;
>>> +
>>> +    ret = ret || kvmppc_get_tsr_wrc(v);
>> 
>> Why do you need to declare the cpu as non-runnable when a watchdog event
>> occured?
> 
> It's the other way around -- it's always runnable when a watchdog exit
> is pending.  It's like a pending exception.

Ah, so yes, we should just shove it into pending_exceptions then.

> 
>>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>>> index 2ce09aa..b9fdb52 100644
>>> --- a/include/linux/kvm.h
>>> +++ b/include/linux/kvm.h
>>> @@ -163,6 +163,7 @@ struct kvm_pit_config {
>>> #define KVM_EXIT_OSI              18
>>> #define KVM_EXIT_PAPR_HCALL      19
>>> #define KVM_EXIT_S390_UCONTROL      20
>>> +#define KVM_EXIT_WDT              21
>> 
>> Please make this a more generic KVM_EXIT_WATCHDOG so that other archs
>> may have the chance to reuse it.
> 
> WDT is generic (85 of 115 files in drivers/watchdog/ contain "wdt" in
> their names), just overly abbreviated.  KVM_EXIT_WATCHDOG is more readable.

Ah, didn't know that it's a commonly used abbreviation. It's not like we're constrained in our line length for exit code handling usually though, so readable still wins for me :)

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan July 17, 2012, 9:57 a.m. UTC | #4
> -----Original Message-----
> From: kvm-ppc-owner@vger.kernel.org [mailto:kvm-ppc-owner@vger.kernel.org] On
> Behalf Of Alexander Graf
> Sent: Tuesday, July 17, 2012 12:50 PM
> To: Wood Scott-B07421
> Cc: Bhushan Bharat-R65777; <kvm-ppc@vger.kernel.org>; <kvm@vger.kernel.org>;
> <bharatb.yadav@gmail.com>; Bhushan Bharat-R65777; Benjamin Herrenschmidt; Kumar
> Gala
> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
> 
> 
> 
> On 17.07.2012, at 03:02, Scott Wood <scottwood@freescale.com> wrote:
> 
> > On 07/16/2012 12:18 PM, Alexander Graf wrote:
> >>> +/*
> >>> + * Return the number of jiffies until the next timeout.  If the
> >> timeout is
> >>> + * longer than the NEXT_TIMER_MAX_DELTA, that
> >>
> >> then?
> >>
> >>> return NEXT_TIMER_MAX_DELTA
> >>> + * instead.
> >>
> >> I can read code.
> >
> > Come on, it's not exactly x++; /* add one to x */
> >
> > It's faster to read code (as well as know the constraints within which
> > you can modify it without having to spend a lot of time digesting all
> > the callers' use cases) when you have a high level description of its
> > interface contract, and can be selective about when to zoom in to the
> > details.  Linux kernel code tends to be bad about this.
> 
> Yeah, not opposed to leave that part in :).
> 
> >
> >> The important piece of information in the comment is
> >> missing: The reason.
> >
> > The reason for what?  Why you want to know the next timeout?  That's
> > the caller's business.  Or why we use NEXT_TIMER_MAX_DELTA as the limit?
> 
> Why we use the limit. IIRC it was explained in the last thread, just didn't make
> its way into the comment.

Earlier we have a comment on the #define MAX_TIMEOUT (new define added for a purpose, so the comment described the puspose).
Now we uses the generic #define NEX_TIMER_MAX_DELTA (include/linux/timer.h), so removed the comment.

> 
> >
> >>> +void kvmppc_watchdog_func(unsigned long data) {
> >>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> >>> +    u32 tsr, new_tsr;
> >>> +    int final;
> >>> +
> >>> +    do {
> >>> +        new_tsr = tsr = vcpu->arch.tsr;
> >>> +        final = 0;
> >>> +
> >>> +        /* Time out event */
> >>> +        if (tsr & TSR_ENW) {
> >>> +            if (tsr & TSR_WIS) {
> >>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
> >>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
> >>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
> >>
> >> Can't we just poke the vcpu to exit the VM and do the above on its own?
> >
> > We've discussed this before.  TSR updates are done via atomics, and we
> > send a request for the vcpu to act on the result.  This is how the
> > decrementer works.
> >
> > http://www.spinics.net/lists/kvm-ppc/msg03169.html
> 
> Yeah, the major difference to the dec is the atomicity of the whole thing. Dec
> changes one bit to enable the interrupt line. The final expiration is more
> complex.

Is not setting the TSR.WRS atomic here (cmpxchg() will handle this)?

> 
> >
> >> This is the watdog expired case, right?
> >
> > Final expiration, yes.
> >
> >> I'd also prefer to have an
> >> explicit event for the expiry than a special TSR check in the main loop.
> >
> > So check TSR[WRS] in update_timer_ints(), and have it queue a
> > pseudoexception?
> 
> Or here.

Do we mean define a sudo IROPRIO for final expiry.

> 
> > That would eliminate the need to change the runnable function.
> >
> >> Also call me sceptic on the reset of tcr. If our user space watchdog
> >> event is "write a message", then we essentially want to hide the fact
> >> that the watchdog expired from the guest, right? In that case, the
> >> second time-out wouldn't do anything guest visible.
> >
> > This was probably copied straight out of the hardware documentation,
> > which explicitly says TCR[WRC] gets set to zero on final expiration
> > (as part of reset).  We should leave that part up to userspace.  It
> > definitely shouldn't be done inside the cmpxchg loop (or from
> > interrupt context -- only TSR gets the atomic treatment).  I don't
> > think the read of TCR outside vcpu context is a problem, though.
> 
> Yeah, but it'd just make me less wary if only the vcpu thread itself accesses
> vcpu internal registers that aren't irq state and thus designed for it (TSR).
> 
> But yes, the most flexible way would probably be to do it from user space. Since
> it'd happen from within the vcpu context of user space, we can also guarantee
> that the TCR access is atomic.

Yes, will move the tcr.wrc clearing to userspace.

> 
> >
> >>> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) {
> >>> -    return !(v->arch.shared->msr & MSR_WE) ||
> >>> -           !!(v->arch.pending_exceptions) ||
> >>> -           v->requests;
> >>> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||
> >>> +           !!(v->arch.pending_exceptions) ||
> >>> +           v->requests;
> >>> +
> >>> +    ret = ret || kvmppc_get_tsr_wrc(v);
> >>
> >> Why do you need to declare the cpu as non-runnable when a watchdog
> >> event occured?
> >
> > It's the other way around -- it's always runnable when a watchdog exit
> > is pending.  It's like a pending exception.
> 
> Ah, so yes, we should just shove it into pending_exceptions then.

Pending_exception? You mean sudo again here as said earlier.

Thanks
-Bharat
> 
> >
> >>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h index
> >>> 2ce09aa..b9fdb52 100644
> >>> --- a/include/linux/kvm.h
> >>> +++ b/include/linux/kvm.h
> >>> @@ -163,6 +163,7 @@ struct kvm_pit_config {
> >>> #define KVM_EXIT_OSI              18
> >>> #define KVM_EXIT_PAPR_HCALL      19
> >>> #define KVM_EXIT_S390_UCONTROL      20
> >>> +#define KVM_EXIT_WDT              21
> >>
> >> Please make this a more generic KVM_EXIT_WATCHDOG so that other archs
> >> may have the chance to reuse it.
> >
> > WDT is generic (85 of 115 files in drivers/watchdog/ contain "wdt" in
> > their names), just overly abbreviated.  KVM_EXIT_WATCHDOG is more readable.
> 
> Ah, didn't know that it's a commonly used abbreviation. It's not like we're
> constrained in our line length for exit code handling usually though, so
> readable still wins for me :)
> 
> Alex
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body
> of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan July 17, 2012, 11:31 a.m. UTC | #5
> -----Original Message-----

> From: Wood Scott-B07421

> Sent: Tuesday, July 17, 2012 6:32 AM

> To: Alexander Graf

> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org;

> bharatb.yadav@gmail.com; Bhushan Bharat-R65777; Benjamin Herrenschmidt; Kumar

> Gala

> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation

> 

> On 07/16/2012 12:18 PM, Alexander Graf wrote:

> >> +/*

> >> + * Return the number of jiffies until the next timeout.  If the

> > timeout is

> >> + * longer than the NEXT_TIMER_MAX_DELTA, that

> >

> > then?

> >

> >>  return NEXT_TIMER_MAX_DELTA

> >> + * instead.

> >

> > I can read code.

> 

> Come on, it's not exactly x++; /* add one to x */

> 

> It's faster to read code (as well as know the constraints within which you can

> modify it without having to spend a lot of time digesting all the callers' use

> cases) when you have a high level description of its interface contract, and can

> be selective about when to zoom in to the details.  Linux kernel code tends to

> be bad about this.

> 

> > The important piece of information in the comment is

> > missing: The reason.

> 

> The reason for what?  Why you want to know the next timeout?  That's the

> caller's business.  Or why we use NEXT_TIMER_MAX_DELTA as the limit?

> 

> >> +void kvmppc_watchdog_func(unsigned long data) {

> >> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;

> >> +    u32 tsr, new_tsr;

> >> +    int final;

> >> +

> >> +    do {

> >> +        new_tsr = tsr = vcpu->arch.tsr;

> >> +        final = 0;

> >> +

> >> +        /* Time out event */

> >> +        if (tsr & TSR_ENW) {

> >> +            if (tsr & TSR_WIS) {

> >> +                new_tsr = (tsr & ~TCR_WRC_MASK) |

> >> +                      (vcpu->arch.tcr & TCR_WRC_MASK);

> >> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;

> >

> > Can't we just poke the vcpu to exit the VM and do the above on its own?

> 

> We've discussed this before.  TSR updates are done via atomics, and we send a

> request for the vcpu to act on the result.  This is how the decrementer works.

> 

> http://www.spinics.net/lists/kvm-ppc/msg03169.html

> 

> > This is the watdog expired case, right?

> 

> Final expiration, yes.

> 

> > I'd also prefer to have an

> > explicit event for the expiry than a special TSR check in the main loop.

> 

> So check TSR[WRS] in update_timer_ints(), and have it queue a pseudoexception?

> That would eliminate the need to change the runnable function.

> 

> > Also call me sceptic on the reset of tcr. If our user space watchdog

> > event is "write a message", then we essentially want to hide the fact

> > that the watchdog expired from the guest, right? In that case, the

> > second time-out wouldn't do anything guest visible.

> 

> This was probably copied straight out of the hardware documentation, which

> explicitly says TCR[WRC] gets set to zero on final expiration (as part of

> reset).  We should leave that part up to userspace.  It definitely shouldn't be

> done inside the cmpxchg loop (or from interrupt context -- only TSR gets the

> atomic treatment).  I don't think the read of TCR outside vcpu context is a

> problem, though.

> 

> >>  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)  {

> >> -    return !(v->arch.shared->msr & MSR_WE) ||

> >> -           !!(v->arch.pending_exceptions) ||

> >> -           v->requests;

> >> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||

> >> +           !!(v->arch.pending_exceptions) ||

> >> +           v->requests;

> >> +

> >> +    ret = ret || kvmppc_get_tsr_wrc(v);

> >

> > Why do you need to declare the cpu as non-runnable when a watchdog

> > event occured?

> 

> It's the other way around -- it's always runnable when a watchdog exit is

> pending.  It's like a pending exception.


With the above check, Are we trying to handle the case where watchdog interrupt bit in pending_exception is cleared by guest after final expiry but before the qemu exit? And we want that if TSR.WRS update wins the race with clearing of watchdog interrupt condition from guest then anyways let QEMU exit with reason KVM_EXIT_WDT? 

What if we do not allow guest clear watchdog interrupt condition if final expiry already happened? And let QEMU clear TSR.ENW | TSR.WIS | TSR.WRS in all selected policy (debug | reset etc).

Thanks
-Bharat

> 

> >> diff --git a/include/linux/kvm.h b/include/linux/kvm.h index

> >> 2ce09aa..b9fdb52 100644

> >> --- a/include/linux/kvm.h

> >> +++ b/include/linux/kvm.h

> >> @@ -163,6 +163,7 @@ struct kvm_pit_config {

> >>  #define KVM_EXIT_OSI              18

> >>  #define KVM_EXIT_PAPR_HCALL      19

> >>  #define KVM_EXIT_S390_UCONTROL      20

> >> +#define KVM_EXIT_WDT              21

> >

> > Please make this a more generic KVM_EXIT_WATCHDOG so that other archs

> > may have the chance to reuse it.

> 

> WDT is generic (85 of 115 files in drivers/watchdog/ contain "wdt" in their

> names), just overly abbreviated.  KVM_EXIT_WATCHDOG is more readable.

> 

> -Scott
Alexander Graf July 17, 2012, 12:51 p.m. UTC | #6
On 07/17/2012 11:57 AM, Bhushan Bharat-R65777 wrote:
>
>> -----Original Message-----
>> From: kvm-ppc-owner@vger.kernel.org [mailto:kvm-ppc-owner@vger.kernel.org] On
>> Behalf Of Alexander Graf
>> Sent: Tuesday, July 17, 2012 12:50 PM
>> To: Wood Scott-B07421
>> Cc: Bhushan Bharat-R65777; <kvm-ppc@vger.kernel.org>; <kvm@vger.kernel.org>;
>> <bharatb.yadav@gmail.com>; Bhushan Bharat-R65777; Benjamin Herrenschmidt; Kumar
>> Gala
>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>
>>
>>
>> On 17.07.2012, at 03:02, Scott Wood <scottwood@freescale.com> wrote:
>>
>>> On 07/16/2012 12:18 PM, Alexander Graf wrote:
>>>>> +/*
>>>>> + * Return the number of jiffies until the next timeout.  If the
>>>> timeout is
>>>>> + * longer than the NEXT_TIMER_MAX_DELTA, that
>>>> then?
>>>>
>>>>> return NEXT_TIMER_MAX_DELTA
>>>>> + * instead.
>>>> I can read code.
>>> Come on, it's not exactly x++; /* add one to x */
>>>
>>> It's faster to read code (as well as know the constraints within which
>>> you can modify it without having to spend a lot of time digesting all
>>> the callers' use cases) when you have a high level description of its
>>> interface contract, and can be selective about when to zoom in to the
>>> details.  Linux kernel code tends to be bad about this.
>> Yeah, not opposed to leave that part in :).
>>
>>>> The important piece of information in the comment is
>>>> missing: The reason.
>>> The reason for what?  Why you want to know the next timeout?  That's
>>> the caller's business.  Or why we use NEXT_TIMER_MAX_DELTA as the limit?
>> Why we use the limit. IIRC it was explained in the last thread, just didn't make
>> its way into the comment.
> Earlier we have a comment on the #define MAX_TIMEOUT (new define added for a purpose, so the comment described the puspose).
> Now we uses the generic #define NEX_TIMER_MAX_DELTA (include/linux/timer.h), so removed the comment.

Ah, ok. Just saying, if you comment on some mechanism, like you did 
here, please also include the reasoning behind it. For example

   Do foo if x is true.

isn't particularly helpful. However

   Do foo if x is true because the bar API will break with high values

is very helpful. It includes the action and reason of the code :). 
Alternatively, to me the same as above would be

   /* bar API will break with high values */
   if (x)
     do(foo)

because in this case the code is the action description. Either variant 
works fine for me.

>
>>>>> +void kvmppc_watchdog_func(unsigned long data) {
>>>>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>>> +    u32 tsr, new_tsr;
>>>>> +    int final;
>>>>> +
>>>>> +    do {
>>>>> +        new_tsr = tsr = vcpu->arch.tsr;
>>>>> +        final = 0;
>>>>> +
>>>>> +        /* Time out event */
>>>>> +        if (tsr & TSR_ENW) {
>>>>> +            if (tsr & TSR_WIS) {
>>>>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
>>>>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>> Can't we just poke the vcpu to exit the VM and do the above on its own?
>>> We've discussed this before.  TSR updates are done via atomics, and we
>>> send a request for the vcpu to act on the result.  This is how the
>>> decrementer works.
>>>
>>> http://www.spinics.net/lists/kvm-ppc/msg03169.html
>> Yeah, the major difference to the dec is the atomicity of the whole thing. Dec
>> changes one bit to enable the interrupt line. The final expiration is more
>> complex.
> Is not setting the TSR.WRS atomic here (cmpxchg() will handle this)?

Final expiration sets TCR. TSR should be ok.

>
>>>> This is the watdog expired case, right?
>>> Final expiration, yes.
>>>
>>>> I'd also prefer to have an
>>>> explicit event for the expiry than a special TSR check in the main loop.
>>> So check TSR[WRS] in update_timer_ints(), and have it queue a
>>> pseudoexception?
>> Or here.
> Do we mean define a sudo IROPRIO for final expiry.

We can also define an event that is sent through kvm_make_request. But 
yeah, IRQPRIO is probably easier. Not 100% sure which way is better 
though. Avi, any preferences?

>
>>> That would eliminate the need to change the runnable function.
>>>
>>>> Also call me sceptic on the reset of tcr. If our user space watchdog
>>>> event is "write a message", then we essentially want to hide the fact
>>>> that the watchdog expired from the guest, right? In that case, the
>>>> second time-out wouldn't do anything guest visible.
>>> This was probably copied straight out of the hardware documentation,
>>> which explicitly says TCR[WRC] gets set to zero on final expiration
>>> (as part of reset).  We should leave that part up to userspace.  It
>>> definitely shouldn't be done inside the cmpxchg loop (or from
>>> interrupt context -- only TSR gets the atomic treatment).  I don't
>>> think the read of TCR outside vcpu context is a problem, though.
>> Yeah, but it'd just make me less wary if only the vcpu thread itself accesses
>> vcpu internal registers that aren't irq state and thus designed for it (TSR).
>>
>> But yes, the most flexible way would probably be to do it from user space. Since
>> it'd happen from within the vcpu context of user space, we can also guarantee
>> that the TCR access is atomic.
> Yes, will move the tcr.wrc clearing to userspace.
>
>>>>> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) {
>>>>> -    return !(v->arch.shared->msr & MSR_WE) ||
>>>>> -           !!(v->arch.pending_exceptions) ||
>>>>> -           v->requests;
>>>>> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||
>>>>> +           !!(v->arch.pending_exceptions) ||
>>>>> +           v->requests;
>>>>> +
>>>>> +    ret = ret || kvmppc_get_tsr_wrc(v);
>>>> Why do you need to declare the cpu as non-runnable when a watchdog
>>>> event occured?
>>> It's the other way around -- it's always runnable when a watchdog exit
>>> is pending.  It's like a pending exception.
>> Ah, so yes, we should just shove it into pending_exceptions then.
> Pending_exception? You mean sudo again here as said earlier.

pseudo :). Yeah, I'm referring to above. No need to check 500 different 
conditions when we already have a bitmap that says "event is pending".


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan July 17, 2012, 1:15 p.m. UTC | #7
> -----Original Message-----
> From: kvm-ppc-owner@vger.kernel.org [mailto:kvm-ppc-owner@vger.kernel.org] On
> Behalf Of Alexander Graf
> Sent: Tuesday, July 17, 2012 6:22 PM
> To: Bhushan Bharat-R65777
> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>; <kvm@vger.kernel.org>;
> <bharatb.yadav@gmail.com>; Benjamin Herrenschmidt; Kumar Gala; Avi Kivity
> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
> 
> On 07/17/2012 11:57 AM, Bhushan Bharat-R65777 wrote:
> >
> >> -----Original Message-----
> >> From: kvm-ppc-owner@vger.kernel.org
> >> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander Graf
> >> Sent: Tuesday, July 17, 2012 12:50 PM
> >> To: Wood Scott-B07421
> >> Cc: Bhushan Bharat-R65777; <kvm-ppc@vger.kernel.org>;
> >> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Bhushan
> >> Bharat-R65777; Benjamin Herrenschmidt; Kumar Gala
> >> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
> >>
> >>
> >>
> >> On 17.07.2012, at 03:02, Scott Wood <scottwood@freescale.com> wrote:
> >>
> >>> On 07/16/2012 12:18 PM, Alexander Graf wrote:
> >>>>> +/*
> >>>>> + * Return the number of jiffies until the next timeout.  If the
> >>>> timeout is
> >>>>> + * longer than the NEXT_TIMER_MAX_DELTA, that
> >>>> then?
> >>>>
> >>>>> return NEXT_TIMER_MAX_DELTA
> >>>>> + * instead.
> >>>> I can read code.
> >>> Come on, it's not exactly x++; /* add one to x */
> >>>
> >>> It's faster to read code (as well as know the constraints within
> >>> which you can modify it without having to spend a lot of time
> >>> digesting all the callers' use cases) when you have a high level
> >>> description of its interface contract, and can be selective about
> >>> when to zoom in to the details.  Linux kernel code tends to be bad about
> this.
> >> Yeah, not opposed to leave that part in :).
> >>
> >>>> The important piece of information in the comment is
> >>>> missing: The reason.
> >>> The reason for what?  Why you want to know the next timeout?  That's
> >>> the caller's business.  Or why we use NEXT_TIMER_MAX_DELTA as the limit?
> >> Why we use the limit. IIRC it was explained in the last thread, just
> >> didn't make its way into the comment.
> > Earlier we have a comment on the #define MAX_TIMEOUT (new define added for a
> purpose, so the comment described the puspose).
> > Now we uses the generic #define NEX_TIMER_MAX_DELTA (include/linux/timer.h),
> so removed the comment.
> 
> Ah, ok. Just saying, if you comment on some mechanism, like you did here, please
> also include the reasoning behind it. For example
> 
>    Do foo if x is true.
> 
> isn't particularly helpful. However
> 
>    Do foo if x is true because the bar API will break with high values
> 
> is very helpful. It includes the action and reason of the code :).
> Alternatively, to me the same as above would be
> 
>    /* bar API will break with high values */
>    if (x)
>      do(foo)
> 
> because in this case the code is the action description. Either variant works
> fine for me.

Ok :)

> 
> >
> >>>>> +void kvmppc_watchdog_func(unsigned long data) {
> >>>>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> >>>>> +    u32 tsr, new_tsr;
> >>>>> +    int final;
> >>>>> +
> >>>>> +    do {
> >>>>> +        new_tsr = tsr = vcpu->arch.tsr;
> >>>>> +        final = 0;
> >>>>> +
> >>>>> +        /* Time out event */
> >>>>> +        if (tsr & TSR_ENW) {
> >>>>> +            if (tsr & TSR_WIS) {
> >>>>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
> >>>>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
> >>>>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
> >>>> Can't we just poke the vcpu to exit the VM and do the above on its own?
> >>> We've discussed this before.  TSR updates are done via atomics, and
> >>> we send a request for the vcpu to act on the result.  This is how
> >>> the decrementer works.
> >>>
> >>> http://www.spinics.net/lists/kvm-ppc/msg03169.html
> >> Yeah, the major difference to the dec is the atomicity of the whole
> >> thing. Dec changes one bit to enable the interrupt line. The final
> >> expiration is more complex.
> > Is not setting the TSR.WRS atomic here (cmpxchg() will handle this)?
> 
> Final expiration sets TCR. TSR should be ok.

It is clearing some TCR bits :)

Let us move the TCR clearing to userspace (please see below response ^^). Then it is just setting TSR. Right?

> 
> >
> >>>> This is the watdog expired case, right?
> >>> Final expiration, yes.
> >>>
> >>>> I'd also prefer to have an
> >>>> explicit event for the expiry than a special TSR check in the main loop.
> >>> So check TSR[WRS] in update_timer_ints(), and have it queue a
> >>> pseudoexception?
> >> Or here.
> > Do we mean define a sudo IROPRIO for final expiry.
> 
> We can also define an event that is sent through kvm_make_request. But yeah,
> IRQPRIO is probably easier. Not 100% sure which way is better though. Avi, any
> preferences?
> 
> >
> >>> That would eliminate the need to change the runnable function.
> >>>
> >>>> Also call me sceptic on the reset of tcr. If our user space watchdog
> >>>> event is "write a message", then we essentially want to hide the fact
> >>>> that the watchdog expired from the guest, right? In that case, the
> >>>> second time-out wouldn't do anything guest visible.
> >>> This was probably copied straight out of the hardware documentation,
> >>> which explicitly says TCR[WRC] gets set to zero on final expiration
> >>> (as part of reset).  We should leave that part up to userspace.  It
> >>> definitely shouldn't be done inside the cmpxchg loop (or from
> >>> interrupt context -- only TSR gets the atomic treatment).  I don't
> >>> think the read of TCR outside vcpu context is a problem, though.
> >> Yeah, but it'd just make me less wary if only the vcpu thread itself accesses
> >> vcpu internal registers that aren't irq state and thus designed for it (TSR).
> >>
> >> But yes, the most flexible way would probably be to do it from user space.
> Since
> >> it'd happen from within the vcpu context of user space, we can also guarantee
> >> that the TCR access is atomic.
> > Yes, will move the tcr.wrc clearing to userspace.


^^ Here .. It is good to move clearing the TCR to guest.

Thanks
-Bharat

> >
> >>>>> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) {
> >>>>> -    return !(v->arch.shared->msr & MSR_WE) ||
> >>>>> -           !!(v->arch.pending_exceptions) ||
> >>>>> -           v->requests;
> >>>>> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||
> >>>>> +           !!(v->arch.pending_exceptions) ||
> >>>>> +           v->requests;
> >>>>> +
> >>>>> +    ret = ret || kvmppc_get_tsr_wrc(v);
> >>>> Why do you need to declare the cpu as non-runnable when a watchdog
> >>>> event occured?
> >>> It's the other way around -- it's always runnable when a watchdog exit
> >>> is pending.  It's like a pending exception.
> >> Ah, so yes, we should just shove it into pending_exceptions then.
> > Pending_exception? You mean sudo again here as said earlier.
> 
> pseudo :). Yeah, I'm referring to above. No need to check 500 different
> conditions when we already have a bitmap that says "event is pending".
> 
> 
> Alex
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf July 17, 2012, 2:01 p.m. UTC | #8
On 07/17/2012 03:15 PM, Bhushan Bharat-R65777 wrote:
>
>> -----Original Message-----
>> From: kvm-ppc-owner@vger.kernel.org [mailto:kvm-ppc-owner@vger.kernel.org] On
>> Behalf Of Alexander Graf
>> Sent: Tuesday, July 17, 2012 6:22 PM
>> To: Bhushan Bharat-R65777
>> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>; <kvm@vger.kernel.org>;
>> <bharatb.yadav@gmail.com>; Benjamin Herrenschmidt; Kumar Gala; Avi Kivity
>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>
>> On 07/17/2012 11:57 AM, Bhushan Bharat-R65777 wrote:
>>>> -----Original Message-----
>>>> From: kvm-ppc-owner@vger.kernel.org
>>>> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander Graf
>>>> Sent: Tuesday, July 17, 2012 12:50 PM
>>>> To: Wood Scott-B07421
>>>> Cc: Bhushan Bharat-R65777; <kvm-ppc@vger.kernel.org>;
>>>> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Bhushan
>>>> Bharat-R65777; Benjamin Herrenschmidt; Kumar Gala
>>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>>>
>>>>
>>>>
>>>> On 17.07.2012, at 03:02, Scott Wood <scottwood@freescale.com> wrote:
>>>>
>>>>> On 07/16/2012 12:18 PM, Alexander Graf wrote:
>>>>>>> +/*
>>>>>>> + * Return the number of jiffies until the next timeout.  If the
>>>>>> timeout is
>>>>>>> + * longer than the NEXT_TIMER_MAX_DELTA, that
>>>>>> then?
>>>>>>
>>>>>>> return NEXT_TIMER_MAX_DELTA
>>>>>>> + * instead.
>>>>>> I can read code.
>>>>> Come on, it's not exactly x++; /* add one to x */
>>>>>
>>>>> It's faster to read code (as well as know the constraints within
>>>>> which you can modify it without having to spend a lot of time
>>>>> digesting all the callers' use cases) when you have a high level
>>>>> description of its interface contract, and can be selective about
>>>>> when to zoom in to the details.  Linux kernel code tends to be bad about
>> this.
>>>> Yeah, not opposed to leave that part in :).
>>>>
>>>>>> The important piece of information in the comment is
>>>>>> missing: The reason.
>>>>> The reason for what?  Why you want to know the next timeout?  That's
>>>>> the caller's business.  Or why we use NEXT_TIMER_MAX_DELTA as the limit?
>>>> Why we use the limit. IIRC it was explained in the last thread, just
>>>> didn't make its way into the comment.
>>> Earlier we have a comment on the #define MAX_TIMEOUT (new define added for a
>> purpose, so the comment described the puspose).
>>> Now we uses the generic #define NEX_TIMER_MAX_DELTA (include/linux/timer.h),
>> so removed the comment.
>>
>> Ah, ok. Just saying, if you comment on some mechanism, like you did here, please
>> also include the reasoning behind it. For example
>>
>>     Do foo if x is true.
>>
>> isn't particularly helpful. However
>>
>>     Do foo if x is true because the bar API will break with high values
>>
>> is very helpful. It includes the action and reason of the code :).
>> Alternatively, to me the same as above would be
>>
>>     /* bar API will break with high values */
>>     if (x)
>>       do(foo)
>>
>> because in this case the code is the action description. Either variant works
>> fine for me.
> Ok :)
>
>>>>>>> +void kvmppc_watchdog_func(unsigned long data) {
>>>>>>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>>>>> +    u32 tsr, new_tsr;
>>>>>>> +    int final;
>>>>>>> +
>>>>>>> +    do {
>>>>>>> +        new_tsr = tsr = vcpu->arch.tsr;
>>>>>>> +        final = 0;
>>>>>>> +
>>>>>>> +        /* Time out event */
>>>>>>> +        if (tsr & TSR_ENW) {
>>>>>>> +            if (tsr & TSR_WIS) {
>>>>>>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>>>>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
>>>>>>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>>>> Can't we just poke the vcpu to exit the VM and do the above on its own?
>>>>> We've discussed this before.  TSR updates are done via atomics, and
>>>>> we send a request for the vcpu to act on the result.  This is how
>>>>> the decrementer works.
>>>>>
>>>>> http://www.spinics.net/lists/kvm-ppc/msg03169.html
>>>> Yeah, the major difference to the dec is the atomicity of the whole
>>>> thing. Dec changes one bit to enable the interrupt line. The final
>>>> expiration is more complex.
>>> Is not setting the TSR.WRS atomic here (cmpxchg() will handle this)?
>> Final expiration sets TCR. TSR should be ok.
> It is clearing some TCR bits :)
>
> Let us move the TCR clearing to userspace (please see below response ^^). Then it is just setting TSR. Right?

Then TSR is still set, right? So we don't set it, it just keeps on being 
1. We need to trigger another expired event in that if branch now.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan July 17, 2012, 2:13 p.m. UTC | #9
> -----Original Message-----
> From: kvm-ppc-owner@vger.kernel.org [mailto:kvm-ppc-owner@vger.kernel.org] On
> Behalf Of Alexander Graf
> Sent: Tuesday, July 17, 2012 7:31 PM
> To: Bhushan Bharat-R65777
> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>; <kvm@vger.kernel.org>;
> <bharatb.yadav@gmail.com>; Benjamin Herrenschmidt; Kumar Gala; Avi Kivity
> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
> 
> On 07/17/2012 03:15 PM, Bhushan Bharat-R65777 wrote:
> >
> >> -----Original Message-----
> >> From: kvm-ppc-owner@vger.kernel.org
> >> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander Graf
> >> Sent: Tuesday, July 17, 2012 6:22 PM
> >> To: Bhushan Bharat-R65777
> >> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>;
> >> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Benjamin
> >> Herrenschmidt; Kumar Gala; Avi Kivity
> >> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
> >>
> >> On 07/17/2012 11:57 AM, Bhushan Bharat-R65777 wrote:
> >>>> -----Original Message-----
> >>>> From: kvm-ppc-owner@vger.kernel.org
> >>>> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander Graf
> >>>> Sent: Tuesday, July 17, 2012 12:50 PM
> >>>> To: Wood Scott-B07421
> >>>> Cc: Bhushan Bharat-R65777; <kvm-ppc@vger.kernel.org>;
> >>>> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Bhushan
> >>>> Bharat-R65777; Benjamin Herrenschmidt; Kumar Gala
> >>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
> >>>>
> >>>>
> >>>>
> >>>> On 17.07.2012, at 03:02, Scott Wood <scottwood@freescale.com> wrote:
> >>>>
> >>>>> On 07/16/2012 12:18 PM, Alexander Graf wrote:
> >>>>>>> +/*
> >>>>>>> + * Return the number of jiffies until the next timeout.  If the
> >>>>>> timeout is
> >>>>>>> + * longer than the NEXT_TIMER_MAX_DELTA, that
> >>>>>> then?
> >>>>>>
> >>>>>>> return NEXT_TIMER_MAX_DELTA
> >>>>>>> + * instead.
> >>>>>> I can read code.
> >>>>> Come on, it's not exactly x++; /* add one to x */
> >>>>>
> >>>>> It's faster to read code (as well as know the constraints within
> >>>>> which you can modify it without having to spend a lot of time
> >>>>> digesting all the callers' use cases) when you have a high level
> >>>>> description of its interface contract, and can be selective about
> >>>>> when to zoom in to the details.  Linux kernel code tends to be bad
> >>>>> about
> >> this.
> >>>> Yeah, not opposed to leave that part in :).
> >>>>
> >>>>>> The important piece of information in the comment is
> >>>>>> missing: The reason.
> >>>>> The reason for what?  Why you want to know the next timeout?
> >>>>> That's the caller's business.  Or why we use NEXT_TIMER_MAX_DELTA as the
> limit?
> >>>> Why we use the limit. IIRC it was explained in the last thread,
> >>>> just didn't make its way into the comment.
> >>> Earlier we have a comment on the #define MAX_TIMEOUT (new define
> >>> added for a
> >> purpose, so the comment described the puspose).
> >>> Now we uses the generic #define NEX_TIMER_MAX_DELTA
> >>> (include/linux/timer.h),
> >> so removed the comment.
> >>
> >> Ah, ok. Just saying, if you comment on some mechanism, like you did
> >> here, please also include the reasoning behind it. For example
> >>
> >>     Do foo if x is true.
> >>
> >> isn't particularly helpful. However
> >>
> >>     Do foo if x is true because the bar API will break with high
> >> values
> >>
> >> is very helpful. It includes the action and reason of the code :).
> >> Alternatively, to me the same as above would be
> >>
> >>     /* bar API will break with high values */
> >>     if (x)
> >>       do(foo)
> >>
> >> because in this case the code is the action description. Either
> >> variant works fine for me.
> > Ok :)
> >
> >>>>>>> +void kvmppc_watchdog_func(unsigned long data) {
> >>>>>>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> >>>>>>> +    u32 tsr, new_tsr;
> >>>>>>> +    int final;
> >>>>>>> +
> >>>>>>> +    do {
> >>>>>>> +        new_tsr = tsr = vcpu->arch.tsr;
> >>>>>>> +        final = 0;
> >>>>>>> +
> >>>>>>> +        /* Time out event */
> >>>>>>> +        if (tsr & TSR_ENW) {
> >>>>>>> +            if (tsr & TSR_WIS) {
> >>>>>>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
> >>>>>>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
> >>>>>>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
> >>>>>> Can't we just poke the vcpu to exit the VM and do the above on its own?
> >>>>> We've discussed this before.  TSR updates are done via atomics,
> >>>>> and we send a request for the vcpu to act on the result.  This is
> >>>>> how the decrementer works.
> >>>>>
> >>>>> http://www.spinics.net/lists/kvm-ppc/msg03169.html
> >>>> Yeah, the major difference to the dec is the atomicity of the whole
> >>>> thing. Dec changes one bit to enable the interrupt line. The final
> >>>> expiration is more complex.
> >>> Is not setting the TSR.WRS atomic here (cmpxchg() will handle this)?
> >> Final expiration sets TCR. TSR should be ok.
> > It is clearing some TCR bits :)
> >
> > Let us move the TCR clearing to userspace (please see below response ^^). Then
> it is just setting TSR. Right?
> 
> Then TSR is still set, right? So we don't set it, it just keeps on being 1. We
> need to trigger another expired event in that if branch now.

I am sorry Alex but I did not get you.

What I meant was that you were having concern that dec is atomic while watchdog final expiration (complex) is not atomic, but if we move the TCR.WRC clearing to userspace on final expiration then are you ok with current code?

Thanks
-Bharat

> 
> 
> Alex
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf July 17, 2012, 2:35 p.m. UTC | #10
On 07/17/2012 04:13 PM, Bhushan Bharat-R65777 wrote:
>
>> -----Original Message-----
>> From: kvm-ppc-owner@vger.kernel.org [mailto:kvm-ppc-owner@vger.kernel.org] On
>> Behalf Of Alexander Graf
>> Sent: Tuesday, July 17, 2012 7:31 PM
>> To: Bhushan Bharat-R65777
>> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>; <kvm@vger.kernel.org>;
>> <bharatb.yadav@gmail.com>; Benjamin Herrenschmidt; Kumar Gala; Avi Kivity
>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>
>> On 07/17/2012 03:15 PM, Bhushan Bharat-R65777 wrote:
>>>> -----Original Message-----
>>>> From: kvm-ppc-owner@vger.kernel.org
>>>> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander Graf
>>>> Sent: Tuesday, July 17, 2012 6:22 PM
>>>> To: Bhushan Bharat-R65777
>>>> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>;
>>>> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Benjamin
>>>> Herrenschmidt; Kumar Gala; Avi Kivity
>>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>>>
>>>> On 07/17/2012 11:57 AM, Bhushan Bharat-R65777 wrote:
>>>>>> -----Original Message-----
>>>>>> From: kvm-ppc-owner@vger.kernel.org
>>>>>> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander Graf
>>>>>> Sent: Tuesday, July 17, 2012 12:50 PM
>>>>>> To: Wood Scott-B07421
>>>>>> Cc: Bhushan Bharat-R65777; <kvm-ppc@vger.kernel.org>;
>>>>>> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Bhushan
>>>>>> Bharat-R65777; Benjamin Herrenschmidt; Kumar Gala
>>>>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 17.07.2012, at 03:02, Scott Wood <scottwood@freescale.com> wrote:
>>>>>>
>>>>>>> On 07/16/2012 12:18 PM, Alexander Graf wrote:
>>>>>>>>> +/*
>>>>>>>>> + * Return the number of jiffies until the next timeout.  If the
>>>>>>>> timeout is
>>>>>>>>> + * longer than the NEXT_TIMER_MAX_DELTA, that
>>>>>>>> then?
>>>>>>>>
>>>>>>>>> return NEXT_TIMER_MAX_DELTA
>>>>>>>>> + * instead.
>>>>>>>> I can read code.
>>>>>>> Come on, it's not exactly x++; /* add one to x */
>>>>>>>
>>>>>>> It's faster to read code (as well as know the constraints within
>>>>>>> which you can modify it without having to spend a lot of time
>>>>>>> digesting all the callers' use cases) when you have a high level
>>>>>>> description of its interface contract, and can be selective about
>>>>>>> when to zoom in to the details.  Linux kernel code tends to be bad
>>>>>>> about
>>>> this.
>>>>>> Yeah, not opposed to leave that part in :).
>>>>>>
>>>>>>>> The important piece of information in the comment is
>>>>>>>> missing: The reason.
>>>>>>> The reason for what?  Why you want to know the next timeout?
>>>>>>> That's the caller's business.  Or why we use NEXT_TIMER_MAX_DELTA as the
>> limit?
>>>>>> Why we use the limit. IIRC it was explained in the last thread,
>>>>>> just didn't make its way into the comment.
>>>>> Earlier we have a comment on the #define MAX_TIMEOUT (new define
>>>>> added for a
>>>> purpose, so the comment described the puspose).
>>>>> Now we uses the generic #define NEX_TIMER_MAX_DELTA
>>>>> (include/linux/timer.h),
>>>> so removed the comment.
>>>>
>>>> Ah, ok. Just saying, if you comment on some mechanism, like you did
>>>> here, please also include the reasoning behind it. For example
>>>>
>>>>      Do foo if x is true.
>>>>
>>>> isn't particularly helpful. However
>>>>
>>>>      Do foo if x is true because the bar API will break with high
>>>> values
>>>>
>>>> is very helpful. It includes the action and reason of the code :).
>>>> Alternatively, to me the same as above would be
>>>>
>>>>      /* bar API will break with high values */
>>>>      if (x)
>>>>        do(foo)
>>>>
>>>> because in this case the code is the action description. Either
>>>> variant works fine for me.
>>> Ok :)
>>>
>>>>>>>>> +void kvmppc_watchdog_func(unsigned long data) {
>>>>>>>>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>>>>>>> +    u32 tsr, new_tsr;
>>>>>>>>> +    int final;
>>>>>>>>> +
>>>>>>>>> +    do {
>>>>>>>>> +        new_tsr = tsr = vcpu->arch.tsr;
>>>>>>>>> +        final = 0;
>>>>>>>>> +
>>>>>>>>> +        /* Time out event */
>>>>>>>>> +        if (tsr & TSR_ENW) {
>>>>>>>>> +            if (tsr & TSR_WIS) {
>>>>>>>>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>>>>>>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
>>>>>>>>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>>>>>> Can't we just poke the vcpu to exit the VM and do the above on its own?
>>>>>>> We've discussed this before.  TSR updates are done via atomics,
>>>>>>> and we send a request for the vcpu to act on the result.  This is
>>>>>>> how the decrementer works.
>>>>>>>
>>>>>>> http://www.spinics.net/lists/kvm-ppc/msg03169.html
>>>>>> Yeah, the major difference to the dec is the atomicity of the whole
>>>>>> thing. Dec changes one bit to enable the interrupt line. The final
>>>>>> expiration is more complex.
>>>>> Is not setting the TSR.WRS atomic here (cmpxchg() will handle this)?
>>>> Final expiration sets TCR. TSR should be ok.
>>> It is clearing some TCR bits :)
>>>
>>> Let us move the TCR clearing to userspace (please see below response ^^). Then
>> it is just setting TSR. Right?
>>
>> Then TSR is still set, right? So we don't set it, it just keeps on being 1. We
>> need to trigger another expired event in that if branch now.
> I am sorry Alex but I did not get you.
>
> What I meant was that you were having concern that dec is atomic while watchdog final expiration (complex) is not atomic, but if we move the TCR.WRC clearing to userspace on final expiration then are you ok with current code?

If we move the final expiration TCR.WRC clearing out of that branch, you 
don't have any condition left to check if we are in the final expiration 
path. So you need to set some other bit somewhere (eventually 
pending_exceptions probably) that indicates that we need to return to 
user space.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan July 17, 2012, 4:10 p.m. UTC | #11
> -----Original Message-----
> From: Alexander Graf [mailto:agraf@suse.de]
> Sent: Tuesday, July 17, 2012 8:05 PM
> To: Bhushan Bharat-R65777
> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>; <kvm@vger.kernel.org>;
> <bharatb.yadav@gmail.com>; Benjamin Herrenschmidt; Kumar Gala; Avi Kivity
> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
> 
> On 07/17/2012 04:13 PM, Bhushan Bharat-R65777 wrote:
> >
> >> -----Original Message-----
> >> From: kvm-ppc-owner@vger.kernel.org
> >> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander Graf
> >> Sent: Tuesday, July 17, 2012 7:31 PM
> >> To: Bhushan Bharat-R65777
> >> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>;
> >> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Benjamin
> >> Herrenschmidt; Kumar Gala; Avi Kivity
> >> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
> >>
> >> On 07/17/2012 03:15 PM, Bhushan Bharat-R65777 wrote:
> >>>> -----Original Message-----
> >>>> From: kvm-ppc-owner@vger.kernel.org
> >>>> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander Graf
> >>>> Sent: Tuesday, July 17, 2012 6:22 PM
> >>>> To: Bhushan Bharat-R65777
> >>>> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>;
> >>>> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Benjamin
> >>>> Herrenschmidt; Kumar Gala; Avi Kivity
> >>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
> >>>>
> >>>> On 07/17/2012 11:57 AM, Bhushan Bharat-R65777 wrote:
> >>>>>> -----Original Message-----
> >>>>>> From: kvm-ppc-owner@vger.kernel.org
> >>>>>> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander
> >>>>>> Graf
> >>>>>> Sent: Tuesday, July 17, 2012 12:50 PM
> >>>>>> To: Wood Scott-B07421
> >>>>>> Cc: Bhushan Bharat-R65777; <kvm-ppc@vger.kernel.org>;
> >>>>>> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Bhushan
> >>>>>> Bharat-R65777; Benjamin Herrenschmidt; Kumar Gala
> >>>>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog
> >>>>>> emulation
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 17.07.2012, at 03:02, Scott Wood <scottwood@freescale.com> wrote:
> >>>>>>
> >>>>>>> On 07/16/2012 12:18 PM, Alexander Graf wrote:
> >>>>>>>>> +/*
> >>>>>>>>> + * Return the number of jiffies until the next timeout.  If
> >>>>>>>>> +the
> >>>>>>>> timeout is
> >>>>>>>>> + * longer than the NEXT_TIMER_MAX_DELTA, that
> >>>>>>>> then?
> >>>>>>>>
> >>>>>>>>> return NEXT_TIMER_MAX_DELTA
> >>>>>>>>> + * instead.
> >>>>>>>> I can read code.
> >>>>>>> Come on, it's not exactly x++; /* add one to x */
> >>>>>>>
> >>>>>>> It's faster to read code (as well as know the constraints within
> >>>>>>> which you can modify it without having to spend a lot of time
> >>>>>>> digesting all the callers' use cases) when you have a high level
> >>>>>>> description of its interface contract, and can be selective
> >>>>>>> about when to zoom in to the details.  Linux kernel code tends
> >>>>>>> to be bad about
> >>>> this.
> >>>>>> Yeah, not opposed to leave that part in :).
> >>>>>>
> >>>>>>>> The important piece of information in the comment is
> >>>>>>>> missing: The reason.
> >>>>>>> The reason for what?  Why you want to know the next timeout?
> >>>>>>> That's the caller's business.  Or why we use
> >>>>>>> NEXT_TIMER_MAX_DELTA as the
> >> limit?
> >>>>>> Why we use the limit. IIRC it was explained in the last thread,
> >>>>>> just didn't make its way into the comment.
> >>>>> Earlier we have a comment on the #define MAX_TIMEOUT (new define
> >>>>> added for a
> >>>> purpose, so the comment described the puspose).
> >>>>> Now we uses the generic #define NEX_TIMER_MAX_DELTA
> >>>>> (include/linux/timer.h),
> >>>> so removed the comment.
> >>>>
> >>>> Ah, ok. Just saying, if you comment on some mechanism, like you did
> >>>> here, please also include the reasoning behind it. For example
> >>>>
> >>>>      Do foo if x is true.
> >>>>
> >>>> isn't particularly helpful. However
> >>>>
> >>>>      Do foo if x is true because the bar API will break with high
> >>>> values
> >>>>
> >>>> is very helpful. It includes the action and reason of the code :).
> >>>> Alternatively, to me the same as above would be
> >>>>
> >>>>      /* bar API will break with high values */
> >>>>      if (x)
> >>>>        do(foo)
> >>>>
> >>>> because in this case the code is the action description. Either
> >>>> variant works fine for me.
> >>> Ok :)
> >>>
> >>>>>>>>> +void kvmppc_watchdog_func(unsigned long data) {
> >>>>>>>>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> >>>>>>>>> +    u32 tsr, new_tsr;
> >>>>>>>>> +    int final;
> >>>>>>>>> +
> >>>>>>>>> +    do {
> >>>>>>>>> +        new_tsr = tsr = vcpu->arch.tsr;
> >>>>>>>>> +        final = 0;
> >>>>>>>>> +
> >>>>>>>>> +        /* Time out event */
> >>>>>>>>> +        if (tsr & TSR_ENW) {
> >>>>>>>>> +            if (tsr & TSR_WIS) {
> >>>>>>>>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
> >>>>>>>>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
> >>>>>>>>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
> >>>>>>>> Can't we just poke the vcpu to exit the VM and do the above on its own?
> >>>>>>> We've discussed this before.  TSR updates are done via atomics,
> >>>>>>> and we send a request for the vcpu to act on the result.  This
> >>>>>>> is how the decrementer works.
> >>>>>>>
> >>>>>>> http://www.spinics.net/lists/kvm-ppc/msg03169.html
> >>>>>> Yeah, the major difference to the dec is the atomicity of the
> >>>>>> whole thing. Dec changes one bit to enable the interrupt line.
> >>>>>> The final expiration is more complex.
> >>>>> Is not setting the TSR.WRS atomic here (cmpxchg() will handle this)?
> >>>> Final expiration sets TCR. TSR should be ok.
> >>> It is clearing some TCR bits :)
> >>>
> >>> Let us move the TCR clearing to userspace (please see below response
> >>> ^^). Then
> >> it is just setting TSR. Right?
> >>
> >> Then TSR is still set, right? So we don't set it, it just keeps on
> >> being 1. We need to trigger another expired event in that if branch now.
> > I am sorry Alex but I did not get you.
> >
> > What I meant was that you were having concern that dec is atomic while
> watchdog final expiration (complex) is not atomic, but if we move the TCR.WRC
> clearing to userspace on final expiration then are you ok with current code?
> 
> If we move the final expiration TCR.WRC clearing out of that branch, you don't
> have any condition left to check if we are in the final expiration path. So you
> need to set some other bit somewhere (eventually pending_exceptions probably)
> that indicates that we need to return to user space.

Thanks Alex... 

We still have TSR.WRS (TCR.WRC is different if you are getting confused with that), we can use that as a condition for final expiration (the way we are using currently in code).

-Bharat

> 
> 
> Alex
> 


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood July 17, 2012, 4:27 p.m. UTC | #12
On 07/17/2012 09:35 AM, Alexander Graf wrote:
> On 07/17/2012 04:13 PM, Bhushan Bharat-R65777 wrote:
>>
>>> -----Original Message-----
>>> From: kvm-ppc-owner@vger.kernel.org
>>> [mailto:kvm-ppc-owner@vger.kernel.org] On
>>> Behalf Of Alexander Graf
>>> Sent: Tuesday, July 17, 2012 7:31 PM
>>> To: Bhushan Bharat-R65777
>>> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>; <kvm@vger.kernel.org>;
>>> <bharatb.yadav@gmail.com>; Benjamin Herrenschmidt; Kumar Gala; Avi
>>> Kivity
>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>>
>>> On 07/17/2012 03:15 PM, Bhushan Bharat-R65777 wrote:
>>>>> -----Original Message-----
>>>>> From: kvm-ppc-owner@vger.kernel.org
>>>>> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander Graf
>>>>> Sent: Tuesday, July 17, 2012 6:22 PM
>>>>> To: Bhushan Bharat-R65777
>>>>> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>;
>>>>> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Benjamin
>>>>> Herrenschmidt; Kumar Gala; Avi Kivity
>>>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>>>>
>>>>> On 07/17/2012 11:57 AM, Bhushan Bharat-R65777 wrote:
>>>>>> Is not setting the TSR.WRS atomic here (cmpxchg() will handle this)?
>>>>> Final expiration sets TCR. TSR should be ok.
>>>> It is clearing some TCR bits :)
>>>>
>>>> Let us move the TCR clearing to userspace (please see below response
>>>> ^^). Then
>>> it is just setting TSR. Right?
>>>
>>> Then TSR is still set, right? So we don't set it, it just keeps on
>>> being 1. We
>>> need to trigger another expired event in that if branch now.
>> I am sorry Alex but I did not get you.
>>
>> What I meant was that you were having concern that dec is atomic while
>> watchdog final expiration (complex) is not atomic, but if we move the
>> TCR.WRC clearing to userspace on final expiration then are you ok with
>> current code?
> 
> If we move the final expiration TCR.WRC clearing out of that branch, you
> don't have any condition left to check if we are in the final expiration
> path.

I don't follow.

Determining whether we're on final expiration is based only on the
previous value of TSR[ENW,WIS] when the timer expires.  We look at TCR
to determine whether we need to actually exit to QEMU (as opposed to a
final expiration with no action, which still should result in the timer
being stopped), but I don't see any problematic races there as long as
we don't try to update TCR from that context.

> So you need to set some other bit somewhere (eventually
> pending_exceptions probably) that indicates that we need to return to
> user space.

We should do that to simplify other parts of the code, but I'm not sure
what it has to do with TCR[WRC].

-Scott

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood July 17, 2012, 4:37 p.m. UTC | #13
On 07/17/2012 06:31 AM, Bhushan Bharat-R65777 wrote:
>>>>  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)  {
>>>> -    return !(v->arch.shared->msr & MSR_WE) ||
>>>> -           !!(v->arch.pending_exceptions) ||
>>>> -           v->requests;
>>>> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||
>>>> +           !!(v->arch.pending_exceptions) ||
>>>> +           v->requests;
>>>> +
>>>> +    ret = ret || kvmppc_get_tsr_wrc(v);
>>>
>>> Why do you need to declare the cpu as non-runnable when a watchdog
>>> event occured?
>>
>> It's the other way around -- it's always runnable when a watchdog exit is
>> pending.  It's like a pending exception.
> 
> With the above check, Are we trying to handle the case where watchdog
> interrupt bit in pending_exception is cleared by guest after final
> expiry but before the qemu exit?

No, we're just trying to test the actual condition we want to exit on.
The watchdog interrupt might be masked (either with WIE or CE).

> And we want that if TSR.WRS update
> wins the race with clearing of watchdog interrupt condition from
> guest then anyways let QEMU exit with reason KVM_EXIT_WDT?

What race?  If ENW and WIS are both set when the watchdog timer fires,
it's a final expiration.  It's irrelevant what happens to WIS after that
point, before enforcement kicks in.

> What if we do not allow guest clear watchdog interrupt condition if
> final expiry already happened?

What would that solve?

-Scott

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf July 17, 2012, 4:51 p.m. UTC | #14
On 07/17/2012 06:27 PM, Scott Wood wrote:
> On 07/17/2012 09:35 AM, Alexander Graf wrote:
>> On 07/17/2012 04:13 PM, Bhushan Bharat-R65777 wrote:
>>>> -----Original Message-----
>>>> From: kvm-ppc-owner@vger.kernel.org
>>>> [mailto:kvm-ppc-owner@vger.kernel.org] On
>>>> Behalf Of Alexander Graf
>>>> Sent: Tuesday, July 17, 2012 7:31 PM
>>>> To: Bhushan Bharat-R65777
>>>> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>; <kvm@vger.kernel.org>;
>>>> <bharatb.yadav@gmail.com>; Benjamin Herrenschmidt; Kumar Gala; Avi
>>>> Kivity
>>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>>>
>>>> On 07/17/2012 03:15 PM, Bhushan Bharat-R65777 wrote:
>>>>>> -----Original Message-----
>>>>>> From: kvm-ppc-owner@vger.kernel.org
>>>>>> [mailto:kvm-ppc-owner@vger.kernel.org] On Behalf Of Alexander Graf
>>>>>> Sent: Tuesday, July 17, 2012 6:22 PM
>>>>>> To: Bhushan Bharat-R65777
>>>>>> Cc: Wood Scott-B07421; <kvm-ppc@vger.kernel.org>;
>>>>>> <kvm@vger.kernel.org>; <bharatb.yadav@gmail.com>; Benjamin
>>>>>> Herrenschmidt; Kumar Gala; Avi Kivity
>>>>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>>>>>
>>>>>> On 07/17/2012 11:57 AM, Bhushan Bharat-R65777 wrote:
>>>>>>> Is not setting the TSR.WRS atomic here (cmpxchg() will handle this)?
>>>>>> Final expiration sets TCR. TSR should be ok.
>>>>> It is clearing some TCR bits :)
>>>>>
>>>>> Let us move the TCR clearing to userspace (please see below response
>>>>> ^^). Then
>>>> it is just setting TSR. Right?
>>>>
>>>> Then TSR is still set, right? So we don't set it, it just keeps on
>>>> being 1. We
>>>> need to trigger another expired event in that if branch now.
>>> I am sorry Alex but I did not get you.
>>>
>>> What I meant was that you were having concern that dec is atomic while
>>> watchdog final expiration (complex) is not atomic, but if we move the
>>> TCR.WRC clearing to userspace on final expiration then are you ok with
>>> current code?
>> If we move the final expiration TCR.WRC clearing out of that branch, you
>> don't have any condition left to check if we are in the final expiration
>> path.
> I don't follow.
>
> Determining whether we're on final expiration is based only on the
> previous value of TSR[ENW,WIS] when the timer expires.  We look at TCR
> to determine whether we need to actually exit to QEMU (as opposed to a
> final expiration with no action, which still should result in the timer
> being stopped), but I don't see any problematic races there as long as
> we don't try to update TCR from that context.

Yeah, what I was trying to say is that I would like to keep the TSR 
state unmodified for the final expiry case. If user space decides to not 
act upon it, it should be able to call into VCPU_RUN and have the same 
behavior as if the watchdog never expired.

So there needs to be a check whether we return to user space based on 
TCR, yes. But we shouldn't modify TSR or TCR in that branch. We only 
trigger an event saying "return to user space with a watchdog expired exit".

Then user space can decide whether to act on it. If it decides to ignore 
it, we don't have an oddball TSR.WRS state lingering around that'd need 
clearing.

That way we can put a single if (watchdog_enabled) on the qemu bubble-up 
event injection. If it's not enabled, it'd be as if TCR.WRC is 0 and the 
watchdog expired without action. If it's enabled, user space can decide 
to fire it up again if it deems it necessary.


Alex

>
>> So you need to set some other bit somewhere (eventually
>> pending_exceptions probably) that indicates that we need to return to
>> user space.
> We should do that to simplify other parts of the code, but I'm not sure
> what it has to do with TCR[WRC].
>
> -Scott
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan July 17, 2012, 4:56 p.m. UTC | #15
> -----Original Message-----

> From: Wood Scott-B07421

> Sent: Tuesday, July 17, 2012 10:08 PM

> To: Bhushan Bharat-R65777

> Cc: Wood Scott-B07421; Alexander Graf; kvm-ppc@vger.kernel.org;

> kvm@vger.kernel.org; bharatb.yadav@gmail.com; Benjamin Herrenschmidt; Kumar Gala

> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation

> 

> On 07/17/2012 06:31 AM, Bhushan Bharat-R65777 wrote:

> >>>>  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)  {

> >>>> -    return !(v->arch.shared->msr & MSR_WE) ||

> >>>> -           !!(v->arch.pending_exceptions) ||

> >>>> -           v->requests;

> >>>> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||

> >>>> +           !!(v->arch.pending_exceptions) ||

> >>>> +           v->requests;

> >>>> +

> >>>> +    ret = ret || kvmppc_get_tsr_wrc(v);

> >>>

> >>> Why do you need to declare the cpu as non-runnable when a watchdog

> >>> event occured?

> >>

> >> It's the other way around -- it's always runnable when a watchdog

> >> exit is pending.  It's like a pending exception.

> >

> > With the above check, Are we trying to handle the case where watchdog

> > interrupt bit in pending_exception is cleared by guest after final

> > expiry but before the qemu exit?

> 

> No, we're just trying to test the actual condition we want to exit on.

> The watchdog interrupt might be masked (either with WIE or CE).


If the interrupt is masked then still the pending_exception will be set. But interrupt will not be delivered to guest and Final expiry can still cause reset etc !! Right?

> 

> > And we want that if TSR.WRS update

> > wins the race with clearing of watchdog interrupt condition from guest

> > then anyways let QEMU exit with reason KVM_EXIT_WDT?

> 

> What race?  If ENW and WIS are both set when the watchdog timer fires, it's a

> final expiration.  It's irrelevant what happens to WIS after that point, before

> enforcement kicks in.


What I was thinking of was if we can remove this check in vcpu_runnable() and use watchdog bit in pending_exception to qualify vcpu_runnable()

> 

> > What if we do not allow guest clear watchdog interrupt condition if

> > final expiry already happened?

> 

> What would that solve?


Removing  " ret || kvmppc_get_tsr_wrc(v);"  this from vcpu_runnable().

Thanks
-Bharat
> 

> -Scott
Scott Wood July 17, 2012, 5 p.m. UTC | #16
On 07/17/2012 11:56 AM, Bhushan Bharat-R65777 wrote:
> 
> 
>> -----Original Message-----
>> From: Wood Scott-B07421
>> Sent: Tuesday, July 17, 2012 10:08 PM
>> To: Bhushan Bharat-R65777
>> Cc: Wood Scott-B07421; Alexander Graf; kvm-ppc@vger.kernel.org;
>> kvm@vger.kernel.org; bharatb.yadav@gmail.com; Benjamin Herrenschmidt; Kumar Gala
>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>
>> On 07/17/2012 06:31 AM, Bhushan Bharat-R65777 wrote:
>>>>>>  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)  {
>>>>>> -    return !(v->arch.shared->msr & MSR_WE) ||
>>>>>> -           !!(v->arch.pending_exceptions) ||
>>>>>> -           v->requests;
>>>>>> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||
>>>>>> +           !!(v->arch.pending_exceptions) ||
>>>>>> +           v->requests;
>>>>>> +
>>>>>> +    ret = ret || kvmppc_get_tsr_wrc(v);
>>>>>
>>>>> Why do you need to declare the cpu as non-runnable when a watchdog
>>>>> event occured?
>>>>
>>>> It's the other way around -- it's always runnable when a watchdog
>>>> exit is pending.  It's like a pending exception.
>>>
>>> With the above check, Are we trying to handle the case where watchdog
>>> interrupt bit in pending_exception is cleared by guest after final
>>> expiry but before the qemu exit?
>>
>> No, we're just trying to test the actual condition we want to exit on.
>> The watchdog interrupt might be masked (either with WIE or CE).
> 
> If the interrupt is masked then still the pending_exception will be set. 

Not if it's masked by WIE -- and even when masked by CE, it's a bug that
we currently consider the vcpu runnable.  We shouldn't depend on that bug.

> But interrupt will not be delivered to guest and Final expiry can still cause reset etc !! Right?

Not sure what your point is -- yes, if the interrupt is masked it will
not be delivered to the guest, and the enforcement mechanism of the
watchdog is still active.

>>> And we want that if TSR.WRS update
>>> wins the race with clearing of watchdog interrupt condition from guest
>>> then anyways let QEMU exit with reason KVM_EXIT_WDT?
>>
>> What race?  If ENW and WIS are both set when the watchdog timer fires, it's a
>> final expiration.  It's irrelevant what happens to WIS after that point, before
>> enforcement kicks in.
> 
> What I was thinking of was if we can remove this check in vcpu_runnable() and use watchdog bit in pending_exception to qualify vcpu_runnable()

No, we want a new bit.

-Scott

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan July 17, 2012, 5:10 p.m. UTC | #17
> -----Original Message-----

> From: Wood Scott-B07421

> Sent: Tuesday, July 17, 2012 10:31 PM

> To: Bhushan Bharat-R65777

> Cc: Wood Scott-B07421; Alexander Graf; kvm-ppc@vger.kernel.org;

> kvm@vger.kernel.org; bharatb.yadav@gmail.com; Benjamin Herrenschmidt; Kumar Gala

> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation

> 

> On 07/17/2012 11:56 AM, Bhushan Bharat-R65777 wrote:

> >

> >

> >> -----Original Message-----

> >> From: Wood Scott-B07421

> >> Sent: Tuesday, July 17, 2012 10:08 PM

> >> To: Bhushan Bharat-R65777

> >> Cc: Wood Scott-B07421; Alexander Graf; kvm-ppc@vger.kernel.org;

> >> kvm@vger.kernel.org; bharatb.yadav@gmail.com; Benjamin Herrenschmidt;

> >> Kumar Gala

> >> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation

> >>

> >> On 07/17/2012 06:31 AM, Bhushan Bharat-R65777 wrote:

> >>>>>>  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)  {

> >>>>>> -    return !(v->arch.shared->msr & MSR_WE) ||

> >>>>>> -           !!(v->arch.pending_exceptions) ||

> >>>>>> -           v->requests;

> >>>>>> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||

> >>>>>> +           !!(v->arch.pending_exceptions) ||

> >>>>>> +           v->requests;

> >>>>>> +

> >>>>>> +    ret = ret || kvmppc_get_tsr_wrc(v);

> >>>>>

> >>>>> Why do you need to declare the cpu as non-runnable when a watchdog

> >>>>> event occured?

> >>>>

> >>>> It's the other way around -- it's always runnable when a watchdog

> >>>> exit is pending.  It's like a pending exception.

> >>>

> >>> With the above check, Are we trying to handle the case where

> >>> watchdog interrupt bit in pending_exception is cleared by guest

> >>> after final expiry but before the qemu exit?

> >>

> >> No, we're just trying to test the actual condition we want to exit on.

> >> The watchdog interrupt might be masked (either with WIE or CE).

> >

> > If the interrupt is masked then still the pending_exception will be set.

> 

> Not if it's masked by WIE -- and even when masked by CE, it's a bug that we

> currently consider the vcpu runnable.  We shouldn't depend on that bug.


Scott can you please describe what is bug? What I remember is that if vcpu is not run-able then we halt vcpu and cannot cause qemu exit also.

Thanks
-Bharat
Scott Wood July 17, 2012, 5:25 p.m. UTC | #18
On 07/17/2012 12:10 PM, Bhushan Bharat-R65777 wrote:
> 
> 
>> -----Original Message-----
>> From: Wood Scott-B07421
>> Sent: Tuesday, July 17, 2012 10:31 PM
>> To: Bhushan Bharat-R65777
>> Cc: Wood Scott-B07421; Alexander Graf; kvm-ppc@vger.kernel.org;
>> kvm@vger.kernel.org; bharatb.yadav@gmail.com; Benjamin Herrenschmidt; Kumar Gala
>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>
>> On 07/17/2012 11:56 AM, Bhushan Bharat-R65777 wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Wood Scott-B07421
>>>> Sent: Tuesday, July 17, 2012 10:08 PM
>>>> To: Bhushan Bharat-R65777
>>>> Cc: Wood Scott-B07421; Alexander Graf; kvm-ppc@vger.kernel.org;
>>>> kvm@vger.kernel.org; bharatb.yadav@gmail.com; Benjamin Herrenschmidt;
>>>> Kumar Gala
>>>> Subject: Re: [PATCH 2/2 v2] KVM: PPC: booke: Add watchdog emulation
>>>>
>>>> On 07/17/2012 06:31 AM, Bhushan Bharat-R65777 wrote:
>>>>>>>>  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)  {
>>>>>>>> -    return !(v->arch.shared->msr & MSR_WE) ||
>>>>>>>> -           !!(v->arch.pending_exceptions) ||
>>>>>>>> -           v->requests;
>>>>>>>> +    bool ret = !(v->arch.shared->msr & MSR_WE) ||
>>>>>>>> +           !!(v->arch.pending_exceptions) ||
>>>>>>>> +           v->requests;
>>>>>>>> +
>>>>>>>> +    ret = ret || kvmppc_get_tsr_wrc(v);
>>>>>>>
>>>>>>> Why do you need to declare the cpu as non-runnable when a watchdog
>>>>>>> event occured?
>>>>>>
>>>>>> It's the other way around -- it's always runnable when a watchdog
>>>>>> exit is pending.  It's like a pending exception.
>>>>>
>>>>> With the above check, Are we trying to handle the case where
>>>>> watchdog interrupt bit in pending_exception is cleared by guest
>>>>> after final expiry but before the qemu exit?
>>>>
>>>> No, we're just trying to test the actual condition we want to exit on.
>>>> The watchdog interrupt might be masked (either with WIE or CE).
>>>
>>> If the interrupt is masked then still the pending_exception will be set.
>>
>> Not if it's masked by WIE -- and even when masked by CE, it's a bug that we
>> currently consider the vcpu runnable.  We shouldn't depend on that bug.
> 
> Scott can you please describe what is bug?

If an interrupt is masked by EE, CE, ME, etc. it is still in
pending_exceptions, so runnable still returns true, and we can't go idle.

> What I remember is that if
> vcpu is not run-able then we halt vcpu and cannot cause qemu exit
> also.

I agree that we want to be considered runnable if we have a final
expiration with an action.  What I disagree with is using the same
pending_exceptions bit as is used for the ordinary watchdog interrupt.
They're not the same thing.

-Scott

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan July 17, 2012, 5:29 p.m. UTC | #19
> >>

> >> Not if it's masked by WIE -- and even when masked by CE, it's a bug

> >> that we currently consider the vcpu runnable.  We shouldn't depend on that

> bug.

> >

> > Scott can you please describe what is bug?

> 

> If an interrupt is masked by EE, CE, ME, etc. it is still in pending_exceptions,

> so runnable still returns true, and we can't go idle.


Hmm, ok.

> 

> > What I remember is that if

> > vcpu is not run-able then we halt vcpu and cannot cause qemu exit

> > also.

> 

> I agree that we want to be considered runnable if we have a final expiration

> with an action.  What I disagree with is using the same pending_exceptions bit

> as is used for the ordinary watchdog interrupt.

> They're not the same thing.


Now I agree with your disagreement :)

Thanks
-Bharat
Scott Wood July 17, 2012, 6 p.m. UTC | #20
On 07/17/2012 11:51 AM, Alexander Graf wrote:
> On 07/17/2012 06:27 PM, Scott Wood wrote:
>> Determining whether we're on final expiration is based only on the
>> previous value of TSR[ENW,WIS] when the timer expires.  We look at TCR
>> to determine whether we need to actually exit to QEMU (as opposed to a
>> final expiration with no action, which still should result in the timer
>> being stopped), but I don't see any problematic races there as long as
>> we don't try to update TCR from that context.
> 
> Yeah, what I was trying to say is that I would like to keep the TSR
> state unmodified for the final expiry case. If user space decides to not
> act upon it, it should be able to call into VCPU_RUN and have the same
> behavior as if the watchdog never expired.

I agree, it would be racy for QEMU to have to clear TSR if it's not
doing a full reset.  And that fact is making me wonder if I should have
listened to you and kept TSR owned by the vcpu thread. :-)

-Scott

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index f0e0c6a..7bbc6cd 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -446,6 +446,11 @@  static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 }
 #endif
 
+static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3			0x113724FA
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index b7cd335..e5b86c1 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -100,4 +100,9 @@  static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.shared->msr;
 }
+
+static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.tsr & TCR_WRC_MASK;
+}
 #endif /* __ASM_KVM_BOOKE_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 50ea12f..01047f4 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -467,6 +467,8 @@  struct kvm_vcpu_arch {
 	ulong fault_esr;
 	ulong queued_dear;
 	ulong queued_esr;
+	spinlock_t wdt_lock;
+	struct timer_list wdt_timer;
 	u32 tlbcfg[4];
 	u32 mmucfg;
 	u32 epr;
@@ -482,6 +484,7 @@  struct kvm_vcpu_arch {
 	u8 osi_needed;
 	u8 osi_enabled;
 	u8 papr_enabled;
+	u8 watchdog_enable;
 	u8 sane;
 	u8 cpu_type;
 	u8 hcall_needed;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937..e5cf4b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -67,6 +67,7 @@  extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(unsigned long data);
+extern void kvmppc_watchdog_func(unsigned long data);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
@@ -104,6 +105,8 @@  extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                          struct kvm_interrupt *irq);
+extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned int op, int *advance);
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4..e07e6af 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -539,6 +539,13 @@ 
 #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
 #define TCR_ARE		0x00400000	/* Auto Reload Enable */
 
+#ifdef CONFIG_E500
+#define TCR_GET_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
+			      (((tcr) & 0x1E0000) >> 15))
+#else
+#define TCR_GET_WP(tcr)  (((tcr) & 0xC0000000) >> 30)
+#endif
+
 /* Bit definitions for the TSR. */
 #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
 #define TSR_WIS		0x40000000	/* WDT Interrupt Status */
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 50e7dbc..a213aba 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -28,6 +28,7 @@ 
 #include <asm/kvm_44x.h>
 #include <asm/kvm_ppc.h>
 
+#include "booke.h"
 #include "44x_tlb.h"
 #include "booke.h"
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097..97f54ea 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -206,6 +206,16 @@  void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
+void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
+}
+
 static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -325,6 +335,7 @@  static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		msr_mask = MSR_CE | MSR_ME | MSR_DE;
 		int_class = INT_CLASS_NONCRIT;
 		break;
+	case BOOKE_IRQPRIO_WATCHDOG:
 	case BOOKE_IRQPRIO_CRITICAL:
 	case BOOKE_IRQPRIO_DBELL_CRIT:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
@@ -404,12 +415,101 @@  static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	return allowed;
 }
 
+/*
+ * Return the number of jiffies until the next timeout.  If the timeout is
+ * longer than the NEXT_TIMER_MAX_DELTA, that return NEXT_TIMER_MAX_DELTA
+ * instead.
+ */
+static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
+{
+	u64 tb, wdt_tb, wdt_ticks = 0;
+	u64 nr_jiffies = 0;
+	u32 period = TCR_GET_WP(vcpu->arch.tcr);
+
+	wdt_tb = 1ULL << (63 - period);
+	tb = get_tb();
+	/*
+	 * The watchdog timeout will hapeen when TB bit corresponding
+	 * to watchdog will toggle from 0 to 1.
+	 */
+	if (tb & wdt_tb)
+		wdt_ticks = wdt_tb;
+
+	wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
+
+	/* Convert timebase ticks to jiffies */
+	nr_jiffies = wdt_ticks;
+
+	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
+		nr_jiffies++;
+
+	return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
+}
+
+static void arm_next_watchdog(struct kvm_vcpu *vcpu)
+{
+	unsigned long nr_jiffies;
+
+	nr_jiffies = watchdog_next_timeout(vcpu);
+	spin_lock(&vcpu->arch.wdt_lock);
+	if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
+		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
+	else
+		del_timer(&vcpu->arch.wdt_timer);
+	spin_unlock(&vcpu->arch.wdt_lock);
+}
+
+void kvmppc_watchdog_func(unsigned long data)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+	u32 tsr, new_tsr;
+	int final;
+
+	do {
+		new_tsr = tsr = vcpu->arch.tsr;
+		final = 0;
+
+		/* Time out event */
+		if (tsr & TSR_ENW) {
+			if (tsr & TSR_WIS) {
+				new_tsr = (tsr & ~TCR_WRC_MASK) |
+					  (vcpu->arch.tcr & TCR_WRC_MASK);
+				vcpu->arch.tcr &= ~TCR_WRC_MASK;
+				final = 1;
+			} else {
+				new_tsr = tsr | TSR_WIS;
+			}
+		} else {
+			new_tsr = tsr | TSR_ENW;
+		}
+	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
+
+	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * Stop running the watchdog timer after final expiration to
+	 * prevent the host from being flooded with timers if the
+	 * guest sets a short period.
+	 * Timers will resume when TSR/TCR is updated next time.
+	 */
+	if (!final)
+		arm_next_watchdog(vcpu);
+}
 static void update_timer_ints(struct kvm_vcpu *vcpu)
 {
 	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
 		kvmppc_core_queue_dec(vcpu);
 	else
 		kvmppc_core_dequeue_dec(vcpu);
+
+	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
+		kvmppc_core_queue_watchdog(vcpu);
+	else
+		kvmppc_core_dequeue_watchdog(vcpu);
 }
 
 static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
@@ -516,6 +616,12 @@  int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
+	if (vcpu->arch.tsr & TCR_WRC_MASK) {
+		kvm_run->exit_reason = KVM_EXIT_WDT;
+		ret = 0;
+		goto out;
+	}
+
 	kvm_guest_enter();
 
 #ifdef CONFIG_PPC_FPU
@@ -977,6 +1083,10 @@  int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
 		}
 	}
+	if (vcpu->arch.tsr & TCR_WRC_MASK) {
+		run->exit_reason = KVM_EXIT_WDT;
+		r = RESUME_HOST | (r & RESUME_FLAG_NV);
+	}
 
 	return r;
 }
@@ -1106,7 +1216,14 @@  static int set_sregs_base(struct kvm_vcpu *vcpu,
 	}
 
 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
+		u32 old_tsr = vcpu->arch.tsr;
+
 		vcpu->arch.tsr = sregs->u.e.tsr;
+
+		if (vcpu->arch.watchdog_enable && ((old_tsr ^ vcpu->arch.tsr) &
+		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK)))
+			arm_next_watchdog(vcpu);
+
 		update_timer_ints(vcpu);
 	}
 
@@ -1267,6 +1384,8 @@  void kvmppc_core_commit_memory_region(struct kvm *kvm,
 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
 {
 	vcpu->arch.tcr = new_tcr;
+	if (vcpu->arch.watchdog_enable)
+		arm_next_watchdog(vcpu);
 	update_timer_ints(vcpu);
 }
 
@@ -1281,6 +1400,15 @@  void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 {
 	clear_bits(tsr_bits, &vcpu->arch.tsr);
+
+	/*
+	 * We may have stopped the watchdog due to
+	 * being stuck on final expiration.
+	 */
+	if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW |
+			TSR_WIS | TCR_WRC_MASK)))
+		arm_next_watchdog(vcpu);
+
 	update_timer_ints(vcpu);
 }
 
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 12834bb..5a66ade 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -145,6 +145,14 @@  int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		kvmppc_clr_tsr_bits(vcpu, spr_val);
 		break;
 	case SPRN_TCR:
+		/*
+		 * WRC is a 2-bit field that is supposed to preserve its
+		 * value once written to non-zero.
+		 */
+		if (vcpu->arch.tcr & TCR_WRC_MASK) {
+			spr_val &= ~TCR_WRC_MASK;
+			spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
+		}
 		kvmppc_set_tcr(vcpu, spr_val);
 		break;
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 87f4dc8..646c4da 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,9 +38,13 @@ 
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !(v->arch.shared->msr & MSR_WE) ||
-	       !!(v->arch.pending_exceptions) ||
-	       v->requests;
+	bool ret = !(v->arch.shared->msr & MSR_WE) ||
+		   !!(v->arch.pending_exceptions) ||
+		   v->requests;
+
+	ret = ret || kvmppc_get_tsr_wrc(v);
+
+	return ret;
 }
 
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
@@ -237,6 +241,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_GET_PVINFO:
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB:
+	case KVM_CAP_PPC_WDT:
 #endif
 		r = 1;
 		break;
@@ -386,13 +391,21 @@  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_KVM_EXIT_TIMING
 	mutex_init(&vcpu->arch.exit_timing_lock);
 #endif
-
+#ifdef CONFIG_BOOKE
+	spin_lock_init(&vcpu->arch.wdt_lock);
+#endif
 	return 0;
 }
 
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	kvmppc_mmu_destroy(vcpu);
+#ifdef CONFIG_BOOKE
+	spin_lock(&vcpu->arch.wdt_lock);
+	if (vcpu->arch.watchdog_enable)
+		del_timer(&vcpu->arch.wdt_timer);
+	spin_unlock(&vcpu->arch.wdt_lock);
+#endif
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -637,6 +650,16 @@  static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		break;
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_WDT:
+		r = 0;
+		/* setup watchdog timer once */
+		if (!vcpu->arch.watchdog_enable)
+			setup_timer(&vcpu->arch.wdt_timer,
+				    kvmppc_watchdog_func, (unsigned long)vcpu);
+		vcpu->arch.watchdog_enable = true;
+		break;
+#endif
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB: {
 		struct kvm_config_tlb cfg;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa..b9fdb52 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -163,6 +163,7 @@  struct kvm_pit_config {
 #define KVM_EXIT_OSI              18
 #define KVM_EXIT_PAPR_HCALL	  19
 #define KVM_EXIT_S390_UCONTROL	  20
+#define KVM_EXIT_WDT              21
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -618,6 +619,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_PPC_WDT	81
 
 #ifdef KVM_CAP_IRQ_ROUTING