Patchwork [2/2] KVM: PPC: booke: Add watchdog emulation

login
register
mail settings
Submitter Bharat Bhushan
Date June 28, 2012, 6:17 a.m.
Message ID <1340864262-31651-1-git-send-email-Bharat.Bhushan@freescale.com>
Download mbox | patch
Permalink /patch/167798/
State New
Headers show

Comments

Bharat Bhushan - June 28, 2012, 6:17 a.m.
This patch adds watchdog emulation to KVM. The watchdog
emulation is enabled by the KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
A kernel timer is used for the watchdog emulation and emulates the
h/w watchdog state machine. On watchdog timer expiry, it exits to QEMU
if TCR.WRC is non-zero. QEMU can then reset/shutdown etc. depending upon how
it is configured.

Signed-off-by: Liu Yu <yu.liu@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
 arch/powerpc/include/asm/kvm_host.h  |    2 +
 arch/powerpc/include/asm/kvm_ppc.h   |    3 +
 arch/powerpc/include/asm/reg_booke.h |    2 +
 arch/powerpc/kvm/44x.c               |    1 +
 arch/powerpc/kvm/booke.c             |  125 ++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/booke_emulate.c     |    6 ++-
 arch/powerpc/kvm/powerpc.c           |   25 ++++++-
 include/linux/kvm.h                  |    2 +
 8 files changed, 162 insertions(+), 4 deletions(-)
Alexander Graf - July 6, 2012, 1:17 p.m.
On 28.06.2012, at 08:17, Bharat Bhushan wrote:

> This patch adds the watchdog emulation in KVM. The watchdog
> emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
> The kernel timer are used for watchdog emulation and emulates
> h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
> if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
> it is configured.
> 
> Signed-off-by: Liu Yu <yu.liu@freescale.com>
> Signed-off-by: Scott Wood <scottwood@freescale.com>
> Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
> ---
> arch/powerpc/include/asm/kvm_host.h  |    2 +
> arch/powerpc/include/asm/kvm_ppc.h   |    3 +
> arch/powerpc/include/asm/reg_booke.h |    2 +
> arch/powerpc/kvm/44x.c               |    1 +
> arch/powerpc/kvm/booke.c             |  125 ++++++++++++++++++++++++++++++++++
> arch/powerpc/kvm/booke_emulate.c     |    6 ++-
> arch/powerpc/kvm/powerpc.c           |   25 ++++++-
> include/linux/kvm.h                  |    2 +
> 8 files changed, 162 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 50ea12f..a9e5aed 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -467,6 +467,7 @@ struct kvm_vcpu_arch {
> 	ulong fault_esr;
> 	ulong queued_dear;
> 	ulong queued_esr;
> +	struct timer_list wdt_timer;
> 	u32 tlbcfg[4];
> 	u32 mmucfg;
> 	u32 epr;
> @@ -482,6 +483,7 @@ struct kvm_vcpu_arch {
> 	u8 osi_needed;
> 	u8 osi_enabled;
> 	u8 papr_enabled;
> +	u8 watchdog_enable;
> 	u8 sane;
> 	u8 cpu_type;
> 	u8 hcall_needed;
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 0124937..e5cf4b9 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
> extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
> extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
> extern void kvmppc_decrementer_func(unsigned long data);
> +extern void kvmppc_watchdog_func(unsigned long data);
> extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
> 
> /* Core-specific hooks */
> @@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
>                                        struct kvm_interrupt *irq);
> extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
>                                          struct kvm_interrupt *irq);
> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
> +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
> 
> extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                                   unsigned int op, int *advance);
> diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
> index 2d916c4..7efb3f5 100644
> --- a/arch/powerpc/include/asm/reg_booke.h
> +++ b/arch/powerpc/include/asm/reg_booke.h
> @@ -538,6 +538,8 @@
> #define FP_2_21		3		/* 2^21 clocks */
> #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
> #define TCR_ARE		0x00400000	/* Auto Reload Enable */
> +#define TCR_GET_FSL_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
> +			      (((tcr) & 0x1E0000) >> 15))
> 
> /* Bit definitions for the TSR. */
> #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
> diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
> index 50e7dbc..a213aba 100644
> --- a/arch/powerpc/kvm/44x.c
> +++ b/arch/powerpc/kvm/44x.c
> @@ -28,6 +28,7 @@
> #include <asm/kvm_44x.h>
> #include <asm/kvm_ppc.h>
> 
> +#include "booke.h"
> #include "44x_tlb.h"
> #include "booke.h"
> 
> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
> index d25a097..ad30f75 100644
> --- a/arch/powerpc/kvm/booke.c
> +++ b/arch/powerpc/kvm/booke.c
> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
> 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
> }
> 
> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
> +{
> +	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
> +}
> +
> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
> +{
> +	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
> +}
> +
> static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
> {
> #ifdef CONFIG_KVM_BOOKE_HV
> @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
> 		msr_mask = MSR_CE | MSR_ME | MSR_DE;
> 		int_class = INT_CLASS_NONCRIT;
> 		break;
> +	case BOOKE_IRQPRIO_WATCHDOG:
> 	case BOOKE_IRQPRIO_CRITICAL:
> 	case BOOKE_IRQPRIO_DBELL_CRIT:
> 		allowed = vcpu->arch.shared->msr & MSR_CE;
> @@ -404,12 +415,98 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
> 	return allowed;
> }
> 
> +/*
> + * The timer system can almost deal with LONG_MAX timeouts, except that
> + * when you get very close to LONG_MAX, the slack added can cause overflow.
> + *
> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
> + * any realistic use.
> + */
> +#define MAX_TIMEOUT (LONG_MAX/2)

Should this really be in kvm code?

> +
> +/*
> + * Return the number of jiffies until the next timeout.  If the timeout is
> + * longer than the MAX_TIMEOUT, that we return MAX_TIMEOUT instead.
> + */
> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long long tb, mask, nr_jiffies = 0;

u64?

> +	u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);

Doesn't sound like something booke generic to me.

> +
> +	mask = 1ULL << (63 - period);
> +	tb = get_tb();
> +	if (tb & mask)
> +		nr_jiffies += mask;

To be honest, you lost me here. nr_jiffies is jiffies, right? While mask is basically in timebase granularity. I suppose you're just reusing the variable here for no good reason - the compiler will gladly optimize things for you if you write things a bit more verbose :).

Please make this function more readable. With comments if needed.

> +
> +	nr_jiffies += mask - (tb & (mask - 1));
> +
> +	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
> +		nr_jiffies++;
> +
> +	return min_t(unsigned long long, nr_jiffies, MAX_TIMEOUT);
> +}
> +
> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long nr_jiffies;
> +
> +	nr_jiffies = watchdog_next_timeout(vcpu);
> +	if (nr_jiffies < MAX_TIMEOUT)
> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
> +	else
> +		del_timer(&vcpu->arch.wdt_timer);

Can you del a timer that's not armed? Could that ever happen in this case?

Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?

> +}
> +
> +void kvmppc_watchdog_func(unsigned long data)
> +{
> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> +	u32 tsr, new_tsr;
> +	int final;
> +
> +	do {
> +		new_tsr = tsr = vcpu->arch.tsr;
> +		final = 0;
> +
> +		/* Time out event */
> +		if (tsr & TSR_ENW) {
> +			if (tsr & TSR_WIS) {
> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
> +				final = 1;
> +			} else {
> +				new_tsr = tsr | TSR_WIS;
> +			}
> +		} else {
> +			new_tsr = tsr | TSR_ENW;
> +		}
> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
> +
> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
> +		smp_wmb();
> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
> +		kvm_vcpu_kick(vcpu);
> +	}
> +
> +	/*
> +	 * Avoid getting a storm of timers if the guest sets
> +	 * the period very short.  We'll restart it if anything
> +	 * changes.
> +	 */
> +	if (!final)
> +		arm_next_watchdog(vcpu);

Mind to explain this part a bit further?

> +}
> static void update_timer_ints(struct kvm_vcpu *vcpu)
> {
> 	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
> 		kvmppc_core_queue_dec(vcpu);
> 	else
> 		kvmppc_core_dequeue_dec(vcpu);
> +
> +	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
> +		kvmppc_core_queue_watchdog(vcpu);
> +	else
> +		kvmppc_core_dequeue_watchdog(vcpu);
> }
> 
> static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
> @@ -516,6 +613,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
> 		goto out;
> 	}
> 
> +	if (vcpu->arch.tsr & TCR_WRC_MASK) {
> +		kvm_run->exit_reason = KVM_EXIT_WDT;
> +		ret = 0;
> +		goto out;
> +	}
> +
> 	kvm_guest_enter();
> 
> #ifdef CONFIG_PPC_FPU
> @@ -977,6 +1080,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
> 			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
> 		}
> 	}
> +	if (vcpu->arch.tsr & TCR_WRC_MASK) {
> +		run->exit_reason = KVM_EXIT_WDT;
> +		r = RESUME_HOST | (r & RESUME_FLAG_NV);
> +	}
> 
> 	return r;
> }
> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
> 	}
> 
> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
> +		u32 old_tsr = vcpu->arch.tsr;
> +
> 		vcpu->arch.tsr = sregs->u.e.tsr;
> +
> +		if ((old_tsr ^ vcpu->arch.tsr) &
> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
> +			arm_next_watchdog(vcpu);

Why isn't this one guarded by vcpu->arch.watchdog_enable?

> +
> 		update_timer_ints(vcpu);
> 	}
> 
> @@ -1267,6 +1381,8 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
> void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
> {
> 	vcpu->arch.tcr = new_tcr;
> +	if (vcpu->arch.watchdog_enable)
> +		arm_next_watchdog(vcpu);
> 	update_timer_ints(vcpu);
> }
> 
> @@ -1281,6 +1397,15 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
> void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
> {
> 	clear_bits(tsr_bits, &vcpu->arch.tsr);
> +
> +	/*
> +	 * We may have stopped the watchdog due to
> +	 * being stuck on final expiration.
> +	 */
> +	if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW |
> +			TSR_WIS | TCR_WRC_MASK)))
> +		arm_next_watchdog(vcpu);
> +
> 	update_timer_ints(vcpu);
> }
> 
> diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
> index 12834bb..cc94d42 100644
> --- a/arch/powerpc/kvm/booke_emulate.c
> +++ b/arch/powerpc/kvm/booke_emulate.c
> @@ -145,7 +145,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
> 		kvmppc_clr_tsr_bits(vcpu, spr_val);
> 		break;
> 	case SPRN_TCR:
> -		kvmppc_set_tcr(vcpu, spr_val);
> +		/* WRC can only be programmed when WRC=0 */
> +		if (TCR_WRC_MASK & vcpu->arch.tcr)

Please reverse the order.

> +			spr_val &= ~TCR_WRC_MASK;
> +		kvmppc_set_tcr(vcpu,
> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));

In fact, what you're trying to do here is keep TCR_WRC always on when it was enabled once. So all you need is the OR here. No need for the mask above.

> 		break;
> 
> 	case SPRN_DECAR:
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 87f4dc8..35a1ff3 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -38,9 +38,15 @@
> 
> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> {
> -	return !(v->arch.shared->msr & MSR_WE) ||
> -	       !!(v->arch.pending_exceptions) ||
> -	       v->requests;
> +	bool ret = !(v->arch.shared->msr & MSR_WE) ||
> +		   !!(v->arch.pending_exceptions) ||
> +		   v->requests;
> +
> +#ifdef CONFIG_BOOKE
> +	ret = ret || (v->arch.tsr & TCR_WRC_MASK);

Please make this a callback. In a header if you think it's performance critical, but I don't want to see #ifdef CONFIG_BOOKE too often in powerpc.c.

> +#endif
> +
> +	return ret;
> }
> 
> int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
> @@ -237,6 +243,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> 	case KVM_CAP_PPC_GET_PVINFO:
> #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
> 	case KVM_CAP_SW_TLB:
> +	case KVM_CAP_PPC_WDT:
> #endif
> 		r = 1;
> 		break;
> @@ -393,6 +400,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> {
> 	kvmppc_mmu_destroy(vcpu);
> +#ifdef CONFIG_BOOKE
> +	if (vcpu->arch.watchdog_enable)
> +		del_timer(&vcpu->arch.wdt_timer);
> +#endif
> }
> 
> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> @@ -637,6 +648,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
> 		r = 0;
> 		vcpu->arch.papr_enabled = true;
> 		break;
> +#ifdef CONFIG_BOOKE
> +	case KVM_CAP_PPC_WDT:
> +		r = 0;
> +		vcpu->arch.watchdog_enable = true;
> +		setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
> +		            (unsigned long)vcpu);

Shouldn't we guard against user space calling this twice?


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood - July 6, 2012, 11:37 p.m.
On 07/06/2012 08:17 AM, Alexander Graf wrote:
> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>> +/*
>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>> + *
>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>> + * any realistic use.
>> + */
>> +#define MAX_TIMEOUT (LONG_MAX/2)
> 
> Should this really be in kvm code?

It looks like we can use NEXT_TIMER_MAX_DELTA for this.

>> +	mask = 1ULL << (63 - period);
>> +	tb = get_tb();
>> +	if (tb & mask)
>> +		nr_jiffies += mask;
> 
> To be honest, you lost me here. nr_jiffies is jiffies, right? While
> mask is basically in timebase granularity. I suppose you're just
> reusing the variable here for no good reason - the compiler will
> gladly optimize things for you if you write things a bit more verbose
> :).

Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
something generic like "ticks", "interval", "remaining", etc. would be
better, with a comment on the do_div saying it's converting timebase
ticks into jiffies.

>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>> +{
>> +	unsigned long nr_jiffies;
>> +
>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>> +	if (nr_jiffies < MAX_TIMEOUT)
>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>> +	else
>> +		del_timer(&vcpu->arch.wdt_timer);
> 
> Can you del a timer that's not armed? Could that ever happen in this case?

"del_timer() deactivates a timer - this works on both active and
inactive timers."

> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?

This can be called in the context of the timer, so del_timer_sync()
would hang.

As for what would happen if a caller from a different context were to
race with a timer, I think you could end up with the timer armed based
on an old TCR.  del_timer_sync() won't help though, unless you replace
mod_timer() with del_timer_sync() plus add_timer() (with a check to see
whether it's running in timer context).  A better solution is probably
to use a spinlock in arm_next_watchdog().

>> +void kvmppc_watchdog_func(unsigned long data)
>> +{
>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>> +	u32 tsr, new_tsr;
>> +	int final;
>> +
>> +	do {
>> +		new_tsr = tsr = vcpu->arch.tsr;
>> +		final = 0;
>> +
>> +		/* Time out event */
>> +		if (tsr & TSR_ENW) {
>> +			if (tsr & TSR_WIS) {
>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>> +				final = 1;
>> +			} else {
>> +				new_tsr = tsr | TSR_WIS;
>> +			}
>> +		} else {
>> +			new_tsr = tsr | TSR_ENW;
>> +		}
>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>> +
>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>> +		smp_wmb();
>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>> +		kvm_vcpu_kick(vcpu);
>> +	}
>> +
>> +	/*
>> +	 * Avoid getting a storm of timers if the guest sets
>> +	 * the period very short.  We'll restart it if anything
>> +	 * changes.
>> +	 */
>> +	if (!final)
>> +		arm_next_watchdog(vcpu);
> 
> Mind to explain this part a bit further?

The whole function, or some subset near the end?

The "if (!final)" check means that we stop running the timer after final
expiration, to prevent the host from being flooded with timers if the
guest sets a short period but does not have TCR set to exit to QEMU.
Timers will resume the next time TSR/TCR is updated.

>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>> 	}
>>
>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>> +		u32 old_tsr = vcpu->arch.tsr;
>> +
>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>> +
>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>> +			arm_next_watchdog(vcpu);
> 
> Why isn't this one guarded by vcpu->arch.watchdog_enable?

I'm not sure that any of them should be -- there's no reason for the
watchdog interrupt mechanism to be dependent on QEMU, only the
heavyweight exit on final expiration.

>> +			spr_val &= ~TCR_WRC_MASK;
>> +		kvmppc_set_tcr(vcpu,
>> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
> 
> In fact, what you're trying to do here is keep TCR_WRC always on when it was enabled once. So all you need is the OR here. No need for the mask above.

WRC is a 2-bit field that is supposed to preserve its value once written
to be non-zero.  Not that we actually do anything different based on the
specific non-zero value, but still we should implement the architected
semantics.

-Scott

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf - July 7, 2012, 7:50 a.m.
On 07.07.2012, at 01:37, Scott Wood wrote:

> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>> +/*
>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>> + *
>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>> + * any realistic use.
>>> + */
>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>> 
>> Should this really be in kvm code?
> 
> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
> 
>>> +	mask = 1ULL << (63 - period);
>>> +	tb = get_tb();
>>> +	if (tb & mask)
>>> +		nr_jiffies += mask;
>> 
>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>> mask is basically in timebase granularity. I suppose you're just
>> reusing the variable here for no good reason - the compiler will
>> gladly optimize things for you if you write things a bit more verbose
>> :).
> 
> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
> something generic like "ticks", "interval", "remaining", etc. would be
> better, with a comment on the do_div saying it's converting timebase
> ticks into jiffies.

Well, you could start off with a variable "delta_tb", then do

  nr_jiffies = delta_tb;
  x = do_div(...);

and things would suddenly become readable :). Of course I don't object to comments along the code either :).

> 
>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>> +{
>>> +	unsigned long nr_jiffies;
>>> +
>>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>>> +	if (nr_jiffies < MAX_TIMEOUT)
>>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>> +	else
>>> +		del_timer(&vcpu->arch.wdt_timer);
>> 
>> Can you del a timer that's not armed? Could that ever happen in this case?
> 
> "del_timer() deactivates a timer - this works on both active and
> inactive timers."

Ah, good :).

> 
>> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?
> 
> This can be called in the context of the timer, so del_timer_sync()
> would hang.
> 
> As for what would happen if a caller from a different context were to
> race with a timer, I think you could end up with the timer armed based
> on an old TCR.  del_timer_sync() won't help though, unless you replace
> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
> whether it's running in timer context).  A better solution is probably
> to use a spinlock in arm_next_watchdog().

Yup. Either way, we have a race that the guest might not expect.

> 
>>> +void kvmppc_watchdog_func(unsigned long data)
>>> +{
>>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>> +	u32 tsr, new_tsr;
>>> +	int final;
>>> +
>>> +	do {
>>> +		new_tsr = tsr = vcpu->arch.tsr;
>>> +		final = 0;
>>> +
>>> +		/* Time out event */
>>> +		if (tsr & TSR_ENW) {
>>> +			if (tsr & TSR_WIS) {
>>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>> +				final = 1;
>>> +			} else {
>>> +				new_tsr = tsr | TSR_WIS;
>>> +			}
>>> +		} else {
>>> +			new_tsr = tsr | TSR_ENW;
>>> +		}
>>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>> +
>>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>> +		smp_wmb();
>>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>> +		kvm_vcpu_kick(vcpu);
>>> +	}
>>> +
>>> +	/*
>>> +	 * Avoid getting a storm of timers if the guest sets
>>> +	 * the period very short.  We'll restart it if anything
>>> +	 * changes.
>>> +	 */
>>> +	if (!final)
>>> +		arm_next_watchdog(vcpu);
>> 
>> Mind to explain this part a bit further?
> 
> The whole function, or some subset near the end?
> 
> The "if (!final)" check means that we stop running the timer after final
> expiration, to prevent the host from being flooded with timers if the
> guest sets a short period but does not have TCR set to exit to QEMU.
> Timers will resume the next time TSR/TCR is updated.

Ah. The semantics make sense. The comment however is slightly too short. Please explain this in a more verbose way, so someone who didn't write the code knows what's going on :).

> 
>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>> 	}
>>> 
>>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>> +		u32 old_tsr = vcpu->arch.tsr;
>>> +
>>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>>> +
>>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>> +			arm_next_watchdog(vcpu);
>> 
>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
> 
> I'm not sure that any of them should be -- there's no reason for the
> watchdog interrupt mechanism to be dependent on QEMU, only the
> heavyweight exit on final expiration.

Well - I like the concept of having new features switchable. Overlapping the "watchdog is implemented" feature with "user space wants watchdog exits" makes sense. But I definitely want to have a switch for the former, because we otherwise differ quite substantially from the emulation we had before.

> 
>>> +			spr_val &= ~TCR_WRC_MASK;
>>> +		kvmppc_set_tcr(vcpu,
>>> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
>> 
>> In fact, what you're trying to do here is keep TCR_WRC always on when it was enabled once. So all you need is the OR here. No need for the mask above.
> 
> WRC is a 2-bit field that is supposed to preserve its value once written
> to be non-zero.  Not that we actually do anything different based on the
> specific non-zero value, but still we should implement the architected
> semantics.

Ah, being 2 bits wide, the above code suddenly makes more sense :). How about

/* WRC is a 2-bit field that is supposed to preserve its value once written to be non-zero */
spr_val &= ~TCR_WRC_MASK;
spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
kvmppc_set_tcr(vcpu, spr_val);


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan - July 9, 2012, 5:13 a.m.
> -----Original Message-----
> From: Alexander Graf [mailto:agraf@suse.de]
> Sent: Saturday, July 07, 2012 1:21 PM
> To: Wood Scott-B07421
> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
> Bharat-R65777
> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
> 
> 
> On 07.07.2012, at 01:37, Scott Wood wrote:
> 
> > On 07/06/2012 08:17 AM, Alexander Graf wrote:
> >> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
> >>> +/*
> >>> + * The timer system can almost deal with LONG_MAX timeouts, except that
> >>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
> >>> + *
> >>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
> >>> + * any realistic use.
> >>> + */
> >>> +#define MAX_TIMEOUT (LONG_MAX/2)
> >>
> >> Should this really be in kvm code?
> >
> > It looks like we can use NEXT_TIMER_MAX_DELTA for this.
> >
> >>> +	mask = 1ULL << (63 - period);
> >>> +	tb = get_tb();
> >>> +	if (tb & mask)
> >>> +		nr_jiffies += mask;
> >>
> >> To be honest, you lost me here. nr_jiffies is jiffies, right? While
> >> mask is basically in timebase granularity. I suppose you're just
> >> reusing the variable here for no good reason - the compiler will
> >> gladly optimize things for you if you write things a bit more verbose
> >> :).
> >
> > Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
> > something generic like "ticks", "interval", "remaining", etc. would be
> > better, with a comment on the do_div saying it's converting timebase
> > ticks into jiffies.
> 
> Well, you could start off with a variable "delta_tb", then do
> 
>   nr_jiffies = delta_tb;
>   x = do_div(...);
> 
> and things would suddenly become readable :). Of course I don't object to
> comments along the code either :).

Ok

> 
> >
> >>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
> >>> +{
> >>> +	unsigned long nr_jiffies;
> >>> +
> >>> +	nr_jiffies = watchdog_next_timeout(vcpu);
> >>> +	if (nr_jiffies < MAX_TIMEOUT)
> >>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
> >>> +	else
> >>> +		del_timer(&vcpu->arch.wdt_timer);
> >>
> >> Can you del a timer that's not armed? Could that ever happen in this case?
> >
> > "del_timer() deactivates a timer - this works on both active and
> > inactive timers."
> 
> Ah, good :).
> 
> >
> >> Also, could the timer possibly be running somewhere, so do we need
> del_timer_sync? Or don't we need to care?
> >
> > This can be called in the context of the timer, so del_timer_sync()
> > would hang.
> >
> > As for what would happen if a caller from a different context were to
> > race with a timer, I think you could end up with the timer armed based
> > on an old TCR.  del_timer_sync() won't help though, unless you replace
> > mod_timer() with del_timer_sync() plus add_timer() (with a check to see
> > whether it's running in timer context).  A better solution is probably
> > to use a spinlock in arm_next_watchdog().
> 
> Yup. Either way, we have a race that the guest might not expect.

Ok, will use spinlock in arm_next_watchdog().

> 
> >
> >>> +void kvmppc_watchdog_func(unsigned long data)
> >>> +{
> >>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> >>> +	u32 tsr, new_tsr;
> >>> +	int final;
> >>> +
> >>> +	do {
> >>> +		new_tsr = tsr = vcpu->arch.tsr;
> >>> +		final = 0;
> >>> +
> >>> +		/* Time out event */
> >>> +		if (tsr & TSR_ENW) {
> >>> +			if (tsr & TSR_WIS) {
> >>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
> >>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
> >>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
> >>> +				final = 1;
> >>> +			} else {
> >>> +				new_tsr = tsr | TSR_WIS;
> >>> +			}
> >>> +		} else {
> >>> +			new_tsr = tsr | TSR_ENW;
> >>> +		}
> >>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
> >>> +
> >>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
> >>> +		smp_wmb();
> >>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
> >>> +		kvm_vcpu_kick(vcpu);
> >>> +	}
> >>> +
> >>> +	/*
> >>> +	 * Avoid getting a storm of timers if the guest sets
> >>> +	 * the period very short.  We'll restart it if anything
> >>> +	 * changes.
> >>> +	 */
> >>> +	if (!final)
> >>> +		arm_next_watchdog(vcpu);
> >>
> >> Mind to explain this part a bit further?
> >
> > The whole function, or some subset near the end?
> >
> > The "if (!final)" check means that we stop running the timer after final
> > expiration, to prevent the host from being flooded with timers if the
> > guest sets a short period but does not have TCR set to exit to QEMU.
> > Timers will resume the next time TSR/TCR is updated.
> 
> Ah. The semantics make sense. The comment however is slightly too short. Please
> explain this in a more verbose way, so someone who didn't write the code knows
> what's going on :).

Ok.

> 
> >
> >>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
> >>> 	}
> >>>
> >>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
> >>> +		u32 old_tsr = vcpu->arch.tsr;
> >>> +
> >>> 		vcpu->arch.tsr = sregs->u.e.tsr;
> >>> +
> >>> +		if ((old_tsr ^ vcpu->arch.tsr) &
> >>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
> >>> +			arm_next_watchdog(vcpu);
> >>
> >> Why isn't this one guarded by vcpu->arch.watchdog_enable?
> >
> > I'm not sure that any of them should be -- there's no reason for the
> > watchdog interrupt mechanism to be dependent on QEMU, only the
> > heavyweight exit on final expiration.
> 
> Well - I like the concept of having new features switchable. Overlapping the
> "watchdog is implemented" feature with "user space wants watchdog exits" makes
> sense. But I definitely want to have a switch for the former, because we
> otherwise differ quite substantially from the emulation we had before.

Ok, will guard with watchdog_enable.

> 
> >
> >>> +			spr_val &= ~TCR_WRC_MASK;
> >>> +		kvmppc_set_tcr(vcpu,
> >>> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
> >>
> >> In fact, what you're trying to do here is keep TCR_WRC always on when it was
> enabled once. So all you need is the OR here. No need for the mask above.
> >
> > WRC is a 2-bit field that is supposed to preserve its value once written
> > to be non-zero.  Not that we actually do anything different based on the
> > specific non-zero value, but still we should implement the architected
> > semantics.
> 
> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
> 
> /* WRC is a 2-bit field that is supposed to preserve its value once written to
> be non-zero */
> spr_val &= ~TCR_WRC_MASK;
> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> kvmppc_set_tcr(vcpu, spr_val);

I think you mean:

if (TCR_WRC_MASK & vcpu->arch.tcr) {
    spr_val &= ~TCR_WRC_MASK;
    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
}
kvmppc_set_tcr(vcpu, spr_val);

Thanks
-Bharat

> 
> 
> Alex
> 


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf - July 9, 2012, 8:49 a.m.
On 09.07.2012, at 07:13, Bhushan Bharat-R65777 <R65777@freescale.com> wrote:

> 
> 
>> -----Original Message-----
>> From: Alexander Graf [mailto:agraf@suse.de]
>> Sent: Saturday, July 07, 2012 1:21 PM
>> To: Wood Scott-B07421
>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
>> Bharat-R65777
>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
>> 
>> 
>> On 07.07.2012, at 01:37, Scott Wood wrote:
>> 
>>> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>>>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>>>> +/*
>>>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>>>> + *
>>>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>>>> + * any realistic use.
>>>>> + */
>>>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>>>> 
>>>> Should this really be in kvm code?
>>> 
>>> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
>>> 
>>>>> +    mask = 1ULL << (63 - period);
>>>>> +    tb = get_tb();
>>>>> +    if (tb & mask)
>>>>> +        nr_jiffies += mask;
>>>> 
>>>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>>>> mask is basically in timebase granularity. I suppose you're just
>>>> reusing the variable here for no good reason - the compiler will
>>>> gladly optimize things for you if you write things a bit more verbose
>>>> :).
>>> 
>>> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
>>> something generic like "ticks", "interval", "remaining", etc. would be
>>> better, with a comment on the do_div saying it's converting timebase
>>> ticks into jiffies.
>> 
>> Well, you could start off with a variable "delta_tb", then do
>> 
>>  nr_jiffies = delta_tb;
>>  x = do_div(...);
>> 
>> and things would suddenly become readable :). Of course I don't object to
>> comments along the code either :).
> 
> Ok
> 
>> 
>>> 
>>>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +    unsigned long nr_jiffies;
>>>>> +
>>>>> +    nr_jiffies = watchdog_next_timeout(vcpu);
>>>>> +    if (nr_jiffies < MAX_TIMEOUT)
>>>>> +        mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>>>> +    else
>>>>> +        del_timer(&vcpu->arch.wdt_timer);
>>>> 
>>>> Can you del a timer that's not armed? Could that ever happen in this case?
>>> 
>>> "del_timer() deactivates a timer - this works on both active and
>>> inactive timers."
>> 
>> Ah, good :).
>> 
>>> 
>>>> Also, could the timer possibly be running somewhere, so do we need
>> del_timer_sync? Or don't we need to care?
>>> 
>>> This can be called in the context of the timer, so del_timer_sync()
>>> would hang.
>>> 
>>> As for what would happen if a caller from a different context were to
>>> race with a timer, I think you could end up with the timer armed based
>>> on an old TCR.  del_timer_sync() won't help though, unless you replace
>>> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
>>> whether it's running in timer context).  A better solution is probably
>>> to use a spinlock in arm_next_watchdog().
>> 
>> Yup. Either way, we have a race that the guest might not expect.
> 
> Ok, will use spinlock in arm_next_watchdog().
> 
>> 
>>> 
>>>>> +void kvmppc_watchdog_func(unsigned long data)
>>>>> +{
>>>>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>>> +    u32 tsr, new_tsr;
>>>>> +    int final;
>>>>> +
>>>>> +    do {
>>>>> +        new_tsr = tsr = vcpu->arch.tsr;
>>>>> +        final = 0;
>>>>> +
>>>>> +        /* Time out event */
>>>>> +        if (tsr & TSR_ENW) {
>>>>> +            if (tsr & TSR_WIS) {
>>>>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
>>>>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>>> +                final = 1;
>>>>> +            } else {
>>>>> +                new_tsr = tsr | TSR_WIS;
>>>>> +            }
>>>>> +        } else {
>>>>> +            new_tsr = tsr | TSR_ENW;
>>>>> +        }
>>>>> +    } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>>>> +
>>>>> +    if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>>>> +        smp_wmb();
>>>>> +        kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>>>> +        kvm_vcpu_kick(vcpu);
>>>>> +    }
>>>>> +
>>>>> +    /*
>>>>> +     * Avoid getting a storm of timers if the guest sets
>>>>> +     * the period very short.  We'll restart it if anything
>>>>> +     * changes.
>>>>> +     */
>>>>> +    if (!final)
>>>>> +        arm_next_watchdog(vcpu);
>>>> 
>>>> Mind to explain this part a bit further?
>>> 
>>> The whole function, or some subset near the end?
>>> 
>>> The "if (!final)" check means that we stop running the timer after final
>>> expiration, to prevent the host from being flooded with timers if the
>>> guest sets a short period but does not have TCR set to exit to QEMU.
>>> Timers will resume the next time TSR/TCR is updated.
>> 
>> Ah. The semantics make sense. The comment however is slightly too short. Please
>> explain this in a more verbose way, so someone who didn't write the code knows
>> what's going on :).
> 
> Ok.
> 
>> 
>>> 
>>>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>>>>    }
>>>>> 
>>>>>    if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>>>> +        u32 old_tsr = vcpu->arch.tsr;
>>>>> +
>>>>>        vcpu->arch.tsr = sregs->u.e.tsr;
>>>>> +
>>>>> +        if ((old_tsr ^ vcpu->arch.tsr) &
>>>>> +            (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>>>> +            arm_next_watchdog(vcpu);
>>>> 
>>>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
>>> 
>>> I'm not sure that any of them should be -- there's no reason for the
>>> watchdog interrupt mechanism to be dependent on QEMU, only the
>>> heavyweight exit on final expiration.
>> 
>> Well - I like the concept of having new features switchable. Overlapping the
>> "watchdog is implemented" feature with "user space wants watchdog exits" makes
>> sense. But I definitely want to have a switch for the former, because we
>> otherwise differ quite substantially from the emulation we had before.
> 
> Ok, will guard with watchdog_enable.
> 
>> 
>>> 
>>>>> +            spr_val &= ~TCR_WRC_MASK;
>>>>> +        kvmppc_set_tcr(vcpu,
>>>>> +                       spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
>>>> 
>>>> In fact, what you're trying to do here is keep TCR_WRC always on when it was
>> enabled once. So all you need is the OR here. No need for the mask above.
>>> 
>>> WRC is a 2-bit field that is supposed to preserve its value once written
>>> to be non-zero.  Not that we actually do anything different based on the
>>> specific non-zero value, but still we should implement the architected
>>> semantics.
>> 
>> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
>> 
>> /* WRC is a 2-bit field that is supposed to preserve its value once written to
>> be non-zero */
>> spr_val &= ~TCR_WRC_MASK;
>> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>> kvmppc_set_tcr(vcpu, spr_val);
> 
> I think you mean:
> 
> if (TCR_WRC_MASK & vcpu->arch.tcr) {
>    spr_val &= ~TCR_WRC_MASK;
>    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> }
> kvmppc_set_tcr(vcpu, spr_val);

Eh, yes, of course :). Plus the comment.

Alex

> 
> Thanks
> -Bharat
> 
>> 
>> 
>> Alex
>> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood - July 9, 2012, 2:28 p.m.
On 07/09/2012 12:13 AM, Bhushan Bharat-R65777 wrote:
> 
> 
>> -----Original Message-----
>> From: Alexander Graf [mailto:agraf@suse.de]
>> Sent: Saturday, July 07, 2012 1:21 PM
>> To: Wood Scott-B07421
>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
>> Bharat-R65777
>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
>>
>>
>> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
>>
>> /* WRC is a 2-bit field that is supposed to preserve its value once written to
>> be non-zero */
>> spr_val &= ~TCR_WRC_MASK;
>> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>> kvmppc_set_tcr(vcpu, spr_val);
> 
> I think you mean:
> 
> if (TCR_WRC_MASK & vcpu->arch.tcr) {
>     spr_val &= ~TCR_WRC_MASK;
>     spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> }
> kvmppc_set_tcr(vcpu, spr_val);

Actually I think he means:

if (vcpu->arch.tcr & TCR_WRC_MASK) {
	spr_val &= ~TCR_WRC_MASK;
	spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
}

kvmppc_set_tcr(vcpu, spr_val);

:-)

-Scott

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf - July 9, 2012, 2:36 p.m.
On 09.07.2012, at 16:28, Scott Wood wrote:

> On 07/09/2012 12:13 AM, Bhushan Bharat-R65777 wrote:
>> 
>> 
>>> -----Original Message-----
>>> From: Alexander Graf [mailto:agraf@suse.de]
>>> Sent: Saturday, July 07, 2012 1:21 PM
>>> To: Wood Scott-B07421
>>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
>>> Bharat-R65777
>>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
>>> 
>>> 
>>> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
>>> 
>>> /* WRC is a 2-bit field that is supposed to preserve its value once written to
>>> be non-zero */
>>> spr_val &= ~TCR_WRC_MASK;
>>> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>>> kvmppc_set_tcr(vcpu, spr_val);
>> 
>> I think you mean:
>> 
>> if (TCR_WRC_MASK & vcpu->arch.tcr) {
>>    spr_val &= ~TCR_WRC_MASK;
>>    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>> }
>> kvmppc_set_tcr(vcpu, spr_val);
> 
> Actually I think he means:
> 
> if (vcpu->arch.tcr & TCR_WRC_MASK) {
> 	spr_val &= ~TCR_WRC_MASK;
> 	spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> }
> 
> kvmppc_set_tcr(vcpu, spr_val);

Plus the comment, yes :). I really don't like (mask & val) constructs. About as much as I dislike if (0 == x) ones.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan - July 9, 2012, 2:44 p.m.
> -----Original Message-----
> From: kvm-ppc-owner@vger.kernel.org [mailto:kvm-ppc-owner@vger.kernel.org] On
> Behalf Of Alexander Graf
> Sent: Monday, July 09, 2012 8:07 PM
> To: Wood Scott-B07421
> Cc: Bhushan Bharat-R65777; Wood Scott-B07421; kvm-ppc@vger.kernel.org;
> kvm@vger.kernel.org
> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
> 
> 
> On 09.07.2012, at 16:28, Scott Wood wrote:
> 
> > On 07/09/2012 12:13 AM, Bhushan Bharat-R65777 wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Alexander Graf [mailto:agraf@suse.de]
> >>> Sent: Saturday, July 07, 2012 1:21 PM
> >>> To: Wood Scott-B07421
> >>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org;
> >>> kvm@vger.kernel.org; Bhushan
> >>> Bharat-R65777
> >>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
> >>>
> >>>
> >>> Ah, being 2 bits wide, the above code suddenly makes more sense :).
> >>> How about
> >>>
> >>> /* WRC is a 2-bit field that is supposed to preserve its value once
> >>> written to be non-zero */ spr_val &= ~TCR_WRC_MASK; spr_val |=
> >>> vcpu->arch.tcr & TCR_WRC_MASK; kvmppc_set_tcr(vcpu, spr_val);
> >>
> >> I think you mean:
> >>
> >> if (TCR_WRC_MASK & vcpu->arch.tcr) {
> >>    spr_val &= ~TCR_WRC_MASK;
> >>    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; } kvmppc_set_tcr(vcpu,
> >> spr_val);
> >
> > Actually I think he means:
> >
> > if (vcpu->arch.tcr & TCR_WRC_MASK) {
> > 	spr_val &= ~TCR_WRC_MASK;
> > 	spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; }
> >
> > kvmppc_set_tcr(vcpu, spr_val);
> 
> Plus the comment, yes :). I really don't like (mask & val) constructs. About as
> much as I dislike if (0 == x) ones.

I think I did in v2 :)

> 
> 
> Alex
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body
> of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood - July 9, 2012, 5:09 p.m.
On 07/07/2012 02:50 AM, Alexander Graf wrote:
> 
> On 07.07.2012, at 01:37, Scott Wood wrote:
> 
>> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>>> +/*
>>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>>> + *
>>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>>> + * any realistic use.
>>>> + */
>>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>>>
>>> Should this really be in kvm code?
>>
>> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
>>
>>>> +	mask = 1ULL << (63 - period);
>>>> +	tb = get_tb();
>>>> +	if (tb & mask)
>>>> +		nr_jiffies += mask;
>>>
>>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>>> mask is basically in timebase granularity. I suppose you're just
>>> reusing the variable here for no good reason - the compiler will
>>> gladly optimize things for you if you write things a bit more verbose
>>> :).
>>
>> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
>> something generic like "ticks", "interval", "remaining", etc. would be
>> better, with a comment on the do_div saying it's converting timebase
>> ticks into jiffies.
> 
> Well, you could start off with a variable "delta_tb", then do
> 
>   nr_jiffies = delta_tb;
>   x = do_div(...);
> 
> and things would suddenly become readable :). Of course I don't object to comments along the code either :).
> 
>>
>>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	unsigned long nr_jiffies;
>>>> +
>>>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>>>> +	if (nr_jiffies < MAX_TIMEOUT)
>>>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>>> +	else
>>>> +		del_timer(&vcpu->arch.wdt_timer);
>>>
>>> Can you del a timer that's not armed? Could that ever happen in this case?
>>
>> "del_timer() deactivates a timer - this works on both active and
>> inactive timers."
> 
> Ah, good :).
> 
>>
>>> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?
>>
>> This can be called in the context of the timer, so del_timer_sync()
>> would hang.
>>
>> As for what would happen if a caller from a different context were to
>> race with a timer, I think you could end up with the timer armed based
>> on an old TCR.  del_timer_sync() won't help though, unless you replace
>> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
>> whether it's running in timer context).  A better solution is probably
>> to use a spinlock in arm_next_watchdog().
> 
> Yup. Either way, we have a race that the guest might not expect.
> 
>>
>>>> +void kvmppc_watchdog_func(unsigned long data)
>>>> +{
>>>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>> +	u32 tsr, new_tsr;
>>>> +	int final;
>>>> +
>>>> +	do {
>>>> +		new_tsr = tsr = vcpu->arch.tsr;
>>>> +		final = 0;
>>>> +
>>>> +		/* Time out event */
>>>> +		if (tsr & TSR_ENW) {
>>>> +			if (tsr & TSR_WIS) {
>>>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>>>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>> +				final = 1;
>>>> +			} else {
>>>> +				new_tsr = tsr | TSR_WIS;
>>>> +			}
>>>> +		} else {
>>>> +			new_tsr = tsr | TSR_ENW;
>>>> +		}
>>>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>>> +
>>>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>>> +		smp_wmb();
>>>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>>> +		kvm_vcpu_kick(vcpu);
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * Avoid getting a storm of timers if the guest sets
>>>> +	 * the period very short.  We'll restart it if anything
>>>> +	 * changes.
>>>> +	 */
>>>> +	if (!final)
>>>> +		arm_next_watchdog(vcpu);
>>>
>>> Mind to explain this part a bit further?
>>
>> The whole function, or some subset near the end?
>>
>> The "if (!final)" check means that we stop running the timer after final
>> expiration, to prevent the host from being flooded with timers if the
>> guest sets a short period but does not have TCR set to exit to QEMU.
>> Timers will resume the next time TSR/TCR is updated.
> 
> Ah. The semantics make sense. The comment however is slightly too short. Please explain this in a more verbose way, so someone who didn't write the code knows what's going on :).
> 
>>
>>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>>> 	}
>>>>
>>>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>>> +		u32 old_tsr = vcpu->arch.tsr;
>>>> +
>>>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>>>> +
>>>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>>>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>>> +			arm_next_watchdog(vcpu);
>>>
>>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
>>
>> I'm not sure that any of them should be -- there's no reason for the
>> watchdog interrupt mechanism to be dependent on QEMU, only the
>> heavyweight exit on final expiration.
> 
> Well - I like the concept of having new features switchable.

The code quickly becomes an unmaintainable mess if every new feature is
switchable (and where is the line between new feature and bugfix when it
comes to filling in missing emulation?).  The reason for switchability
in this case is limited to QEMU interface compatibility.

The internal-to-KVM part of the watchdog isn't much different from the
FIT -- we're not going to conditionalize that, are we?

> Overlapping the "watchdog is implemented" feature with "user space
> wants watchdog exits" makes sense. But I definitely want to have a
> switch for the former, because we otherwise differ quite
> substantially from the emulation we had before.

It differs in that it is now a bit closer to being correct.

-Scott

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf - July 9, 2012, 5:29 p.m.
On 09.07.2012, at 19:09, Scott Wood wrote:

> On 07/07/2012 02:50 AM, Alexander Graf wrote:
>> 
>> On 07.07.2012, at 01:37, Scott Wood wrote:
>> 
>>> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>>>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>>>> +/*
>>>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>>>> + *
>>>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>>>> + * any realistic use.
>>>>> + */
>>>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>>>> 
>>>> Should this really be in kvm code?
>>> 
>>> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
>>> 
>>>>> +	mask = 1ULL << (63 - period);
>>>>> +	tb = get_tb();
>>>>> +	if (tb & mask)
>>>>> +		nr_jiffies += mask;
>>>> 
>>>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>>>> mask is basically in timebase granularity. I suppose you're just
>>>> reusing the variable here for no good reason - the compiler will
>>>> gladly optimize things for you if you write things a bit more verbose
>>>> :).
>>> 
>>> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
>>> something generic like "ticks", "interval", "remaining", etc. would be
>>> better, with a comment on the do_div saying it's converting timebase
>>> ticks into jiffies.
>> 
>> Well, you could start off with a variable "delta_tb", then do
>> 
>>  nr_jiffies = delta_tb;
>>  x = do_div(...);
>> 
>> and things would suddenly become readable :). Of course I don't object to comments along the code either :).
>> 
>>> 
>>>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +	unsigned long nr_jiffies;
>>>>> +
>>>>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>>>>> +	if (nr_jiffies < MAX_TIMEOUT)
>>>>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>>>> +	else
>>>>> +		del_timer(&vcpu->arch.wdt_timer);
>>>> 
>>>> Can you del a timer that's not armed? Could that ever happen in this case?
>>> 
>>> "del_timer() deactivates a timer - this works on both active and
>>> inactive timers."
>> 
>> Ah, good :).
>> 
>>> 
>>>> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?
>>> 
>>> This can be called in the context of the timer, so del_timer_sync()
>>> would hang.
>>> 
>>> As for what would happen if a caller from a different context were to
>>> race with a timer, I think you could end up with the timer armed based
>>> on an old TCR.  del_timer_sync() won't help though, unless you replace
>>> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
>>> whether it's running in timer context).  A better solution is probably
>>> to use a spinlock in arm_next_watchdog().
>> 
>> Yup. Either way, we have a race that the guest might not expect.
>> 
>>> 
>>>>> +void kvmppc_watchdog_func(unsigned long data)
>>>>> +{
>>>>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>>> +	u32 tsr, new_tsr;
>>>>> +	int final;
>>>>> +
>>>>> +	do {
>>>>> +		new_tsr = tsr = vcpu->arch.tsr;
>>>>> +		final = 0;
>>>>> +
>>>>> +		/* Time out event */
>>>>> +		if (tsr & TSR_ENW) {
>>>>> +			if (tsr & TSR_WIS) {
>>>>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>>>>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>>> +				final = 1;
>>>>> +			} else {
>>>>> +				new_tsr = tsr | TSR_WIS;
>>>>> +			}
>>>>> +		} else {
>>>>> +			new_tsr = tsr | TSR_ENW;
>>>>> +		}
>>>>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>>>> +
>>>>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>>>> +		smp_wmb();
>>>>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>>>> +		kvm_vcpu_kick(vcpu);
>>>>> +	}
>>>>> +
>>>>> +	/*
>>>>> +	 * Avoid getting a storm of timers if the guest sets
>>>>> +	 * the period very short.  We'll restart it if anything
>>>>> +	 * changes.
>>>>> +	 */
>>>>> +	if (!final)
>>>>> +		arm_next_watchdog(vcpu);
>>>> 
>>>> Mind to explain this part a bit further?
>>> 
>>> The whole function, or some subset near the end?
>>> 
>>> The "if (!final)" check means that we stop running the timer after final
>>> expiration, to prevent the host from being flooded with timers if the
>>> guest sets a short period but does not have TCR set to exit to QEMU.
>>> Timers will resume the next time TSR/TCR is updated.
>> 
>> Ah. The semantics make sense. The comment however is slightly too short. Please explain this in a more verbose way, so someone who didn't write the code knows what's going on :).
>> 
>>> 
>>>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>>>> 	}
>>>>> 
>>>>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>>>> +		u32 old_tsr = vcpu->arch.tsr;
>>>>> +
>>>>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>>>>> +
>>>>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>>>>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>>>> +			arm_next_watchdog(vcpu);
>>>> 
>>>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
>>> 
>>> I'm not sure that any of them should be -- there's no reason for the
>>> watchdog interrupt mechanism to be dependent on QEMU, only the
>>> heavyweight exit on final expiration.
>> 
>> Well - I like the concept of having new features switchable.
> 
> The code quickly becomes an unmaintainable mess if every new feature is
> switchable (and where is the line between new feature and bugfix when it
> comes to filling in missing emulation?).  The reason for switchability
> in this case is limited to QEMU interface compatibility.
> 
> The internal-to-KVM part of the watchdog isn't much different from the
> FIT -- we're not going to conditionalize that, are we?
> 
>> Overlapping the "watchdog is implemented" feature with "user space
>> wants watchdog exits" makes sense. But I definitely want to have a
>> switch for the former, because we otherwise differ quite
>> substantially from the emulation we had before.
> 
> It differs in that it is now a bit closer to being correct.

If we can maintain user space backwards compatibility, I'm fine with not switching it, since we're effectively fixing a bug wrt the implementation of the announced features, yes.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 50ea12f..a9e5aed 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -467,6 +467,7 @@  struct kvm_vcpu_arch {
 	ulong fault_esr;
 	ulong queued_dear;
 	ulong queued_esr;
+	struct timer_list wdt_timer;
 	u32 tlbcfg[4];
 	u32 mmucfg;
 	u32 epr;
@@ -482,6 +483,7 @@  struct kvm_vcpu_arch {
 	u8 osi_needed;
 	u8 osi_enabled;
 	u8 papr_enabled;
+	u8 watchdog_enable;
 	u8 sane;
 	u8 cpu_type;
 	u8 hcall_needed;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937..e5cf4b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -67,6 +67,7 @@  extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(unsigned long data);
+extern void kvmppc_watchdog_func(unsigned long data);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
@@ -104,6 +105,8 @@  extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                          struct kvm_interrupt *irq);
+extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned int op, int *advance);
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4..7efb3f5 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -538,6 +538,8 @@ 
 #define FP_2_21		3		/* 2^21 clocks */
 #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
 #define TCR_ARE		0x00400000	/* Auto Reload Enable */
+#define TCR_GET_FSL_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
+			      (((tcr) & 0x1E0000) >> 15))
 
 /* Bit definitions for the TSR. */
 #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 50e7dbc..a213aba 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -28,6 +28,7 @@ 
 #include <asm/kvm_44x.h>
 #include <asm/kvm_ppc.h>
 
+#include "booke.h"
 #include "44x_tlb.h"
 #include "booke.h"
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097..ad30f75 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -206,6 +206,16 @@  void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
+void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
+}
+
 static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -325,6 +335,7 @@  static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		msr_mask = MSR_CE | MSR_ME | MSR_DE;
 		int_class = INT_CLASS_NONCRIT;
 		break;
+	case BOOKE_IRQPRIO_WATCHDOG:
 	case BOOKE_IRQPRIO_CRITICAL:
 	case BOOKE_IRQPRIO_DBELL_CRIT:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
@@ -404,12 +415,98 @@  static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	return allowed;
 }
 
+/*
+ * The timer system can almost deal with LONG_MAX timeouts, except that
+ * when you get very close to LONG_MAX, the slack added can cause overflow.
+ *
+ * LONG_MAX/2 is a conservative threshold, but it should be adequate for
+ * any realistic use.
+ */
+#define MAX_TIMEOUT (LONG_MAX/2)
+
+/*
+ * Return the number of jiffies until the next timeout.  If the timeout is
+ * longer than MAX_TIMEOUT, we return MAX_TIMEOUT instead.
+ */
+static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
+{
+	unsigned long long tb, mask, nr_jiffies = 0;
+	u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);
+
+	mask = 1ULL << (63 - period);
+	tb = get_tb();
+	if (tb & mask)
+		nr_jiffies += mask;
+
+	nr_jiffies += mask - (tb & (mask - 1));
+
+	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
+		nr_jiffies++;
+
+	return min_t(unsigned long long, nr_jiffies, MAX_TIMEOUT);
+}
+
+static void arm_next_watchdog(struct kvm_vcpu *vcpu)
+{
+	unsigned long nr_jiffies;
+
+	nr_jiffies = watchdog_next_timeout(vcpu);
+	if (nr_jiffies < MAX_TIMEOUT)
+		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
+	else
+		del_timer(&vcpu->arch.wdt_timer);
+}
+
+void kvmppc_watchdog_func(unsigned long data)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+	u32 tsr, new_tsr;
+	int final;
+
+	do {
+		new_tsr = tsr = vcpu->arch.tsr;
+		final = 0;
+
+		/* Time out event */
+		if (tsr & TSR_ENW) {
+			if (tsr & TSR_WIS) {
+				new_tsr = (tsr & ~TCR_WRC_MASK) |
+					  (vcpu->arch.tcr & TCR_WRC_MASK);
+				vcpu->arch.tcr &= ~TCR_WRC_MASK;
+				final = 1;
+			} else {
+				new_tsr = tsr | TSR_WIS;
+			}
+		} else {
+			new_tsr = tsr | TSR_ENW;
+		}
+	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
+
+	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * Avoid getting a storm of timers if the guest sets
+	 * the period very short.  We'll restart it if anything
+	 * changes.
+	 */
+	if (!final)
+		arm_next_watchdog(vcpu);
+}
 static void update_timer_ints(struct kvm_vcpu *vcpu)
 {
 	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
 		kvmppc_core_queue_dec(vcpu);
 	else
 		kvmppc_core_dequeue_dec(vcpu);
+
+	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
+		kvmppc_core_queue_watchdog(vcpu);
+	else
+		kvmppc_core_dequeue_watchdog(vcpu);
 }
 
 static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
@@ -516,6 +613,12 @@  int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
+	if (vcpu->arch.tsr & TCR_WRC_MASK) {
+		kvm_run->exit_reason = KVM_EXIT_WDT;
+		ret = 0;
+		goto out;
+	}
+
 	kvm_guest_enter();
 
 #ifdef CONFIG_PPC_FPU
@@ -977,6 +1080,10 @@  int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
 		}
 	}
+	if (vcpu->arch.tsr & TCR_WRC_MASK) {
+		run->exit_reason = KVM_EXIT_WDT;
+		r = RESUME_HOST | (r & RESUME_FLAG_NV);
+	}
 
 	return r;
 }
@@ -1106,7 +1213,14 @@  static int set_sregs_base(struct kvm_vcpu *vcpu,
 	}
 
 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
+		u32 old_tsr = vcpu->arch.tsr;
+
 		vcpu->arch.tsr = sregs->u.e.tsr;
+
+		if ((old_tsr ^ vcpu->arch.tsr) &
+		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
+			arm_next_watchdog(vcpu);
+
 		update_timer_ints(vcpu);
 	}
 
@@ -1267,6 +1381,8 @@  void kvmppc_core_commit_memory_region(struct kvm *kvm,
 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
 {
 	vcpu->arch.tcr = new_tcr;
+	/*
+	 * A new TCR can change the watchdog period, so recompute the timer
+	 * deadline — but only when userspace enabled watchdog emulation
+	 * (otherwise wdt_timer was never set up).
+	 */
+	if (vcpu->arch.watchdog_enable)
+		arm_next_watchdog(vcpu);
 	update_timer_ints(vcpu);
 }
 
@@ -1281,6 +1397,15 @@  void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 {
 	clear_bits(tsr_bits, &vcpu->arch.tsr);
+
+	/*
+	 * We may have stopped the watchdog due to
+	 * being stuck on final expiration.
+	 * (TCR_WRC_MASK is checked against TSR bits on purpose: the final
+	 * expiry latches the WRC field into TSR; clearing it re-arms.)
+	 */
+	if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW |
+			TSR_WIS | TCR_WRC_MASK)))
+		arm_next_watchdog(vcpu);
+
 	update_timer_ints(vcpu);
 }
 
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 12834bb..cc94d42 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -145,7 +145,11 @@  int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		kvmppc_clr_tsr_bits(vcpu, spr_val);
 		break;
 	case SPRN_TCR:
-		kvmppc_set_tcr(vcpu, spr_val);
+		/* WRC can only be programmed when WRC=0 */
+		if (TCR_WRC_MASK & vcpu->arch.tcr)
+			spr_val &= ~TCR_WRC_MASK;
+		kvmppc_set_tcr(vcpu,
+		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
 		break;
 
 	case SPRN_DECAR:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 87f4dc8..35a1ff3 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,9 +38,15 @@ 
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !(v->arch.shared->msr & MSR_WE) ||
-	       !!(v->arch.pending_exceptions) ||
-	       v->requests;
+	bool ret = !(v->arch.shared->msr & MSR_WE) ||
+		   !!(v->arch.pending_exceptions) ||
+		   v->requests;
+
+#ifdef CONFIG_BOOKE
+	/*
+	 * A pending final watchdog expiry must make the vcpu runnable so
+	 * the run loop can exit to userspace with KVM_EXIT_WDT.  Testing
+	 * TSR against TCR_WRC_MASK is intentional: the timer callback
+	 * latches TCR[WRC] into the TSR image on final expiration.
+	 */
+	ret = ret || (v->arch.tsr & TCR_WRC_MASK);
+#endif
+
+	return ret;
 }
 
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
@@ -237,6 +243,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_GET_PVINFO:
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB:
+	case KVM_CAP_PPC_WDT:
 #endif
 		r = 1;
 		break;
@@ -393,6 +400,10 @@  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	kvmppc_mmu_destroy(vcpu);
+#ifdef CONFIG_BOOKE
+	/*
+	 * The vcpu is being torn down.  del_timer_sync() (not plain
+	 * del_timer()) is required here: kvmppc_watchdog_func() may be
+	 * executing concurrently on another CPU, and a non-synchronous
+	 * delete would let it dereference the vcpu after it is freed.
+	 */
+	if (vcpu->arch.watchdog_enable)
+		del_timer_sync(&vcpu->arch.wdt_timer);
+#endif
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -637,6 +648,14 @@  static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		break;
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_WDT:
+		r = 0;
+		vcpu->arch.watchdog_enable = true;
+		setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
+		            (unsigned long)vcpu);
+		break;
+#endif
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB: {
 		struct kvm_config_tlb cfg;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa..b9fdb52 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -163,6 +163,7 @@  struct kvm_pit_config {
 #define KVM_EXIT_OSI              18
 #define KVM_EXIT_PAPR_HCALL	  19
 #define KVM_EXIT_S390_UCONTROL	  20
+#define KVM_EXIT_WDT              21
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -618,6 +619,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_PPC_WDT	81
 
 #ifdef KVM_CAP_IRQ_ROUTING