
[13/23] KVM: PPC: Book3S HV: Accumulate timing information for real-mode code

Message ID: 1426844400-12017-14-git-send-email-paulus@samba.org
State: New, archived

Commit Message

Paul Mackerras March 20, 2015, 9:39 a.m. UTC
This reads the timebase at various points in the real-mode guest
entry/exit code and uses that to accumulate total, minimum and
maximum time spent in those parts of the code.  Currently these
times are accumulated per vcpu in 5 parts of the code:

* rm_entry - time taken from the start of kvmppc_hv_entry() until
  just before entering the guest.
* rm_intr - time from when we take a hypervisor interrupt in the
  guest until we either re-enter the guest or decide to exit to the
  host.  This includes time spent handling hcalls in real mode.
* rm_exit - time from when we decide to exit the guest until the
  return from kvmppc_hv_entry().
* guest - time spent in the guest
* cede - time spent napping in real mode due to an H_CEDE hcall
  while other threads in the same vcore are active.

These times are exposed in debugfs in a directory per vcpu that
contains a file called "timings".  This file contains one line for
each of the 5 timings above, with the name followed by a colon and
4 numbers, which are the count (number of times the code has been
executed), the total time, the minimum time, and the maximum time,
all in nanoseconds.
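
For illustration, a minimal sketch of what reading the file might produce
(the numbers here are invented, and the exact per-VM debugfs path depends
on where kvm->arch.debugfs_dir is created):

	# cat /sys/kernel/debug/kvm/<vm>/vcpu0/timings
	rm_entry: 123456 1234567890 346 31415
	rm_intr: 123456 987654321 212 28000
	rm_exit: 123456 876543210 287 25000
	guest: 123456 99887766554 1042 5000000
	cede: 2345 1122334455 9000 4000000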

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_host.h     |  19 +++++
 arch/powerpc/include/asm/time.h         |   3 +
 arch/powerpc/kernel/asm-offsets.c       |  11 +++
 arch/powerpc/kernel/time.c              |   6 ++
 arch/powerpc/kvm/book3s_hv.c            | 135 ++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 105 ++++++++++++++++++++++++-
 6 files changed, 276 insertions(+), 3 deletions(-)

Comments

Alexander Graf March 20, 2015, 11:15 a.m. UTC | #1
On 20.03.15 10:39, Paul Mackerras wrote:
> [... patch description trimmed; see the commit message above ...]

Have you measured the additional overhead this brings?

> ---
> [... diffstat and most of the diff trimmed; the complete patch appears in
> the "Patch" section below ...]
>
> @@ -1480,6 +1602,19 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
>  	vcpu->arch.cpu_type = KVM_CPU_3S_64;
>  	kvmppc_sanity_check(vcpu);
>  
> +	/* Create a debugfs directory for the vcpu */
> +	sprintf(buf, "vcpu%u", id);

Please make sure to use snprintf and friends rather than assuming that you
happened to size the buffer correctly.


Alex
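
For reference, the mechanical fix is a one-liner:

	snprintf(buf, sizeof(buf), "vcpu%u", id);

"vcpu" plus at most ten digits of a u32 and a terminating NUL needs 15
bytes, so the existing char buf[16] does fit, but snprintf keeps that true
if the format or the buffer size ever changes.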
Paul Mackerras March 20, 2015, 11:25 a.m. UTC | #2
On Fri, Mar 20, 2015 at 12:15:15PM +0100, Alexander Graf wrote:
> 
> 
> On 20.03.15 10:39, Paul Mackerras wrote:
> > [... patch description trimmed ...]
> 
> Have you measured the additional overhead this brings?

I haven't - in fact I did this patch so I could measure the overhead
or improvement from other changes I did, but it doesn't measure its
own overhead, of course.  I guess I need a workload that does a
defined number of guest entries and exits, and to measure how fast it
runs with and without the patch (maybe something like H_SET_MODE in a
loop).  I'll figure something out and post the results.

Paul.
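
One shape such a workload could take, sketched as a guest-side module (the
hcall choice, iteration count, and all names here are illustrative; the
follow-up measurement below ended up using H_SET_DABR and H_PROD):

	/* Guest-side sketch: time N back-to-back real-mode hcalls. */
	#include <linux/module.h>
	#include <linux/kernel.h>
	#include <asm/hvcall.h>
	#include <asm/time.h>

	static int __init hcall_bench_init(void)
	{
		unsigned long i, n = 1000000;
		u64 start = get_tb();

		for (i = 0; i < n; i++)
			plpar_hcall_norets(H_SET_DABR, 0UL);
		pr_info("%llu timebase ticks per hcall\n",
			(get_tb() - start) / n);
		return -EAGAIN;	/* run once, don't stay loaded */
	}
	module_init(hcall_bench_init);
	MODULE_LICENSE("GPL");

Running the same binary in the guest with and without the host-side patch
and comparing the per-hcall tick counts would give the overhead directly.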
Alexander Graf March 20, 2015, 11:35 a.m. UTC | #3
On 20.03.15 12:25, Paul Mackerras wrote:
> On Fri, Mar 20, 2015 at 12:15:15PM +0100, Alexander Graf wrote:
>>
>>
>> On 20.03.15 10:39, Paul Mackerras wrote:
>>> [... patch description trimmed ...]
>>
>> Have you measured the additional overhead this brings?
> 
> I haven't - in fact I did this patch so I could measure the overhead
> or improvement from other changes I did, but it doesn't measure its
> own overhead, of course.  I guess I need a workload that does a
> defined number of guest entries and exits, and to measure how fast it
> runs with and without the patch (maybe something like H_SET_MODE in a
> loop).  I'll figure something out and post the results.

Yeah, just measure the number of exits you can handle for a simple
hcall. If there is measurable overhead, it's probably a good idea to
move the statistics gathering into #ifdef paths for DEBUGFS, or maybe
even behind a separate EXIT_TIMING config option as we have for booke.


Alex
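
For reference, booke's existing option is KVM_EXIT_TIMING in
arch/powerpc/kvm/Kconfig. A minimal sketch of what a similar option could
look like here (the symbol name and dependencies are illustrative, not part
of this patch):

	config KVM_BOOK3S_HV_EXIT_TIMING
		bool "Detailed timing for hypervisor real-mode code"
		depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS
		help
		  Calculate time taken for each vcpu in the real-mode
		  guest entry, exit and interrupt handling code, plus
		  time spent in the guest and napping after H_CEDE, and
		  expose the results in debugfs.

with the timing calls and the debugfs file creation then wrapped in
matching guards:

	#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
		vcpu->arch.debugfs_timings =
			debugfs_create_file("timings", 0444,
					    vcpu->arch.debugfs_dir,
					    vcpu, &debugfs_timings_ops);
	#endif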
Paul Mackerras March 22, 2015, 10:57 p.m. UTC | #4
On Fri, Mar 20, 2015 at 12:35:43PM +0100, Alexander Graf wrote:
> 
> 
> On 20.03.15 12:25, Paul Mackerras wrote:
> > On Fri, Mar 20, 2015 at 12:15:15PM +0100, Alexander Graf wrote:
> >> Have you measured the additional overhead this brings?
> > 
> > I haven't - in fact I did this patch so I could measure the overhead
> > or improvement from other changes I did, but it doesn't measure its
> > own overhead, of course.  I guess I need a workload that does a
> > defined number of guest entries and exits, and to measure how fast it
> > runs with and without the patch (maybe something like H_SET_MODE in a
> > loop).  I'll figure something out and post the results.
> 
> Yeah, just measure the number of exits you can handle for a simple
> hcall. If there is measurable overhead, it's probably a good idea to
> move the statistics gathering into #ifdef paths for DEBUGFS, or maybe
> even behind a separate EXIT_TIMING config option as we have for booke.

For a 1-vcpu guest on POWER8, it adds 29ns to the time for an hcall
that is handled in real mode (H_SET_DABR), which is 25%.  It adds 43ns
to the time for an hcall that is handled in the host kernel in virtual
mode (H_PROD), which is 1.2%.

I'll add a config option.

Paul.
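
(Working backwards from those percentages: 29ns / 0.25 gives roughly 116ns
for the real-mode H_SET_DABR round trip without the patch, while 43ns /
0.012 gives roughly 3.6us for the virtual-mode H_PROD path; the absolute
overhead is similar in both cases, it is simply a much larger fraction of
the short real-mode path.)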

Patch

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index f1d0bbc..286c0ce 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -369,6 +369,14 @@  struct kvmppc_slb {
 	u8 base_page_size;	/* MMU_PAGE_xxx */
 };
 
+/* Struct used to accumulate timing information in HV real mode code */
+struct kvmhv_tb_accumulator {
+	u64	seqcount;	/* used to synchronize access, also count * 2 */
+	u64	tb_total;	/* total time in timebase ticks */
+	u64	tb_min;		/* min time */
+	u64	tb_max;		/* max time */
+};
+
 # ifdef CONFIG_PPC_FSL_BOOK3E
 #define KVMPPC_BOOKE_IAC_NUM	2
 #define KVMPPC_BOOKE_DAC_NUM	2
@@ -656,6 +664,17 @@  struct kvm_vcpu_arch {
 	u64 busy_preempt;
 
 	u32 emul_inst;
+
+	struct kvmhv_tb_accumulator *cur_activity;	/* What we're timing */
+	u64	cur_tb_start;			/* when it started */
+	struct kvmhv_tb_accumulator rm_entry;	/* real-mode entry code */
+	struct kvmhv_tb_accumulator rm_intr;	/* real-mode intr handling */
+	struct kvmhv_tb_accumulator rm_exit;	/* real-mode exit code */
+	struct kvmhv_tb_accumulator guest_time;	/* guest execution */
+	struct kvmhv_tb_accumulator cede_time;	/* time napping inside guest */
+
+	struct dentry *debugfs_dir;
+	struct dentry *debugfs_timings;
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 03cbada..10fc784 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -211,5 +211,8 @@  extern void secondary_cpu_time_init(void);
 
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
+/* Convert timebase ticks to nanoseconds */
+unsigned long long tb_to_ns(unsigned long long tb_ticks);
+
 #endif /* __KERNEL__ */
 #endif /* __POWERPC_TIME_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 4717859..ec9f59c 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -458,6 +458,17 @@  int main(void)
 	DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1));
 	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
 	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
+	DEFINE(VCPU_TB_RMENTRY, offsetof(struct kvm_vcpu, arch.rm_entry));
+	DEFINE(VCPU_TB_RMINTR, offsetof(struct kvm_vcpu, arch.rm_intr));
+	DEFINE(VCPU_TB_RMEXIT, offsetof(struct kvm_vcpu, arch.rm_exit));
+	DEFINE(VCPU_TB_GUEST, offsetof(struct kvm_vcpu, arch.guest_time));
+	DEFINE(VCPU_TB_CEDE, offsetof(struct kvm_vcpu, arch.cede_time));
+	DEFINE(VCPU_CUR_ACTIVITY, offsetof(struct kvm_vcpu, arch.cur_activity));
+	DEFINE(VCPU_ACTIVITY_START, offsetof(struct kvm_vcpu, arch.cur_tb_start));
+	DEFINE(TAS_SEQCOUNT, offsetof(struct kvmhv_tb_accumulator, seqcount));
+	DEFINE(TAS_TOTAL, offsetof(struct kvmhv_tb_accumulator, tb_total));
+	DEFINE(TAS_MIN, offsetof(struct kvmhv_tb_accumulator, tb_min));
+	DEFINE(TAS_MAX, offsetof(struct kvmhv_tb_accumulator, tb_max));
 #endif
 	DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
 	DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 2d7b33f..56f4484 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -608,6 +608,12 @@  void arch_suspend_enable_irqs(void)
 }
 #endif
 
+unsigned long long tb_to_ns(unsigned long long ticks)
+{
+	return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift;
+}
+EXPORT_SYMBOL_GPL(tb_to_ns);
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  *
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6631b53..8517c33 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1411,6 +1411,127 @@  static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 	return vcore;
 }
 
+static struct debugfs_timings_element {
+	const char *name;
+	size_t offset;
+} timings[] = {
+	{"rm_entry",	offsetof(struct kvm_vcpu, arch.rm_entry)},
+	{"rm_intr",	offsetof(struct kvm_vcpu, arch.rm_intr)},
+	{"rm_exit",	offsetof(struct kvm_vcpu, arch.rm_exit)},
+	{"guest",	offsetof(struct kvm_vcpu, arch.guest_time)},
+	{"cede",	offsetof(struct kvm_vcpu, arch.cede_time)},
+};
+
+#define N_TIMINGS	(sizeof(timings) / sizeof(timings[0]))
+
+struct debugfs_timings_state {
+	struct kvm_vcpu	*vcpu;
+	unsigned int	buflen;
+	char		buf[N_TIMINGS * 100];
+};
+
+static int debugfs_timings_open(struct inode *inode, struct file *file)
+{
+	struct kvm_vcpu *vcpu = inode->i_private;
+	struct debugfs_timings_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(vcpu->kvm);
+	p->vcpu = vcpu;
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_timings_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_timings_state *p = file->private_data;
+
+	kvm_put_kvm(p->vcpu->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
+				    size_t len, loff_t *ppos)
+{
+	struct debugfs_timings_state *p = file->private_data;
+	struct kvm_vcpu *vcpu = p->vcpu;
+	char *s;
+	struct kvmhv_tb_accumulator tb;
+	u64 count;
+	loff_t pos;
+	ssize_t n;
+	int i, loops;
+	bool ok;
+
+	if (!p->buflen) {
+		s = p->buf;
+		for (i = 0; i < N_TIMINGS; ++i) {
+			struct kvmhv_tb_accumulator *acc;
+
+			acc = (struct kvmhv_tb_accumulator *)
+				((unsigned long)vcpu + timings[i].offset);
+			ok = false;
+			for (loops = 0; loops < 1000; ++loops) {
+				count = acc->seqcount;
+				if (!(count & 1)) {
+					smp_rmb();
+					tb = *acc;
+					smp_rmb();
+					if (count == acc->seqcount) {
+						ok = true;
+						break;
+					}
+				}
+				udelay(1);
+			}
+			if (!ok)
+				sprintf(s, "%s: stuck\n", timings[i].name);
+			else
+				sprintf(s, "%s: %llu %llu %llu %llu\n",
+					timings[i].name, count / 2,
+					tb_to_ns(tb.tb_total),
+					tb_to_ns(tb.tb_min),
+					tb_to_ns(tb.tb_max));
+			s += strlen(s);
+		}
+		p->buflen = s - p->buf;
+	}
+
+	pos = *ppos;
+	if (pos >= p->buflen)
+		return 0;
+	if (len > p->buflen - pos)
+		len = p->buflen - pos;
+	n = copy_to_user(buf, p->buf + pos, len);
+	if (n) {
+		if (n == len)
+			return -EFAULT;
+		len -= n;
+	}
+	*ppos = pos + len;
+	return len;
+}
+
+static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
+				     size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_timings_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_timings_open,
+	.release = debugfs_timings_release,
+	.read	 = debugfs_timings_read,
+	.write	 = debugfs_timings_write,
+	.llseek	 = generic_file_llseek,
+};
+
 static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 						   unsigned int id)
 {
@@ -1418,6 +1539,7 @@  static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	int err = -EINVAL;
 	int core;
 	struct kvmppc_vcore *vcore;
+	char buf[16];
 
 	core = id / threads_per_subcore;
 	if (core >= KVM_MAX_VCORES)
@@ -1480,6 +1602,19 @@  static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
 
+	/* Create a debugfs directory for the vcpu */
+	sprintf(buf, "vcpu%u", id);
+	if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) {
+		vcpu->arch.debugfs_dir = debugfs_create_dir(buf,
+						kvm->arch.debugfs_dir);
+		if (!IS_ERR_OR_NULL(vcpu->arch.debugfs_dir)) {
+			vcpu->arch.debugfs_timings =
+				debugfs_create_file("timings", 0444,
+						    vcpu->arch.debugfs_dir,
+						    vcpu, &debugfs_timings_ops);
+		}
+	}
+
 	return vcpu;
 
 free_vcpu:
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 0814ca1..d71ae2f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -225,9 +225,14 @@  kvm_novcpu_wakeup:
 	/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
 	ld	r4, HSTATE_KVM_VCPU(r13)
 	cmpdi	r4, 0
-	bne	kvmppc_got_guest
+	beq	kvmppc_primary_no_guest
+
+	addi	r3, r4, VCPU_TB_RMENTRY
+	bl	kvmhv_start_timing
+	b	kvmppc_got_guest
 
 kvm_novcpu_exit:
+	ld	r4, HSTATE_KVM_VCPU(r13)
 	b	hdec_soon
 
 /*
@@ -376,6 +381,12 @@  kvmppc_hv_entry:
 	li	r6, KVM_GUEST_MODE_HOST_HV
 	stb	r6, HSTATE_IN_GUEST(r13)
 
+	/* Store initial timestamp */
+	cmpdi	r4, 0
+	beq	1f
+	addi	r3, r4, VCPU_TB_RMENTRY
+	bl	kvmhv_start_timing
+1:
 	/* Clear out SLB */
 	li	r6,0
 	slbmte	r6,r6
@@ -880,6 +891,10 @@  fast_guest_return:
 	li	r9, KVM_GUEST_MODE_GUEST_HV
 	stb	r9, HSTATE_IN_GUEST(r13)
 
+	/* Accumulate timing */
+	addi	r3, r4, VCPU_TB_GUEST
+	bl	kvmhv_accumulate_time
+
 	/* Enter guest */
 
 BEGIN_FTR_SECTION
@@ -917,6 +932,20 @@  END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	hrfid
 	b	.
 
+secondary_too_late:
+	cmpdi	r4, 0
+	beq	11f
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+11:	b	kvmhv_switch_to_host
+
+hdec_soon:
+	cmpdi	r4, 0
+	beq	12f
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+12:	b	kvmhv_do_exit
+
 /******************************************************************************
  *                                                                            *
  *                               Exit code                                    *
@@ -1002,6 +1031,14 @@  END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
 	stw	r12,VCPU_TRAP(r9)
 
+	addi	r3, r9, VCPU_TB_RMINTR
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
+	ld	r5, VCPU_GPR(R5)(r9)
+	ld	r6, VCPU_GPR(R6)(r9)
+	ld	r7, VCPU_GPR(R7)(r9)
+	ld	r8, VCPU_GPR(R8)(r9)
+
 	/* Save HEIR (HV emulation assist reg) in emul_inst
 	   if this is an HEI (HV emulation interrupt, e40) */
 	li	r3,KVM_INST_FETCH_FAILED
@@ -1073,6 +1110,9 @@  guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 	beq	machine_check_realmode
 mc_cont:
+	addi	r3, r9, VCPU_TB_RMEXIT
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
 
 	/* Save guest CTRL register, set runlatch to 1 */
 6:	mfspr	r6,SPRN_CTRLF
@@ -1417,7 +1457,7 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	slbia
 	ptesync
 
-hdec_soon:			/* r12 = trap, r13 = paca */
+kvmhv_do_exit:			/* r12 = trap, r13 = paca */
 	/*
 	 * POWER7/POWER8 guest -> host partition switch code.
 	 * We don't have to lock against tlbies but we do
@@ -1476,7 +1516,7 @@  hdec_soon:			/* r12 = trap, r13 = paca */
 	addi	r6,r6,PACA_SIZE
 	bne	42b
 
-secondary_too_late:
+kvmhv_switch_to_host:
 	/* Secondary threads wait for primary to do partition switch */
 43:	ld	r5,HSTATE_KVM_VCORE(r13)
 	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
@@ -1562,6 +1602,13 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 1:	addi	r8,r8,16
 	.endr
 
+	/* Finish timing, if we have a vcpu */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	li	r3, 0
+	beq	2f
+	bl	kvmhv_accumulate_time
+2:
 	/* Unset guest mode */
 	li	r0, KVM_GUEST_MODE_NONE
 	stb	r0, HSTATE_IN_GUEST(r13)
@@ -2069,6 +2116,10 @@  _GLOBAL(kvmppc_h_cede)
 	/* save FP state */
 	bl	kvmppc_save_fp
 
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	addi	r3, r4, VCPU_TB_CEDE
+	bl	kvmhv_accumulate_time
+
 	/*
 	 * Take a nap until a decrementer or external or doobell interrupt
 	 * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the
@@ -2109,6 +2160,9 @@  kvm_end_cede:
 	/* Woken by external or decrementer interrupt */
 	ld	r1, HSTATE_HOST_R1(r13)
 
+	addi	r3, r4, VCPU_TB_RMINTR
+	bl	kvmhv_accumulate_time
+
 	/* load up FP state */
 	bl	kvmppc_load_fp
 
@@ -2429,3 +2483,48 @@  kvmppc_fix_pmao:
 	mtspr	SPRN_PMC6, r3
 	isync
 	blr
+
+/*
+ * Start timing an activity
+ * r3 = pointer to time accumulation struct, r4 = vcpu
+ */
+kvmhv_start_timing:
+	mftb	r5
+	std	r3, VCPU_CUR_ACTIVITY(r4)
+	std	r5, VCPU_ACTIVITY_START(r4)
+	blr
+
+/*
+ * Accumulate time to one activity and start another.
+ * r3 = pointer to new time accumulation struct, r4 = vcpu
+ */
+kvmhv_accumulate_time:
+	ld	r5, VCPU_CUR_ACTIVITY(r4)
+	ld	r6, VCPU_ACTIVITY_START(r4)
+	std	r3, VCPU_CUR_ACTIVITY(r4)
+	mftb	r7
+	std	r7, VCPU_ACTIVITY_START(r4)
+	cmpdi	r5, 0
+	beqlr
+	subf	r3, r6, r7
+	ld	r8, TAS_SEQCOUNT(r5)
+	cmpdi	r8, 0
+	addi	r8, r8, 1
+	std	r8, TAS_SEQCOUNT(r5)
+	lwsync
+	ld	r7, TAS_TOTAL(r5)
+	add	r7, r7, r3
+	std	r7, TAS_TOTAL(r5)
+	ld	r6, TAS_MIN(r5)
+	ld	r7, TAS_MAX(r5)
+	beq	3f
+	cmpd	r3, r6
+	bge	1f
+3:	std	r3, TAS_MIN(r5)
+1:	cmpd	r3, r7
+	ble	2f
+	std	r3, TAS_MAX(r5)
+2:	lwsync
+	addi	r8, r8, 1
+	std	r8, TAS_SEQCOUNT(r5)
+	blr
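
For readers who don't speak PowerPC assembly: kvmhv_accumulate_time is a
seqlock-style writer. It bumps seqcount to an odd value before touching the
statistics, updates total/min/max (the cmpdi/beq on the pre-increment
seqcount makes the very first sample store tb_min unconditionally), then
bumps seqcount back to an even value, with lwsync ordering the stores;
debugfs_timings_read above is the matching retry-until-stable-and-even
reader. A C sketch of the writer's protocol (illustrative only, not code
from the patch):

	/* Seqlock-style update of one accumulator, as the assembly does it. */
	static void tb_accumulate(struct kvmhv_tb_accumulator *acc, u64 delta)
	{
		u64 seq = acc->seqcount;

		acc->seqcount = seq + 1;	/* odd: update in progress */
		smp_wmb();			/* lwsync in the assembly */
		acc->tb_total += delta;
		if (seq == 0 || delta < acc->tb_min)	/* first sample sets min */
			acc->tb_min = delta;
		if (delta > acc->tb_max)
			acc->tb_max = delta;
		smp_wmb();
		acc->seqcount = seq + 2;	/* even again; count = seqcount / 2 */
	}

The reader's tick-to-nanosecond conversion is the usual fixed-point trick:
mulhdu(ticks, tb_to_ns_scale) yields the high 64 bits of the 128-bit
product, so tb_to_ns() computes approximately
ticks * tb_to_ns_scale / 2^(64 - tb_to_ns_shift), with scale and shift
precomputed from the timebase frequency.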