diff mbox

[1/9] KVM: PPC: Book3S: Add infrastructure to implement kernel-side RTAS calls

Message ID 20130214235954.GB17099@iris.ozlabs.ibm.com
State New, archived
Headers show

Commit Message

Paul Mackerras Feb. 14, 2013, 11:59 p.m. UTC
From: Michael Ellerman <michael@ellerman.id.au>

For pseries machine emulation, in order to move the interrupt
controller code to the kernel, we need to intercept some RTAS
calls in the kernel itself.  This adds an infrastructure to allow
in-kernel handlers to be registered for RTAS services by name.
A new ioctl, KVM_PPC_RTAS_DEFINE_TOKEN, then allows userspace to
associate token values with those service names.  Then, when the
guest requests an RTAS service with one of those token values, it
will be handled by the relevant in-kernel handler rather than being
passed up to userspace as at present.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 Documentation/virtual/kvm/api.txt   |   19 ++++
 arch/powerpc/include/asm/hvcall.h   |    3 +
 arch/powerpc/include/asm/kvm_host.h |    1 +
 arch/powerpc/include/asm/kvm_ppc.h  |    4 +
 arch/powerpc/include/uapi/asm/kvm.h |    6 ++
 arch/powerpc/kvm/Makefile           |    1 +
 arch/powerpc/kvm/book3s_hv.c        |   18 +++-
 arch/powerpc/kvm/book3s_pr_papr.c   |    7 ++
 arch/powerpc/kvm/book3s_rtas.c      |  182 +++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/powerpc.c          |    9 +-
 include/uapi/linux/kvm.h            |    3 +
 11 files changed, 251 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_rtas.c

Comments

Alexander Graf March 21, 2013, 8:52 a.m. UTC | #1
On 15.02.2013, at 00:59, Paul Mackerras wrote:

> From: Michael Ellerman <michael@ellerman.id.au>
> 
> For pseries machine emulation, in order to move the interrupt
> controller code to the kernel, we need to intercept some RTAS
> calls in the kernel itself.  This adds an infrastructure to allow
> in-kernel handlers to be registered for RTAS services by name.
> A new ioctl, KVM_PPC_RTAS_DEFINE_TOKEN, then allows userspace to
> associate token values with those service names.  Then, when the
> guest requests an RTAS service with one of those token values, it
> will be handled by the relevant in-kernel handler rather than being
> passed up to userspace as at present.
> 
> Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
> Documentation/virtual/kvm/api.txt   |   19 ++++
> arch/powerpc/include/asm/hvcall.h   |    3 +
> arch/powerpc/include/asm/kvm_host.h |    1 +
> arch/powerpc/include/asm/kvm_ppc.h  |    4 +
> arch/powerpc/include/uapi/asm/kvm.h |    6 ++
> arch/powerpc/kvm/Makefile           |    1 +
> arch/powerpc/kvm/book3s_hv.c        |   18 +++-
> arch/powerpc/kvm/book3s_pr_papr.c   |    7 ++
> arch/powerpc/kvm/book3s_rtas.c      |  182 +++++++++++++++++++++++++++++++++++
> arch/powerpc/kvm/powerpc.c          |    9 +-
> include/uapi/linux/kvm.h            |    3 +
> 11 files changed, 251 insertions(+), 2 deletions(-)
> create mode 100644 arch/powerpc/kvm/book3s_rtas.c
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index c2534c3..d3e2d60 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2122,6 +2122,25 @@ header; first `n_valid' valid entries with contents from the data
> written, then `n_invalid' invalid entries, invalidating any previously
> valid entries found.
> 
> +4.79 KVM_PPC_RTAS_DEFINE_TOKEN
> +
> +Capability: KVM_CAP_PPC_RTAS
> +Architectures: ppc
> +Type: vm ioctl
> +Parameters: struct kvm_rtas_token_args
> +Returns: 0 on success, -1 on error
> +
> +Defines a token value for a RTAS (Run Time Abstraction Services)
> +service in order to allow it to be handled in the kernel.  The
> +argument struct gives the name of the service, which must be the name
> +of a service that has a kernel-side implementation.  If the token
> +value is non-zero, it will be associated with that service, and
> +subsequent RTAS calls by the guest specifying that token will be
> +handled by the kernel.  If the token value is 0, then any token
> +associated with the service will be forgotten, and subsequent RTAS
> +calls by the guest for that service will be passed to userspace to be
> +handled.
> +
> 
> 5. The kvm_run structure
> ------------------------
> diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
> index 7a86706..9ea22b2 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -269,6 +269,9 @@
> #define H_GET_MPP_X		0x314
> #define MAX_HCALL_OPCODE	H_GET_MPP_X
> 
> +/* Platform specific hcalls, used by KVM */
> +#define H_RTAS			0xf000

How about you define a different hcall ID for this? Then QEMU would create its "rtas entry blob" such that KVM-routed RTAS handling goes to KVM directly.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras April 4, 2013, 5:37 a.m. UTC | #2
On Thu, Mar 21, 2013 at 09:52:16AM +0100, Alexander Graf wrote:
> > +/* Platform specific hcalls, used by KVM */
> > +#define H_RTAS			0xf000
> 
> How about you define a different hcall ID for this? Then QEMU would
> create its "rtas entry blob" such that KVM-routed RTAS handling goes
> to KVM directly.

QEMU can still do that, and I don't see that it would change the
kernel side if it did.  We would still have to have agreement between
the kernel and userspace as to what the hcall number for invoking the
in-kernel RTAS calls was, and the kernel would still have to keep a
list of token numbers and how they correspond to the functions it
provides.  The only thing different would be that the in-kernel RTAS
hcall could return to the guest if it didn't recognize the token
number, rather than pushing the problem up to userspace.  However,
that wouldn't make the code any simpler, and it isn't a situation
where performance is an issue.

Do you see some kernel-side improvements or simplifications from your
suggestion that I'm missing?  Remember, the guest gets the token
numbers from the device tree (properties under the /rtas node), so
they are under the control of userspace/QEMU.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf April 4, 2013, 9:49 a.m. UTC | #3
On 04.04.2013, at 07:37, Paul Mackerras wrote:

> On Thu, Mar 21, 2013 at 09:52:16AM +0100, Alexander Graf wrote:
>>> +/* Platform specific hcalls, used by KVM */
>>> +#define H_RTAS			0xf000
>> 
>> How about you define a different hcall ID for this? Then QEMU would
>> create its "rtas entry blob" such that KVM-routed RTAS handling goes
>> to KVM directly.
> 
> QEMU can still do that, and I don't see that it would change the
> kernel side if it did.  We would still have to have agreement between
> the kernel and userspace as to what the hcall number for invoking the
> in-kernel RTAS calls was, and the kernel would still have to keep a
> list of token numbers and how they correspond to the functions it
> provides.  The only thing different would be that the in-kernel RTAS
> hcall could return to the guest if it didn't recognize the token
> number, rather than pushing the problem up to userspace.  However,
> that wouldn't make the code any simpler, and it isn't a situation
> where performance is an issue.
> 
> Do you see some kernel-side improvements or simplifications from your
> suggestion that I'm missing?  Remember, the guest gets the token
> numbers from the device tree (properties under the /rtas node), so
> they are under the control of userspace/QEMU.

The code flow with this patch:

  <setup time>

  foreach (override in overrides)
    ioctl(OVERRIDE_RTAS, ...);

  <runtime>

  switch (hcall_id) {
  case QEMU_RTAS_ID:
    foreach (override in kvm_overrides) {
      int rtas_id = ...;
      if (override.rtas_id == rtas_id) {
        handle_rtas();
        handled = true;
      }
    }
    if (!handled)
      pass_to_qemu();
    break;
  default:
    pass_to_qemu();
    break;
  }

What I'm suggesting:

  <setup time>

  nothing from KVM's point of view

  <runtime>

  switch (hcall_id) {
  case KVM_RTAS_ID:
    handle_rtas();
    break;
  default:
    pass_to_qemu();
    break;
  }


Which one looks easier and less error prone to you? :)

Speaking of which, how does user space know that the kernel actually supports a specific RTAS token?


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras April 4, 2013, 10:38 p.m. UTC | #4
On Thu, Apr 04, 2013 at 11:49:55AM +0200, Alexander Graf wrote:
> 
> On 04.04.2013, at 07:37, Paul Mackerras wrote:
> 
> > On Thu, Mar 21, 2013 at 09:52:16AM +0100, Alexander Graf wrote:
> >>> +/* Platform specific hcalls, used by KVM */
> >>> +#define H_RTAS			0xf000
> >> 
> >> How about you define a different hcall ID for this? Then QEMU would
> >> create its "rtas entry blob" such that KVM-routed RTAS handling goes
> >> to KVM directly.
> > 
> > QEMU can still do that, and I don't see that it would change the
> > kernel side if it did.  We would still have to have agreement between
> > the kernel and userspace as to what the hcall number for invoking the
> > in-kernel RTAS calls was, and the kernel would still have to keep a
> > list of token numbers and how they correspond to the functions it
> > provides.  The only thing different would be that the in-kernel RTAS
> > hcall could return to the guest if it didn't recognize the token
> > number, rather than pushing the problem up to userspace.  However,
> > that wouldn't make the code any simpler, and it isn't a situation
> > where performance is an issue.
> > 
> > Do you see some kernel-side improvements or simplifications from your
> > suggestion that I'm missing?  Remember, the guest gets the token
> > numbers from the device tree (properties under the /rtas node), so
> > they are under the control of userspace/QEMU.
> 
> The code flow with this patch:
> 
>   <setup time>
> 
>   foreach (override in overrides)
>     ioctl(OVERRIDE_RTAS, ...);
> 
>   <runtime>
> 
>   switch (hcall_id) {
>   case QEMU_RTAS_ID:
>     foreach (override in kvm_overrides) {
>       int rtas_id = ...;
>       if (override.rtas_id == rtas_id) {
>         handle_rtas();

Actually this is more like: override.handler();

>         handled = true;
>       }
>     }
>     if (!handled)
>       pass_to_qemu();
>     break;
>   default:
>     pass_to_qemu();
>     break
>   }
> 
> What I'm suggesting:
> 
>   <setup time>
> 
>   nothing from KVM's point of view

Actually, this can't be "nothing".

The way the RTAS calls work is that there is a name and a "token"
(32-bit integer value) for each RTAS call.  The tokens have to be
unique for each different name.  Userspace puts the names and tokens
in the device tree under the /rtas node (a set of properties where the
property name is the RTAS function name and the property value is the
token).  The guest looks up the token for each RTAS function it wants
to use, and passes the token in the argument buffer for the RTAS call.

This means that userspace has to know the names and tokens for all
supported RTAS functions, both the ones implemented in the kernel and
the ones implemented in userspace.

Also, the token numbers are pretty arbitrary, and the token numbers
for the kernel-implemented RTAS functions could be chosen by userspace
or by the kernel.  If they're chosen by the kernel, then userspace
needs a way to discover them (so it can put them in the device tree),
and also has to avoid choosing any token numbers for its functions
that collide with a kernel-chosen token.  If userspace chooses the
token numbers, it has to tell the kernel what token numbers it has
chosen for the kernel-implemented RTAS functions.  We chose the latter
since it gives userspace more control.

So this <setup time> code has to be either (your suggestion):

    foreach RTAS function possibly implemented in kernel {
        query kernel token for function, by name
	if that gives an error, mark function as needing to be
	        implemented in userspace
    }
    (userspace) allocate tokens for remaining functions,
                avoiding collisions with kernel-chosen tokens

or else it is (my suggestion):

    (userspace) allocate tokens for all RTAS functions
    foreach RTAS function possibly implemented in kernel {
        tell kernel the (name, token) correspondence
    }

>   <runtime>
> 
>   switch (hcall_id) {
>   case KVM_RTAS_ID:
>     handle_rtas();

Here, you've compressed details that you expanded in your pseudo-code
above, making this a less than fair comparison.  This handle_rtas()
function has to fetch the token and branch out to the appropriate
handler routine.  Whether that's a switch statement or a loop over
registered handlers doesn't make all that much difference.

>     break;
>   default:
>     pass_to_qemu();
>     break;
>   }
> 
> 
> Which one looks easier and less error prone to you? :)
> 
> Speaking of which, how does user space know that the kernel actually
> supports a specific RTAS token? 

It's really the names that are more important; the tokens are pretty
arbitrary.  In my scheme, userspace does a KVM_PPC_RTAS_DEFINE_TOKEN
ioctl giving the name and the (userspace-chosen) token, which gets an
error if the kernel doesn't recognize the name.  In your scheme, there
would have to be an equivalent ioctl to query the (kernel-chosen)
token for a given name, which once again would return an error if the
kernel doesn't recognize the name.  Either way the kernel has to have
a list of names that it knows about.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf April 19, 2013, 3:16 p.m. UTC | #5
On 05.04.2013, at 00:38, Paul Mackerras wrote:

> On Thu, Apr 04, 2013 at 11:49:55AM +0200, Alexander Graf wrote:
>> 
>> On 04.04.2013, at 07:37, Paul Mackerras wrote:
>> 
>>> On Thu, Mar 21, 2013 at 09:52:16AM +0100, Alexander Graf wrote:
>>>>> +/* Platform specific hcalls, used by KVM */
>>>>> +#define H_RTAS			0xf000
>>>> 
>>>> How about you define a different hcall ID for this? Then QEMU would
>>>> create its "rtas entry blob" such that KVM-routed RTAS handling goes
>>>> to KVM directly.
>>> 
>>> QEMU can still do that, and I don't see that it would change the
>>> kernel side if it did.  We would still have to have agreement between
>>> the kernel and userspace as to what the hcall number for invoking the
>>> in-kernel RTAS calls was, and the kernel would still have to keep a
>>> list of token numbers and how they correspond to the functions it
>>> provides.  The only thing different would be that the in-kernel RTAS
>>> hcall could return to the guest if it didn't recognize the token
>>> number, rather than pushing the problem up to userspace.  However,
>>> that wouldn't make the code any simpler, and it isn't a situation
>>> where performance is an issue.
>>> 
>>> Do you see some kernel-side improvements or simplifications from your
>>> suggestion that I'm missing?  Remember, the guest gets the token
>>> numbers from the device tree (properties under the /rtas node), so
>>> they are under the control of userspace/QEMU.
>> 
>> The code flow with this patch:
>> 
>>  <setup time>
>> 
>>  foreach (override in overrides)
>>    ioctl(OVERRIDE_RTAS, ...);
>> 
>>  <runtime>
>> 
>>  switch (hcall_id) {
>>  case QEMU_RTAS_ID:
>>    foreach (override in kvm_overrides) {
>>      int rtas_id = ...;
>>      if (override.rtas_id == rtas_id) {
>>        handle_rtas();
> 
> Actually this is more like: override.handler();
> 
>>        handled = true;
>>      }
>>    }
>>    if (!handled)
>>      pass_to_qemu();
>>    break;
>>  default:
>>    pass_to_qemu();
>>    break
>>  }
>> 
>> What I'm suggesting:
>> 
>>  <setup time>
>> 
>>  nothing from KVM's point of view
> 
> Actually, this can't be "nothing".
> 
> The way the RTAS calls work is that there is a name and a "token"
> (32-bit integer value) for each RTAS call.  The tokens have to be
> unique for each different name.  Userspace puts the names and tokens
> in the device tree under the /rtas node (a set of properties where the
> property name is the RTAS function name and the property value is the
> token).  The guest looks up the token for each RTAS function it wants
> to use, and passes the token in the argument buffer for the RTAS call.
> 
> This means that userspace has to know the names and tokens for all
> supported RTAS functions, both the ones implemented in the kernel and
> the ones implemented in userspace.
> 
> Also, the token numbers are pretty arbitrary, and the token numbers
> for the kernel-implemented RTAS functions could be chosen by userspace
> or by the kernel.  If they're chosen by the kernel, then userspace
> needs a way to discover them (so it can put them in the device tree),
> and also has to avoid choosing any token numbers for its functions
> that collide with a kernel-chosen token.  If userspace chooses the
> token numbers, it has to tell the kernel what token numbers it has
> chosen for the kernel-implemented RTAS functions.  We chose the latter
> since it gives userspace more control.
> 
> So this <setup time> code has to be either (your suggestion):
> 
>    foreach RTAS function possibly implemented in kernel {
>        query kernel token for function, by name
> 	if that gives an error, mark function as needing to be
> 	        implemented in userspace
>    }
>    (userspace) allocate tokens for remaining functions,
>                avoiding collisions with kernel-chosen tokens
> 
> or else it is (my suggestion):
> 
>    (userspace) allocate tokens for all RTAS functions
>    foreach RTAS function possibly implemented in kernel {
>        tell kernel the (name, token) correspondence
>    }
> 
>>  <runtime>
>> 
>>  switch (hcall_id) {
>>  case KVM_RTAS_ID:
>>    handle_rtas();
> 
> Here, you've compressed details that you expanded in your pseudo-code
> above, making this a less than fair comparison.  This handle_rtas()
> function has to fetch the token and branch out to the appropriate
> handler routine.  Whether that's a switch statement or a loop over
> registered handlers doesn't make all that much difference.
> 
>>    break;
>>  default:
>>    pass_to_qemu();
>>    break;
>>  }
>> 
>> 
>> Which one looks easier and less error prone to you? :)
>> 
>> Speaking of which, how does user space know that the kernel actually
>> supports a specific RTAS token? 
> 
> It's really the names that are more important, the tokens are pretty
> arbitrary.  In my scheme, userspace does a KVM_PPC_RTAS_DEFINE_TOKEN
> ioctl giving the name and the (userspace-chosen) token, which gets an
> error if the kernel doesn't recognize the name.  In your scheme, there
> would have to be an equivalent ioctl to query the (kernel-chosen)
> token for a given name, which once again would return an error if the
> kernel doesn't recognize the name.  Either way the kernel has to have
> a list of names that it knows about.

Hrm. I think I'm slowly grasping what the real issue is.

Fair enough, your approach works for me then.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index c2534c3..d3e2d60 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2122,6 +2122,25 @@  header; first `n_valid' valid entries with contents from the data
 written, then `n_invalid' invalid entries, invalidating any previously
 valid entries found.
 
+4.79 KVM_PPC_RTAS_DEFINE_TOKEN
+
+Capability: KVM_CAP_PPC_RTAS
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_rtas_token_args
+Returns: 0 on success, -1 on error
+
+Defines a token value for a RTAS (Run Time Abstraction Services)
+service in order to allow it to be handled in the kernel.  The
+argument struct gives the name of the service, which must be the name
+of a service that has a kernel-side implementation.  If the token
+value is non-zero, it will be associated with that service, and
+subsequent RTAS calls by the guest specifying that token will be
+handled by the kernel.  If the token value is 0, then any token
+associated with the service will be forgotten, and subsequent RTAS
+calls by the guest for that service will be passed to userspace to be
+handled.
+
 
 5. The kvm_run structure
 ------------------------
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 7a86706..9ea22b2 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -269,6 +269,9 @@ 
 #define H_GET_MPP_X		0x314
 #define MAX_HCALL_OPCODE	H_GET_MPP_X
 
+/* Platform specific hcalls, used by KVM */
+#define H_RTAS			0xf000
+
 #ifndef __ASSEMBLY__
 
 /**
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8a72d59..8295dc7 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -255,6 +255,7 @@  struct kvm_arch {
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 #ifdef CONFIG_PPC_BOOK3S_64
 	struct list_head spapr_tce_tables;
+	struct list_head rtas_tokens;
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 44a657a..dd08cfa 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -165,6 +165,10 @@  extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
 
 extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
 
+extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
+extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
+extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 16064d0..d90743c 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -299,6 +299,12 @@  struct kvm_allocate_rma {
 	__u64 rma_size;
 };
 
+/* for KVM_CAP_PPC_RTAS */
+struct kvm_rtas_token_args {
+	char name[120];
+	__u64 token;	/* Use a token of 0 to undefine a mapping */
+};
+
 struct kvm_book3e_206_tlb_entry {
 	__u32 mas8;
 	__u32 mas1;
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index b772ede..432132c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -86,6 +86,7 @@  kvm-book3s_64-module-objs := \
 	emulate.o \
 	book3s.o \
 	book3s_64_vio.o \
+	book3s_rtas.o \
 	$(kvm-book3s_64-objs-y)
 
 kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 80dcc53..567c264 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -479,7 +479,7 @@  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 	unsigned long req = kvmppc_get_gpr(vcpu, 3);
 	unsigned long target, ret = H_SUCCESS;
 	struct kvm_vcpu *tvcpu;
-	int idx;
+	int idx, rc;
 
 	switch (req) {
 	case H_ENTER:
@@ -515,6 +515,19 @@  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 					kvmppc_get_gpr(vcpu, 5),
 					kvmppc_get_gpr(vcpu, 6));
 		break;
+	case H_RTAS:
+		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+			return RESUME_HOST;
+
+		rc = kvmppc_rtas_hcall(vcpu);
+
+		if (rc == -ENOENT)
+			return RESUME_HOST;
+		else if (rc == 0)
+			break;
+
+		/* Send the error out to userspace via KVM_RUN */
+		return rc;
 	default:
 		return RESUME_HOST;
 	}
@@ -1821,6 +1834,7 @@  int kvmppc_core_init_vm(struct kvm *kvm)
 	cpumask_setall(&kvm->arch.need_tlb_flush);
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 
 	kvm->arch.rma = NULL;
 
@@ -1866,6 +1880,8 @@  void kvmppc_core_destroy_vm(struct kvm *kvm)
 		kvm->arch.rma = NULL;
 	}
 
+	kvmppc_rtas_tokens_free(kvm);
+
 	kvmppc_free_hpt(kvm);
 	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index ee02b30..4efa4a4 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -246,6 +246,13 @@  int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		vcpu->stat.halt_wakeup++;
 		return EMULATE_DONE;
+	case H_RTAS:
+		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+			return RESUME_HOST;
+		if (kvmppc_rtas_hcall(vcpu))
+			break;
+		kvmppc_set_gpr(vcpu, 3, 0);
+		return EMULATE_DONE;
 	}
 
 	return EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
new file mode 100644
index 0000000..8a324e8
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -0,0 +1,235 @@ 
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/rtas.h>
+
+
+/* An RTAS service that has an in-kernel implementation. */
+struct rtas_handler {
+	void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
+	char *name;
+};
+
+/* Kernel-implemented services; empty until handlers are added here. */
+static struct rtas_handler rtas_handlers[] = { };
+
+/* Per-VM binding of a userspace-chosen token to an in-kernel handler. */
+struct rtas_token_definition {
+	struct list_head list;
+	struct rtas_handler *handler;
+	u64 token;
+};
+
+/*
+ * Compare two service names.  The comparison is bounded by the size of
+ * the name field of struct kvm_rtas_token_args, so a name copied in from
+ * userspace without a terminating NUL is never read past its buffer.
+ */
+static int rtas_name_matches(char *s1, char *s2)
+{
+	struct kvm_rtas_token_args args;
+	return !strncmp(s1, s2, sizeof(args.name));
+}
+
+/*
+ * Forget every token bound to the named service.  The caller must hold
+ * kvm->lock.  Always returns 0.
+ */
+static int rtas_token_undefine(struct kvm *kvm, char *name)
+{
+	struct rtas_token_definition *d, *tmp;
+
+	lockdep_assert_held(&kvm->lock);
+
+	/*
+	 * Don't stop at the first match: rtas_token_define() only rejects
+	 * duplicate tokens, not duplicate names, so one service may have
+	 * several tokens and all of them have to go.
+	 */
+	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+		if (rtas_name_matches(d->handler->name, name)) {
+			list_del(&d->list);
+			kfree(d);
+		}
+	}
+
+	/* It's not an error to undefine an undefined token */
+	return 0;
+}
+
+/*
+ * Bind a token to the in-kernel handler for the named service.  The
+ * caller must hold kvm->lock.
+ *
+ * Returns 0 on success, -EEXIST if the token is already defined,
+ * -ENOENT if no in-kernel handler has that name, or -ENOMEM if the
+ * definition cannot be allocated.
+ */
+static int rtas_token_define(struct kvm *kvm, char *name, u64 token)
+{
+	struct rtas_token_definition *d;
+	struct rtas_handler *h = NULL;
+	bool found;
+	int i;
+
+	lockdep_assert_held(&kvm->lock);
+
+	list_for_each_entry(d, &kvm->arch.rtas_tokens, list) {
+		if (d->token == token)
+			return -EEXIST;
+	}
+
+	found = false;
+	for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) {
+		h = &rtas_handlers[i];
+		if (rtas_name_matches(h->name, name)) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return -ENOENT;
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->handler = h;
+	d->token = token;
+
+	list_add_tail(&d->list, &kvm->arch.rtas_tokens);
+
+	return 0;
+}
+
+/*
+ * Handle the KVM_PPC_RTAS_DEFINE_TOKEN vm ioctl.  A non-zero token
+ * defines a mapping for the named service; a zero token removes it.
+ *
+ * Returns 0 on success, -EFAULT if the argument struct cannot be copied
+ * from userspace, or the error from rtas_token_define().
+ */
+int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp)
+{
+	struct kvm_rtas_token_args args;
+	int rc;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	mutex_lock(&kvm->lock);
+
+	if (args.token)
+		rc = rtas_token_define(kvm, args.name, args.token);
+	else
+		rc = rtas_token_undefine(kvm, args.name);
+
+	mutex_unlock(&kvm->lock);
+
+	return rc;
+}
+
+/*
+ * Try to handle a guest RTAS call in the kernel.
+ *
+ * Returns 0 if a registered handler ran and the args were written back
+ * to the guest, -ENOENT if no handler is registered for the token,
+ * -EINVAL if nargs is out of range, or the error from reading or
+ * writing guest memory.
+ */
+int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
+{
+	struct rtas_token_definition *d;
+	struct rtas_args args;
+	rtas_arg_t *orig_rets;
+	gpa_t args_phys;
+	int rc;
+
+	/* r4 contains the guest physical address of the RTAS args */
+	args_phys = kvmppc_get_gpr(vcpu, 4);
+
+	rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+	if (rc)
+		goto fail;
+
+	/*
+	 * nargs is supplied by the guest.  Don't let it index past the
+	 * end of our args array: there must be room for at least rets[0]
+	 * so that a handler can always report a status.
+	 */
+	if (args.nargs >= ARRAY_SIZE(args.args)) {
+		rc = -EINVAL;
+		goto fail;
+	}
+
+	/*
+	 * args->rets is a pointer into args->args. Now that we've
+	 * copied args we need to fix it up to point into our copy,
+	 * not the guest args. We also need to save the original
+	 * value so we can restore it on the way out.
+	 */
+	orig_rets = args.rets;
+	args.rets = &args.args[args.nargs];
+
+	mutex_lock(&vcpu->kvm->lock);
+
+	rc = -ENOENT;
+	list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) {
+		if (d->token == args.token) {
+			d->handler->handler(vcpu, &args);
+			rc = 0;
+			break;
+		}
+	}
+
+	mutex_unlock(&vcpu->kvm->lock);
+
+	if (rc == 0) {
+		args.rets = orig_rets;
+		rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+		if (rc)
+			goto fail;
+	}
+
+	return rc;
+
+fail:
+	/*
+	 * We only get here if the guest has called RTAS with a bogus
+	 * args pointer or nargs value. That means we can't get to the
+	 * args, and so we can't fail the RTAS call. So fail right out
+	 * to userspace, which should kill the guest.
+	 */
+	return rc;
+}
+
+/*
+ * Free all of a VM's token definitions.  No lockdep assertion here:
+ * the caller added by this patch, kvmppc_core_destroy_vm(), runs at VM
+ * teardown and is not seen taking kvm->lock around the call.
+ */
+void kvmppc_rtas_tokens_free(struct kvm *kvm)
+{
+	struct rtas_token_definition *d, *tmp;
+
+	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+		list_del(&d->list);
+		kfree(d);
+	}
+}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 934413c..26d8003 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -335,6 +335,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
 	case KVM_CAP_PPC_ALLOC_HTAB:
+	case KVM_CAP_PPC_RTAS:
 		r = 1;
 		break;
 #endif /* CONFIG_PPC_BOOK3S_64 */
@@ -946,8 +947,8 @@  long kvm_arch_vm_ioctl(struct file *filp,
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	case KVM_ALLOCATE_RMA: {
-		struct kvm *kvm = filp->private_data;
 		struct kvm_allocate_rma rma;
+		struct kvm *kvm = filp->private_data;
 
 		r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
 		if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
@@ -995,6 +996,12 @@  long kvm_arch_vm_ioctl(struct file *filp,
 			r = -EFAULT;
 		break;
 	}
+	case KVM_PPC_RTAS_DEFINE_TOKEN: {
+		struct kvm *kvm = filp->private_data;
+
+		r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
+		break;
+	}
 #endif /* CONFIG_PPC_BOOK3S_64 */
 	default:
 		r = -ENOTTY;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 9a2db57..1e2fda0 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -662,6 +662,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_HTAB_FD 84
 #define KVM_CAP_S390_CSS_SUPPORT 85
 #define KVM_CAP_PPC_EPR 86
+#define KVM_CAP_PPC_RTAS 87
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -889,6 +890,8 @@  struct kvm_s390_ucas_mapping {
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 /* Available with KVM_CAP_PPC_HTAB_FD */
 #define KVM_PPC_GET_HTAB_FD	  _IOW(KVMIO,  0xaa, struct kvm_get_htab_fd)
+/* Available with KVM_CAP_PPC_RTAS */
+#define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xab, struct kvm_rtas_token_args)
 
 /*
  * ioctls for vcpu fds