diff mbox series

[v2,10/16] KVM: PPC: Book3S HV: XIVE: add get/set accessors for the VP XIVE state

Message ID 20190222112840.25000-11-clg@kaod.org
State Superseded
Headers show
Series KVM: PPC: Book3S HV: add XIVE native exploitation mode | expand

Commit Message

Cédric Le Goater Feb. 22, 2019, 11:28 a.m. UTC
At a VCPU level, the state of the thread interrupt management
registers needs to be collected. These registers are cached under the
'xive_saved_state.w01' field of the VCPU when the VCPU context is
pulled from the HW thread. An OPAL call retrieves the backup of the
IPB register in the underlying XIVE NVT structure and merges it in the
KVM state.

The structures of the interface between QEMU and KVM provision some
extra room (two u64) for further extensions if more state needs to be
transferred back to QEMU.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 arch/powerpc/include/asm/kvm_ppc.h         | 11 +++
 arch/powerpc/include/uapi/asm/kvm.h        |  2 +
 arch/powerpc/kvm/book3s.c                  | 24 +++++++
 arch/powerpc/kvm/book3s_xive_native.c      | 82 ++++++++++++++++++++++
 Documentation/virtual/kvm/devices/xive.txt | 19 +++++
 5 files changed, 138 insertions(+)

Comments

David Gibson Feb. 25, 2019, 3:31 a.m. UTC | #1
On Fri, Feb 22, 2019 at 12:28:34PM +0100, Cédric Le Goater wrote:
> At a VCPU level, the state of the thread interrupt management
> registers needs to be collected. These registers are cached under the
> 'xive_saved_state.w01' field of the VCPU when the VPCU context is
> pulled from the HW thread. An OPAL call retrieves the backup of the
> IPB register in the underlying XIVE NVT structure and merges it in the
> KVM state.
> 
> The structures of the interface between QEMU and KVM provisions some
> extra room (two u64) for further extensions if more state needs to be
> transferred back to QEMU.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/include/asm/kvm_ppc.h         | 11 +++
>  arch/powerpc/include/uapi/asm/kvm.h        |  2 +
>  arch/powerpc/kvm/book3s.c                  | 24 +++++++
>  arch/powerpc/kvm/book3s_xive_native.c      | 82 ++++++++++++++++++++++
>  Documentation/virtual/kvm/devices/xive.txt | 19 +++++
>  5 files changed, 138 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 1e61877fe147..664c65051612 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -272,6 +272,7 @@ union kvmppc_one_reg {
>  		u64	addr;
>  		u64	length;
>  	}	vpaval;
> +	u64	xive_timaval[4];

This is doubling the size of the userspace-visible one_reg union.  Is
that safe?

>  };
>  
>  struct kvmppc_ops {
> @@ -604,6 +605,10 @@ extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>  extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
>  extern void kvmppc_xive_native_init_module(void);
>  extern void kvmppc_xive_native_exit_module(void);
> +extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
> +				     union kvmppc_one_reg *val);
> +extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
> +				     union kvmppc_one_reg *val);
>  
>  #else
>  static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
> @@ -636,6 +641,12 @@ static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>  static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
>  static inline void kvmppc_xive_native_init_module(void) { }
>  static inline void kvmppc_xive_native_exit_module(void) { }
> +static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
> +					    union kvmppc_one_reg *val)
> +{ return 0; }
> +static inline int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
> +					    union kvmppc_one_reg *val)
> +{ return -ENOENT; }
>  
>  #endif /* CONFIG_KVM_XIVE */
>  
> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> index cd78ad1020fe..42d4ef93ec2d 100644
> --- a/arch/powerpc/include/uapi/asm/kvm.h
> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> @@ -480,6 +480,8 @@ struct kvm_ppc_cpu_char {
>  #define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
>  #define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
>  
> +#define KVM_REG_PPC_VP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d)
> +
>  /* Device control API: PPC-specific devices */
>  #define KVM_DEV_MPIC_GRP_MISC		1
>  #define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 96d43f091255..f85a9211f30c 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -641,6 +641,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
>  				*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
>  			break;
>  #endif /* CONFIG_KVM_XICS */
> +#ifdef CONFIG_KVM_XIVE
> +		case KVM_REG_PPC_VP_STATE:
> +			if (!vcpu->arch.xive_vcpu) {
> +				r = -ENXIO;
> +				break;
> +			}
> +			if (xive_enabled())
> +				r = kvmppc_xive_native_get_vp(vcpu, val);
> +			else
> +				r = -ENXIO;
> +			break;
> +#endif /* CONFIG_KVM_XIVE */
>  		case KVM_REG_PPC_FSCR:
>  			*val = get_reg_val(id, vcpu->arch.fscr);
>  			break;
> @@ -714,6 +726,18 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
>  				r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
>  			break;
>  #endif /* CONFIG_KVM_XICS */
> +#ifdef CONFIG_KVM_XIVE
> +		case KVM_REG_PPC_VP_STATE:
> +			if (!vcpu->arch.xive_vcpu) {
> +				r = -ENXIO;
> +				break;
> +			}
> +			if (xive_enabled())
> +				r = kvmppc_xive_native_set_vp(vcpu, val);
> +			else
> +				r = -ENXIO;
> +			break;
> +#endif /* CONFIG_KVM_XIVE */
>  		case KVM_REG_PPC_FSCR:
>  			vcpu->arch.fscr = set_reg_val(id, *val);
>  			break;
> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> index 3debc876d5a0..132bff52d70a 100644
> --- a/arch/powerpc/kvm/book3s_xive_native.c
> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> @@ -845,6 +845,88 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
>  	return ret;
>  }
>  
> +/*
> + * Interrupt Pending Buffer (IPB) offset
> + */
> +#define TM_IPB_SHIFT 40
> +#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
> +
> +int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
> +{
> +	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +	u64 opal_state;
> +	int rc;
> +
> +	if (!kvmppc_xive_enabled(vcpu))
> +		return -EPERM;
> +
> +	if (!xc)
> +		return -ENOENT;
> +
> +	/* Thread context registers. We only care about IPB and CPPR */
> +	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
> +
> +	/*
> +	 * Return the OS CAM line to print out the VP identifier in
> +	 * the QEMU monitor. This is not restored.
> +	 */
> +	val->xive_timaval[1] = vcpu->arch.xive_cam_word;

I'm pretty dubious about this mixing of vital state information with
what's basically debug information.  Doubly so since it requires
changing the ABI to increase the one_reg union's size.

Might be better to have this control only return the 0th and 2nd u64s
from the TIMA, with the CAM debug information returned via some other
mechanism.

> +
> +	/* Get the VP state from OPAL */
> +	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
> +	if (rc)
> +		return rc;
> +
> +	/*
> +	 * Capture the backup of IPB register in the NVT structure and
> +	 * merge it in our KVM VP state.
> +	 */
> +	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
> +
> +	pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
> +		 __func__,
> +		 vcpu->arch.xive_saved_state.nsr,
> +		 vcpu->arch.xive_saved_state.cppr,
> +		 vcpu->arch.xive_saved_state.ipb,
> +		 vcpu->arch.xive_saved_state.pipr,
> +		 vcpu->arch.xive_saved_state.w01,
> +		 (u32) vcpu->arch.xive_cam_word, opal_state);

Hrm.. except you don't seem to be using the last half of the timaval
field anyway.


> +
> +	return 0;
> +}
> +
> +int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
> +{
> +	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
> +
> +	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
> +		 val->xive_timaval[0], val->xive_timaval[1]);
> +
> +	if (!kvmppc_xive_enabled(vcpu))
> +		return -EPERM;
> +
> +	if (!xc || !xive)
> +		return -ENOENT;
> +
> +	/* We can't update the state of a "pushed" VCPU	 */
> +	if (WARN_ON(vcpu->arch.xive_pushed))

What prevents userspace from tripping this WARN_ON()?

> +		return -EIO;

EBUSY might be more appropriate here.

> +
> +	/*
> +	 * Restore the thread context registers. IPB and CPPR should
> +	 * be the only ones that matter.
> +	 */
> +	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
> +
> +	/*
> +	 * There is no need to restore the XIVE internal state (IPB
> +	 * stored in the NVT) as the IPB register was merged in KVM VP
> +	 * state when captured.
> +	 */
> +	return 0;
> +}
> +
>  static int xive_native_debug_show(struct seq_file *m, void *private)
>  {
>  	struct kvmppc_xive *xive = m->private;
> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
> index a26be635cff9..1b8957c50c53 100644
> --- a/Documentation/virtual/kvm/devices/xive.txt
> +++ b/Documentation/virtual/kvm/devices/xive.txt
> @@ -102,6 +102,25 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>      -EINVAL: Not initialized source number, invalid priority or
>               invalid CPU number.
>  
> +* VCPU state
> +
> +  The XIVE IC maintains VP interrupt state in an internal structure
> +  called the NVT. When a VP is not dispatched on a HW processor
> +  thread, this structure can be updated by HW if the VP is the target
> +  of an event notification.
> +
> +  It is important for migration to capture the cached IPB from the NVT
> +  as it synthesizes the priorities of the pending interrupts. We
> +  capture a bit more to report debug information.
> +
> +  KVM_REG_PPC_VP_STATE (4 * 64bits)
> +  bits:     |  63  ....  32  |  31  ....  0  |
> +  values:   |   TIMA word0   |   TIMA word1  |
> +  bits:     | 127       ..........       64  |
> +  values:   |         VP CAM Line            |
> +  bits:     | 255       ..........      128  |
> +  values:   |            unused              |
> +
>  * Migration:
>  
>    Saving the state of a VM using the XIVE native exploitation mode
Cédric Le Goater March 13, 2019, 1:19 p.m. UTC | #2
On 2/25/19 4:31 AM, David Gibson wrote:
> On Fri, Feb 22, 2019 at 12:28:34PM +0100, Cédric Le Goater wrote:
>> At a VCPU level, the state of the thread interrupt management
>> registers needs to be collected. These registers are cached under the
>> 'xive_saved_state.w01' field of the VCPU when the VPCU context is
>> pulled from the HW thread. An OPAL call retrieves the backup of the
>> IPB register in the underlying XIVE NVT structure and merges it in the
>> KVM state.
>>
>> The structures of the interface between QEMU and KVM provisions some
>> extra room (two u64) for further extensions if more state needs to be
>> transferred back to QEMU.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  arch/powerpc/include/asm/kvm_ppc.h         | 11 +++
>>  arch/powerpc/include/uapi/asm/kvm.h        |  2 +
>>  arch/powerpc/kvm/book3s.c                  | 24 +++++++
>>  arch/powerpc/kvm/book3s_xive_native.c      | 82 ++++++++++++++++++++++
>>  Documentation/virtual/kvm/devices/xive.txt | 19 +++++
>>  5 files changed, 138 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
>> index 1e61877fe147..664c65051612 100644
>> --- a/arch/powerpc/include/asm/kvm_ppc.h
>> +++ b/arch/powerpc/include/asm/kvm_ppc.h
>> @@ -272,6 +272,7 @@ union kvmppc_one_reg {
>>  		u64	addr;
>>  		u64	length;
>>  	}	vpaval;
>> +	u64	xive_timaval[4];
> 
> This is doubling the size of the userspace visible one_reg union.  Is
> that safe?

'Safe' as in compatibility with an older KVM that would still use the old
kvmppc_one_reg definition?

It should be fine, as KVM_REG_PPC_VP_STATE would not be handled. Am I wrong?

>>  };
>>  
>>  struct kvmppc_ops {
>> @@ -604,6 +605,10 @@ extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>>  extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
>>  extern void kvmppc_xive_native_init_module(void);
>>  extern void kvmppc_xive_native_exit_module(void);
>> +extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
>> +				     union kvmppc_one_reg *val);
>> +extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
>> +				     union kvmppc_one_reg *val);
>>  
>>  #else
>>  static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
>> @@ -636,6 +641,12 @@ static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>>  static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
>>  static inline void kvmppc_xive_native_init_module(void) { }
>>  static inline void kvmppc_xive_native_exit_module(void) { }
>> +static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
>> +					    union kvmppc_one_reg *val)
>> +{ return 0; }
>> +static inline int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
>> +					    union kvmppc_one_reg *val)
>> +{ return -ENOENT; }
>>  
>>  #endif /* CONFIG_KVM_XIVE */
>>  
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>> index cd78ad1020fe..42d4ef93ec2d 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -480,6 +480,8 @@ struct kvm_ppc_cpu_char {
>>  #define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
>>  #define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
>>  
>> +#define KVM_REG_PPC_VP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d)
>> +
>>  /* Device control API: PPC-specific devices */
>>  #define KVM_DEV_MPIC_GRP_MISC		1
>>  #define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
>> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
>> index 96d43f091255..f85a9211f30c 100644
>> --- a/arch/powerpc/kvm/book3s.c
>> +++ b/arch/powerpc/kvm/book3s.c
>> @@ -641,6 +641,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
>>  				*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
>>  			break;
>>  #endif /* CONFIG_KVM_XICS */
>> +#ifdef CONFIG_KVM_XIVE
>> +		case KVM_REG_PPC_VP_STATE:
>> +			if (!vcpu->arch.xive_vcpu) {
>> +				r = -ENXIO;
>> +				break;
>> +			}
>> +			if (xive_enabled())
>> +				r = kvmppc_xive_native_get_vp(vcpu, val);
>> +			else
>> +				r = -ENXIO;
>> +			break;
>> +#endif /* CONFIG_KVM_XIVE */
>>  		case KVM_REG_PPC_FSCR:
>>  			*val = get_reg_val(id, vcpu->arch.fscr);
>>  			break;
>> @@ -714,6 +726,18 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
>>  				r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
>>  			break;
>>  #endif /* CONFIG_KVM_XICS */
>> +#ifdef CONFIG_KVM_XIVE
>> +		case KVM_REG_PPC_VP_STATE:
>> +			if (!vcpu->arch.xive_vcpu) {
>> +				r = -ENXIO;
>> +				break;
>> +			}
>> +			if (xive_enabled())
>> +				r = kvmppc_xive_native_set_vp(vcpu, val);
>> +			else
>> +				r = -ENXIO;
>> +			break;
>> +#endif /* CONFIG_KVM_XIVE */
>>  		case KVM_REG_PPC_FSCR:
>>  			vcpu->arch.fscr = set_reg_val(id, *val);
>>  			break;
>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
>> index 3debc876d5a0..132bff52d70a 100644
>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>> @@ -845,6 +845,88 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
>>  	return ret;
>>  }
>>  
>> +/*
>> + * Interrupt Pending Buffer (IPB) offset
>> + */
>> +#define TM_IPB_SHIFT 40
>> +#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
>> +
>> +int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
>> +{
>> +	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>> +	u64 opal_state;
>> +	int rc;
>> +
>> +	if (!kvmppc_xive_enabled(vcpu))
>> +		return -EPERM;
>> +
>> +	if (!xc)
>> +		return -ENOENT;
>> +
>> +	/* Thread context registers. We only care about IPB and CPPR */
>> +	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
>> +
>> +	/*
>> +	 * Return the OS CAM line to print out the VP identifier in
>> +	 * the QEMU monitor. This is not restored.
>> +	 */
>> +	val->xive_timaval[1] = vcpu->arch.xive_cam_word;
> 
> I'm pretty dubious about this mixing of vital state information with
> what's basically debug information. 

I think QEMU deserves to know about the OS CAM line value. I was even
thinking about adding the POOL CAM line value for future use (nested).

> Doubly so since it requires changing the ABI to increase 
> the one_reg union's size.

OK. That's one argument.
 
> Might be better to have this control only return the 0th and 2nd u64s
> from the TIMA, with the CAM debug information returned via some other
> mechanism.

Like an extra reg: KVM_REG_PPC_VP_CAM?

>> +
>> +	/* Get the VP state from OPAL */
>> +	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
>> +	if (rc)
>> +		return rc;
>> +
>> +	/*
>> +	 * Capture the backup of IPB register in the NVT structure and
>> +	 * merge it in our KVM VP state.
>> +	 */
>> +	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
>> +
>> +	pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
>> +		 __func__,
>> +		 vcpu->arch.xive_saved_state.nsr,
>> +		 vcpu->arch.xive_saved_state.cppr,
>> +		 vcpu->arch.xive_saved_state.ipb,
>> +		 vcpu->arch.xive_saved_state.pipr,
>> +		 vcpu->arch.xive_saved_state.w01,
>> +		 (u32) vcpu->arch.xive_cam_word, opal_state);
> 
> Hrm.. except you don't seem to be using the last half of the timaval
> field anyway.

Yes. The two u64 are extras. We can do without. 

Would it be OK if I stored the w01 regs in the first u64 and the CAM line(s)
in the second, and removed the extra two u64s?
 
>> +
>> +	return 0;
>> +}
>> +
>> +int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
>> +{
>> +	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>> +	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
>> +
>> +	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
>> +		 val->xive_timaval[0], val->xive_timaval[1]);
>> +
>> +	if (!kvmppc_xive_enabled(vcpu))
>> +		return -EPERM;
>> +
>> +	if (!xc || !xive)
>> +		return -ENOENT;
>> +
>> +	/* We can't update the state of a "pushed" VCPU	 */
>> +	if (WARN_ON(vcpu->arch.xive_pushed))
> 
> What prevents userspace from tripping this WARN_ON()?

If the vCPU is executing a vCPU ioctl, it means that it has exited the guest
and that its interrupt context has been pulled out of XIVE.
 
>> +		return -EIO;
> 
> EBUSY might be more appropriate here.

OK.

Thanks,

C. 

> 
>> +
>> +	/*
>> +	 * Restore the thread context registers. IPB and CPPR should
>> +	 * be the only ones that matter.
>> +	 */
>> +	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
>> +
>> +	/*
>> +	 * There is no need to restore the XIVE internal state (IPB
>> +	 * stored in the NVT) as the IPB register was merged in KVM VP
>> +	 * state when captured.
>> +	 */
>> +	return 0;
>> +}
>> +
>>  static int xive_native_debug_show(struct seq_file *m, void *private)
>>  {
>>  	struct kvmppc_xive *xive = m->private;
>> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
>> index a26be635cff9..1b8957c50c53 100644
>> --- a/Documentation/virtual/kvm/devices/xive.txt
>> +++ b/Documentation/virtual/kvm/devices/xive.txt
>> @@ -102,6 +102,25 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>>      -EINVAL: Not initialized source number, invalid priority or
>>               invalid CPU number.
>>  
>> +* VCPU state
>> +
>> +  The XIVE IC maintains VP interrupt state in an internal structure
>> +  called the NVT. When a VP is not dispatched on a HW processor
>> +  thread, this structure can be updated by HW if the VP is the target
>> +  of an event notification.
>> +
>> +  It is important for migration to capture the cached IPB from the NVT
>> +  as it synthesizes the priorities of the pending interrupts. We
>> +  capture a bit more to report debug information.
>> +
>> +  KVM_REG_PPC_VP_STATE (4 * 64bits)
>> +  bits:     |  63  ....  32  |  31  ....  0  |
>> +  values:   |   TIMA word0   |   TIMA word1  |
>> +  bits:     | 127       ..........       64  |
>> +  values:   |         VP CAM Line            |
>> +  bits:     | 255       ..........      128  |
>> +  values:   |            unused              |
>> +
>>  * Migration:
>>  
>>    Saving the state of a VM using the XIVE native exploitation mode
>
David Gibson March 14, 2019, 3:09 a.m. UTC | #3
On Wed, Mar 13, 2019 at 02:19:13PM +0100, Cédric Le Goater wrote:
> On 2/25/19 4:31 AM, David Gibson wrote:
> > On Fri, Feb 22, 2019 at 12:28:34PM +0100, Cédric Le Goater wrote:
> >> At a VCPU level, the state of the thread interrupt management
> >> registers needs to be collected. These registers are cached under the
> >> 'xive_saved_state.w01' field of the VCPU when the VPCU context is
> >> pulled from the HW thread. An OPAL call retrieves the backup of the
> >> IPB register in the underlying XIVE NVT structure and merges it in the
> >> KVM state.
> >>
> >> The structures of the interface between QEMU and KVM provisions some
> >> extra room (two u64) for further extensions if more state needs to be
> >> transferred back to QEMU.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  arch/powerpc/include/asm/kvm_ppc.h         | 11 +++
> >>  arch/powerpc/include/uapi/asm/kvm.h        |  2 +
> >>  arch/powerpc/kvm/book3s.c                  | 24 +++++++
> >>  arch/powerpc/kvm/book3s_xive_native.c      | 82 ++++++++++++++++++++++
> >>  Documentation/virtual/kvm/devices/xive.txt | 19 +++++
> >>  5 files changed, 138 insertions(+)
> >>
> >> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> >> index 1e61877fe147..664c65051612 100644
> >> --- a/arch/powerpc/include/asm/kvm_ppc.h
> >> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> >> @@ -272,6 +272,7 @@ union kvmppc_one_reg {
> >>  		u64	addr;
> >>  		u64	length;
> >>  	}	vpaval;
> >> +	u64	xive_timaval[4];
> > 
> > This is doubling the size of the userspace visible one_reg union.  Is
> > that safe?
> 
> 'safe' as in compatibility on an older KVM which would still use the old 
> kvmppc_one_reg definition ?

I was more thinking of old qemu with a new kernel.

> It should be fine as KVM_REG_PPC_VP_STATE would not be handled. Am I
> wrong ?

Looks like it should be ok, because we only partially copy the
structure to/from userspace due to the one_reg_size() logic.  If the
whole union was always copied, it would be hilariously unsafe.

> 
> >>  };
> >>  
> >>  struct kvmppc_ops {
> >> @@ -604,6 +605,10 @@ extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
> >>  extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
> >>  extern void kvmppc_xive_native_init_module(void);
> >>  extern void kvmppc_xive_native_exit_module(void);
> >> +extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
> >> +				     union kvmppc_one_reg *val);
> >> +extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
> >> +				     union kvmppc_one_reg *val);
> >>  
> >>  #else
> >>  static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
> >> @@ -636,6 +641,12 @@ static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
> >>  static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
> >>  static inline void kvmppc_xive_native_init_module(void) { }
> >>  static inline void kvmppc_xive_native_exit_module(void) { }
> >> +static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
> >> +					    union kvmppc_one_reg *val)
> >> +{ return 0; }
> >> +static inline int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
> >> +					    union kvmppc_one_reg *val)
> >> +{ return -ENOENT; }
> >>  
> >>  #endif /* CONFIG_KVM_XIVE */
> >>  
> >> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> >> index cd78ad1020fe..42d4ef93ec2d 100644
> >> --- a/arch/powerpc/include/uapi/asm/kvm.h
> >> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> >> @@ -480,6 +480,8 @@ struct kvm_ppc_cpu_char {
> >>  #define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
> >>  #define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
> >>  
> >> +#define KVM_REG_PPC_VP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d)
> >> +
> >>  /* Device control API: PPC-specific devices */
> >>  #define KVM_DEV_MPIC_GRP_MISC		1
> >>  #define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
> >> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> >> index 96d43f091255..f85a9211f30c 100644
> >> --- a/arch/powerpc/kvm/book3s.c
> >> +++ b/arch/powerpc/kvm/book3s.c
> >> @@ -641,6 +641,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
> >>  				*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
> >>  			break;
> >>  #endif /* CONFIG_KVM_XICS */
> >> +#ifdef CONFIG_KVM_XIVE
> >> +		case KVM_REG_PPC_VP_STATE:
> >> +			if (!vcpu->arch.xive_vcpu) {
> >> +				r = -ENXIO;
> >> +				break;
> >> +			}
> >> +			if (xive_enabled())
> >> +				r = kvmppc_xive_native_get_vp(vcpu, val);
> >> +			else
> >> +				r = -ENXIO;
> >> +			break;
> >> +#endif /* CONFIG_KVM_XIVE */
> >>  		case KVM_REG_PPC_FSCR:
> >>  			*val = get_reg_val(id, vcpu->arch.fscr);
> >>  			break;
> >> @@ -714,6 +726,18 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
> >>  				r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
> >>  			break;
> >>  #endif /* CONFIG_KVM_XICS */
> >> +#ifdef CONFIG_KVM_XIVE
> >> +		case KVM_REG_PPC_VP_STATE:
> >> +			if (!vcpu->arch.xive_vcpu) {
> >> +				r = -ENXIO;
> >> +				break;
> >> +			}
> >> +			if (xive_enabled())
> >> +				r = kvmppc_xive_native_set_vp(vcpu, val);
> >> +			else
> >> +				r = -ENXIO;
> >> +			break;
> >> +#endif /* CONFIG_KVM_XIVE */
> >>  		case KVM_REG_PPC_FSCR:
> >>  			vcpu->arch.fscr = set_reg_val(id, *val);
> >>  			break;
> >> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> >> index 3debc876d5a0..132bff52d70a 100644
> >> --- a/arch/powerpc/kvm/book3s_xive_native.c
> >> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> >> @@ -845,6 +845,88 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
> >>  	return ret;
> >>  }
> >>  
> >> +/*
> >> + * Interrupt Pending Buffer (IPB) offset
> >> + */
> >> +#define TM_IPB_SHIFT 40
> >> +#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
> >> +
> >> +int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
> >> +{
> >> +	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> >> +	u64 opal_state;
> >> +	int rc;
> >> +
> >> +	if (!kvmppc_xive_enabled(vcpu))
> >> +		return -EPERM;
> >> +
> >> +	if (!xc)
> >> +		return -ENOENT;
> >> +
> >> +	/* Thread context registers. We only care about IPB and CPPR */
> >> +	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
> >> +
> >> +	/*
> >> +	 * Return the OS CAM line to print out the VP identifier in
> >> +	 * the QEMU monitor. This is not restored.
> >> +	 */
> >> +	val->xive_timaval[1] = vcpu->arch.xive_cam_word;
> > 
> > I'm pretty dubious about this mixing of vital state information with
> > what's basically debug information. 
> 
> I think QEMU deserves to know about the OS CAM line value. I was even 
> thinking about adding the POOL CAM line value for future use (nested) 
> 
> > Doubly so since it requires changing the ABI to increase 
> > the one_reg union's size.
> 
> OK. That's one argument.
>  
> > Might be better to have this control only return the 0th and 2nd u64s
> > from the TIMA, with the CAM debug information returned via some other
> > mechanism.
> 
> Like an extra reg : KVM_REG_PPC_VP_CAM ? 

That would be the obvious choice, yes.

> >> +
> >> +	/* Get the VP state from OPAL */
> >> +	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
> >> +	if (rc)
> >> +		return rc;
> >> +
> >> +	/*
> >> +	 * Capture the backup of IPB register in the NVT structure and
> >> +	 * merge it in our KVM VP state.
> >> +	 */
> >> +	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
> >> +
> >> +	pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
> >> +		 __func__,
> >> +		 vcpu->arch.xive_saved_state.nsr,
> >> +		 vcpu->arch.xive_saved_state.cppr,
> >> +		 vcpu->arch.xive_saved_state.ipb,
> >> +		 vcpu->arch.xive_saved_state.pipr,
> >> +		 vcpu->arch.xive_saved_state.w01,
> >> +		 (u32) vcpu->arch.xive_cam_word, opal_state);
> > 
> > Hrm.. except you don't seem to be using the last half of the timaval
> > field anyway.
> 
> Yes. The two u64 are extras. We can do without. 
> 
> Would that be ok if I stored the w01 regs in the first u64, the CAM line(s) 
> in the second and remove the extra two u64 ?

I'd still prefer them in separate regs.  They kind of belong to
different categories of information, and I can't think of any
particular reason you'd have to update or fetch them as a unit.

>  
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
> >> +{
> >> +	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> >> +	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
> >> +
> >> +	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
> >> +		 val->xive_timaval[0], val->xive_timaval[1]);
> >> +
> >> +	if (!kvmppc_xive_enabled(vcpu))
> >> +		return -EPERM;
> >> +
> >> +	if (!xc || !xive)
> >> +		return -ENOENT;
> >> +
> >> +	/* We can't update the state of a "pushed" VCPU	 */
> >> +	if (WARN_ON(vcpu->arch.xive_pushed))
> > 
> > What prevents userspace from tripping this WARN_ON()?
> 
> if the vCPU is executing a vCPU ioctl, it means that it exited the guest 
> and that its interrupt context has been pulled out of XIVE.

But couldn't one user thread call the vcpu ioctl() while another is
inside the guest?

> >> +		return -EIO;
> > 
> > EBUSY might be more appropriate here.
> 
> OK.
> 
> Thanks,
> 
> C. 
> 
> > 
> >> +
> >> +	/*
> >> +	 * Restore the thread context registers. IPB and CPPR should
> >> +	 * be the only ones that matter.
> >> +	 */
> >> +	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
> >> +
> >> +	/*
> >> +	 * There is no need to restore the XIVE internal state (IPB
> >> +	 * stored in the NVT) as the IPB register was merged in KVM VP
> >> +	 * state when captured.
> >> +	 */
> >> +	return 0;
> >> +}
> >> +
> >>  static int xive_native_debug_show(struct seq_file *m, void *private)
> >>  {
> >>  	struct kvmppc_xive *xive = m->private;
> >> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
> >> index a26be635cff9..1b8957c50c53 100644
> >> --- a/Documentation/virtual/kvm/devices/xive.txt
> >> +++ b/Documentation/virtual/kvm/devices/xive.txt
> >> @@ -102,6 +102,25 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
> >>      -EINVAL: Not initialized source number, invalid priority or
> >>               invalid CPU number.
> >>  
> >> +* VCPU state
> >> +
> >> +  The XIVE IC maintains VP interrupt state in an internal structure
> >> +  called the NVT. When a VP is not dispatched on a HW processor
> >> +  thread, this structure can be updated by HW if the VP is the target
> >> +  of an event notification.
> >> +
> >> +  It is important for migration to capture the cached IPB from the NVT
> >> +  as it synthesizes the priorities of the pending interrupts. We
> >> +  capture a bit more to report debug information.
> >> +
> >> +  KVM_REG_PPC_VP_STATE (4 * 64bits)
> >> +  bits:     |  63  ....  32  |  31  ....  0  |
> >> +  values:   |   TIMA word0   |   TIMA word1  |
> >> +  bits:     | 127       ..........       64  |
> >> +  values:   |         VP CAM Line            |
> >> +  bits:     | 255       ..........      128  |
> >> +  values:   |            unused              |
> >> +
> >>  * Migration:
> >>  
> >>    Saving the state of a VM using the XIVE native exploitation mode
> > 
>
Cédric Le Goater March 14, 2019, 7:08 a.m. UTC | #4
On 3/14/19 4:09 AM, David Gibson wrote:
> On Wed, Mar 13, 2019 at 02:19:13PM +0100, Cédric Le Goater wrote:
>> On 2/25/19 4:31 AM, David Gibson wrote:
>>> On Fri, Feb 22, 2019 at 12:28:34PM +0100, Cédric Le Goater wrote:
>>>> At a VCPU level, the state of the thread interrupt management
>>>> registers needs to be collected. These registers are cached under the
>>>> 'xive_saved_state.w01' field of the VCPU when the VPCU context is
>>>> pulled from the HW thread. An OPAL call retrieves the backup of the
>>>> IPB register in the underlying XIVE NVT structure and merges it in the
>>>> KVM state.
>>>>
>>>> The structures of the interface between QEMU and KVM provisions some
>>>> extra room (two u64) for further extensions if more state needs to be
>>>> transferred back to QEMU.
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>> ---
>>>>  arch/powerpc/include/asm/kvm_ppc.h         | 11 +++
>>>>  arch/powerpc/include/uapi/asm/kvm.h        |  2 +
>>>>  arch/powerpc/kvm/book3s.c                  | 24 +++++++
>>>>  arch/powerpc/kvm/book3s_xive_native.c      | 82 ++++++++++++++++++++++
>>>>  Documentation/virtual/kvm/devices/xive.txt | 19 +++++
>>>>  5 files changed, 138 insertions(+)
>>>>
>>>> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
>>>> index 1e61877fe147..664c65051612 100644
>>>> --- a/arch/powerpc/include/asm/kvm_ppc.h
>>>> +++ b/arch/powerpc/include/asm/kvm_ppc.h
>>>> @@ -272,6 +272,7 @@ union kvmppc_one_reg {
>>>>  		u64	addr;
>>>>  		u64	length;
>>>>  	}	vpaval;
>>>> +	u64	xive_timaval[4];
>>>
>>> This is doubling the size of the userspace visible one_reg union.  Is
>>> that safe?
>>
>> 'safe' as in compatibility on an older KVM which would still use the old 
>> kvmppc_one_reg definition ?
> 
> I was more thinking of old qemu with a new kernel.
> 
>> It should be fine as KVM_REG_PPC_VP_STATE would not be handled. Am I
>> wrong ?
> 
> Looks like it should be ok, because we only partially copy the
> structure to/from userspace due to the one_reg_size() logic.  If the
> whole union was always copied, it would be hilariously unsafe.
> 
>>
>>>>  };
>>>>  
>>>>  struct kvmppc_ops {
>>>> @@ -604,6 +605,10 @@ extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>>>>  extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
>>>>  extern void kvmppc_xive_native_init_module(void);
>>>>  extern void kvmppc_xive_native_exit_module(void);
>>>> +extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
>>>> +				     union kvmppc_one_reg *val);
>>>> +extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
>>>> +				     union kvmppc_one_reg *val);
>>>>  
>>>>  #else
>>>>  static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
>>>> @@ -636,6 +641,12 @@ static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>>>>  static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
>>>>  static inline void kvmppc_xive_native_init_module(void) { }
>>>>  static inline void kvmppc_xive_native_exit_module(void) { }
>>>> +static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
>>>> +					    union kvmppc_one_reg *val)
>>>> +{ return 0; }
>>>> +static inline int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
>>>> +					    union kvmppc_one_reg *val)
>>>> +{ return -ENOENT; }
>>>>  
>>>>  #endif /* CONFIG_KVM_XIVE */
>>>>  
>>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>>>> index cd78ad1020fe..42d4ef93ec2d 100644
>>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>>>> @@ -480,6 +480,8 @@ struct kvm_ppc_cpu_char {
>>>>  #define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
>>>>  #define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
>>>>  
>>>> +#define KVM_REG_PPC_VP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d)
>>>> +
>>>>  /* Device control API: PPC-specific devices */
>>>>  #define KVM_DEV_MPIC_GRP_MISC		1
>>>>  #define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
>>>> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
>>>> index 96d43f091255..f85a9211f30c 100644
>>>> --- a/arch/powerpc/kvm/book3s.c
>>>> +++ b/arch/powerpc/kvm/book3s.c
>>>> @@ -641,6 +641,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
>>>>  				*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
>>>>  			break;
>>>>  #endif /* CONFIG_KVM_XICS */
>>>> +#ifdef CONFIG_KVM_XIVE
>>>> +		case KVM_REG_PPC_VP_STATE:
>>>> +			if (!vcpu->arch.xive_vcpu) {
>>>> +				r = -ENXIO;
>>>> +				break;
>>>> +			}
>>>> +			if (xive_enabled())
>>>> +				r = kvmppc_xive_native_get_vp(vcpu, val);
>>>> +			else
>>>> +				r = -ENXIO;
>>>> +			break;
>>>> +#endif /* CONFIG_KVM_XIVE */
>>>>  		case KVM_REG_PPC_FSCR:
>>>>  			*val = get_reg_val(id, vcpu->arch.fscr);
>>>>  			break;
>>>> @@ -714,6 +726,18 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
>>>>  				r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
>>>>  			break;
>>>>  #endif /* CONFIG_KVM_XICS */
>>>> +#ifdef CONFIG_KVM_XIVE
>>>> +		case KVM_REG_PPC_VP_STATE:
>>>> +			if (!vcpu->arch.xive_vcpu) {
>>>> +				r = -ENXIO;
>>>> +				break;
>>>> +			}
>>>> +			if (xive_enabled())
>>>> +				r = kvmppc_xive_native_set_vp(vcpu, val);
>>>> +			else
>>>> +				r = -ENXIO;
>>>> +			break;
>>>> +#endif /* CONFIG_KVM_XIVE */
>>>>  		case KVM_REG_PPC_FSCR:
>>>>  			vcpu->arch.fscr = set_reg_val(id, *val);
>>>>  			break;
>>>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
>>>> index 3debc876d5a0..132bff52d70a 100644
>>>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>>>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>>>> @@ -845,6 +845,88 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
>>>>  	return ret;
>>>>  }
>>>>  
>>>> +/*
>>>> + * Interrupt Pending Buffer (IPB) offset
>>>> + */
>>>> +#define TM_IPB_SHIFT 40
>>>> +#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
>>>> +
>>>> +int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
>>>> +{
>>>> +	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>>>> +	u64 opal_state;
>>>> +	int rc;
>>>> +
>>>> +	if (!kvmppc_xive_enabled(vcpu))
>>>> +		return -EPERM;
>>>> +
>>>> +	if (!xc)
>>>> +		return -ENOENT;
>>>> +
>>>> +	/* Thread context registers. We only care about IPB and CPPR */
>>>> +	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
>>>> +
>>>> +	/*
>>>> +	 * Return the OS CAM line to print out the VP identifier in
>>>> +	 * the QEMU monitor. This is not restored.
>>>> +	 */
>>>> +	val->xive_timaval[1] = vcpu->arch.xive_cam_word;
>>>
>>> I'm pretty dubious about this mixing of vital state information with
>>> what's basically debug information. 
>>
>> I think QEMU deserves to know about the OS CAM line value. I was even 
>> thinking about adding the POOL CAM line value for future use (nested) 
>>
>>> Doubly so since it requires changing the ABI to increase 
>>> the one_reg union's size.
>>
>> OK. That's one argument.
>>  
>>> Might be better to have this control only return the 0th and 2nd u64s
>>> from the TIMA, with the CAM debug information returned via some other
>>> mechanism.
>>
>> Like an extra reg : KVM_REG_PPC_VP_CAM ? 
> 
> That would be the obvious choice, yes.

OK. Let's keep that in mind but I think it is overkill. I would rather
have one reg per ring instead.

>>>> +
>>>> +	/* Get the VP state from OPAL */
>>>> +	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
>>>> +	if (rc)
>>>> +		return rc;
>>>> +
>>>> +	/*
>>>> +	 * Capture the backup of IPB register in the NVT structure and
>>>> +	 * merge it in our KVM VP state.
>>>> +	 */
>>>> +	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
>>>> +
>>>> +	pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
>>>> +		 __func__,
>>>> +		 vcpu->arch.xive_saved_state.nsr,
>>>> +		 vcpu->arch.xive_saved_state.cppr,
>>>> +		 vcpu->arch.xive_saved_state.ipb,
>>>> +		 vcpu->arch.xive_saved_state.pipr,
>>>> +		 vcpu->arch.xive_saved_state.w01,
>>>> +		 (u32) vcpu->arch.xive_cam_word, opal_state);
>>>
>>> Hrm.. except you don't seem to be using the last half of the timaval
>>> field anyway.
>>
>> Yes. The two u64 are extras. We can do without. 
>>
>> Would that be ok if I stored the w01 regs in the first u64, the CAM line(s) 
>> in the second and remove the extra two u64 ?
> 
> I'd still prefer them in separate regs.  They kind of belong to
> different categories of information, and I can't think of any
> particular reason you'd have to update or fetch them as a unit.

Because they belong to the same thread interrupt context and the same
ring (OS), even if only the hypervisor can set the OS CAM line. The OS
can only set the CPPR. QEMU operates at the hypervisor level, so it is
not violating any privilege level.

>>  
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
>>>> +{
>>>> +	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>>>> +	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
>>>> +
>>>> +	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
>>>> +		 val->xive_timaval[0], val->xive_timaval[1]);
>>>> +
>>>> +	if (!kvmppc_xive_enabled(vcpu))
>>>> +		return -EPERM;
>>>> +
>>>> +	if (!xc || !xive)
>>>> +		return -ENOENT;
>>>> +
>>>> +	/* We can't update the state of a "pushed" VCPU	 */
>>>> +	if (WARN_ON(vcpu->arch.xive_pushed))
>>>
>>> What prevents userspace from tripping this WARN_ON()?
>>
>> if the vCPU is executing a vCPU ioctl, it means that it exited the guest 
>> and that its interrupt context has been pulled out of XIVE.
> 
> But couldn't one user thread call the vcpu ioctl() while another is
> inside the guest? 

Not while setting the VP state. The guest is not resumed.

Thanks,

C.
 
> 
>>>> +		return -EIO;
>>>
>>> EBUSY might be more appropriate here.
>>
>> OK.
>>
>> Thanks,
>>
>> C. 
>>
>>>
>>>> +
>>>> +	/*
>>>> +	 * Restore the thread context registers. IPB and CPPR should
>>>> +	 * be the only ones that matter.
>>>> +	 */
>>>> +	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
>>>> +
>>>> +	/*
>>>> +	 * There is no need to restore the XIVE internal state (IPB
>>>> +	 * stored in the NVT) as the IPB register was merged in KVM VP
>>>> +	 * state when captured.
>>>> +	 */
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  static int xive_native_debug_show(struct seq_file *m, void *private)
>>>>  {
>>>>  	struct kvmppc_xive *xive = m->private;
>>>> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
>>>> index a26be635cff9..1b8957c50c53 100644
>>>> --- a/Documentation/virtual/kvm/devices/xive.txt
>>>> +++ b/Documentation/virtual/kvm/devices/xive.txt
>>>> @@ -102,6 +102,25 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>>>>      -EINVAL: Not initialized source number, invalid priority or
>>>>               invalid CPU number.
>>>>  
>>>> +* VCPU state
>>>> +
>>>> +  The XIVE IC maintains VP interrupt state in an internal structure
>>>> +  called the NVT. When a VP is not dispatched on a HW processor
>>>> +  thread, this structure can be updated by HW if the VP is the target
>>>> +  of an event notification.
>>>> +
>>>> +  It is important for migration to capture the cached IPB from the NVT
>>>> +  as it synthesizes the priorities of the pending interrupts. We
>>>> +  capture a bit more to report debug information.
>>>> +
>>>> +  KVM_REG_PPC_VP_STATE (4 * 64bits)
>>>> +  bits:     |  63  ....  32  |  31  ....  0  |
>>>> +  values:   |   TIMA word0   |   TIMA word1  |
>>>> +  bits:     | 127       ..........       64  |
>>>> +  values:   |         VP CAM Line            |
>>>> +  bits:     | 255       ..........      128  |
>>>> +  values:   |            unused              |
>>>> +
>>>>  * Migration:
>>>>  
>>>>    Saving the state of a VM using the XIVE native exploitation mode
>>>
>>
>
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 1e61877fe147..664c65051612 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -272,6 +272,7 @@  union kvmppc_one_reg {
 		u64	addr;
 		u64	length;
 	}	vpaval;
+	u64	xive_timaval[4];
 };
 
 struct kvmppc_ops {
@@ -604,6 +605,10 @@  extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
 extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
 extern void kvmppc_xive_native_init_module(void);
 extern void kvmppc_xive_native_exit_module(void);
+extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
+				     union kvmppc_one_reg *val);
+extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
+				     union kvmppc_one_reg *val);
 
 #else
 static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
@@ -636,6 +641,12 @@  static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
 static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
 static inline void kvmppc_xive_native_init_module(void) { }
 static inline void kvmppc_xive_native_exit_module(void) { }
+static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
+					    union kvmppc_one_reg *val)
+{ return 0; }
+static inline int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
+					    union kvmppc_one_reg *val)
+{ return -ENOENT; }
 
 #endif /* CONFIG_KVM_XIVE */
 
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index cd78ad1020fe..42d4ef93ec2d 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -480,6 +480,8 @@  struct kvm_ppc_cpu_char {
 #define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
 #define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
 
+#define KVM_REG_PPC_VP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U256 | 0x8d)
+
 /* Device control API: PPC-specific devices */
 #define KVM_DEV_MPIC_GRP_MISC		1
 #define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 96d43f091255..f85a9211f30c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -641,6 +641,18 @@  int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
 				*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
 			break;
 #endif /* CONFIG_KVM_XICS */
+#ifdef CONFIG_KVM_XIVE
+		case KVM_REG_PPC_VP_STATE:
+			if (!vcpu->arch.xive_vcpu) {
+				r = -ENXIO;
+				break;
+			}
+			if (xive_enabled())
+				r = kvmppc_xive_native_get_vp(vcpu, val);
+			else
+				r = -ENXIO;
+			break;
+#endif /* CONFIG_KVM_XIVE */
 		case KVM_REG_PPC_FSCR:
 			*val = get_reg_val(id, vcpu->arch.fscr);
 			break;
@@ -714,6 +726,18 @@  int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
 				r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
 			break;
 #endif /* CONFIG_KVM_XICS */
+#ifdef CONFIG_KVM_XIVE
+		case KVM_REG_PPC_VP_STATE:
+			if (!vcpu->arch.xive_vcpu) {
+				r = -ENXIO;
+				break;
+			}
+			if (xive_enabled())
+				r = kvmppc_xive_native_set_vp(vcpu, val);
+			else
+				r = -ENXIO;
+			break;
+#endif /* CONFIG_KVM_XIVE */
 		case KVM_REG_PPC_FSCR:
 			vcpu->arch.fscr = set_reg_val(id, *val);
 			break;
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 3debc876d5a0..132bff52d70a 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -845,6 +845,88 @@  static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
 	return ret;
 }
 
+/*
+ * Interrupt Pending Buffer (IPB) offset
+ */
+#define TM_IPB_SHIFT 40
+#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
+
+int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	u64 opal_state;
+	int rc;
+
+	if (!kvmppc_xive_enabled(vcpu))
+		return -EPERM;
+
+	if (!xc)
+		return -ENOENT;
+
+	/* Thread context registers. We only care about IPB and CPPR */
+	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
+
+	/*
+	 * Return the OS CAM line to print out the VP identifier in
+	 * the QEMU monitor. This is not restored.
+	 */
+	val->xive_timaval[1] = vcpu->arch.xive_cam_word;
+
+	/* Get the VP state from OPAL */
+	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
+	if (rc)
+		return rc;
+
+	/*
+	 * Capture the backup of IPB register in the NVT structure and
+	 * merge it in our KVM VP state.
+	 */
+	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
+
+	pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
+		 __func__,
+		 vcpu->arch.xive_saved_state.nsr,
+		 vcpu->arch.xive_saved_state.cppr,
+		 vcpu->arch.xive_saved_state.ipb,
+		 vcpu->arch.xive_saved_state.pipr,
+		 vcpu->arch.xive_saved_state.w01,
+		 (u32) vcpu->arch.xive_cam_word, opal_state);
+
+	return 0;
+}
+
+int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+
+	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
+		 val->xive_timaval[0], val->xive_timaval[1]);
+
+	if (!kvmppc_xive_enabled(vcpu))
+		return -EPERM;
+
+	if (!xc || !xive)
+		return -ENOENT;
+
+	/* We can't update the state of a "pushed" VCPU */
+	if (WARN_ON(vcpu->arch.xive_pushed))
+		return -EIO;
+
+	/*
+	 * Restore the thread context registers. IPB and CPPR should
+	 * be the only ones that matter.
+	 */
+	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
+
+	/*
+	 * There is no need to restore the XIVE internal state (IPB
+	 * stored in the NVT) as the IPB register was merged in KVM VP
+	 * state when captured.
+	 */
+	return 0;
+}
+
 static int xive_native_debug_show(struct seq_file *m, void *private)
 {
 	struct kvmppc_xive *xive = m->private;
diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
index a26be635cff9..1b8957c50c53 100644
--- a/Documentation/virtual/kvm/devices/xive.txt
+++ b/Documentation/virtual/kvm/devices/xive.txt
@@ -102,6 +102,25 @@  the legacy interrupt mode, referred as XICS (POWER7/8).
     -EINVAL: Not initialized source number, invalid priority or
              invalid CPU number.
 
+* VCPU state
+
+  The XIVE IC maintains VP interrupt state in an internal structure
+  called the NVT. When a VP is not dispatched on a HW processor
+  thread, this structure can be updated by HW if the VP is the target
+  of an event notification.
+
+  It is important for migration to capture the cached IPB from the NVT
+  as it synthesizes the priorities of the pending interrupts. The VP CAM
+  line is also captured, for debug purposes only; it is not restored.
+
+  KVM_REG_PPC_VP_STATE (4 * 64bits)
+  bits:     |  63  ....  32  |  31  ....  0  |
+  values:   |   TIMA word0   |   TIMA word1  |
+  bits:     | 127       ..........       64  |
+  values:   |         VP CAM Line            |
+  bits:     | 255       ..........      128  |
+  values:   |            unused              |
+
 * Migration:
 
   Saving the state of a VM using the XIVE native exploitation mode