
[v12,4/8] arm64: kvm: Introduce MTE VM feature

Message ID 20210517123239.8025-5-steven.price@arm.com
State New
Series MTE support for KVM guest

Commit Message

Steven Price May 17, 2021, 12:32 p.m. UTC
Add a new VM feature 'KVM_ARM_CAP_MTE' which enables memory tagging
for a VM. This will expose the feature to the guest and automatically
tag memory pages touched by the VM as PG_mte_tagged (and clear the tag
storage) to ensure that the guest cannot see stale tags, and so that
the tags are correctly saved/restored across swap.

Actually exposing the new capability to user space happens in a later
patch.

Signed-off-by: Steven Price <steven.price@arm.com>
---
 arch/arm64/include/asm/kvm_emulate.h |  3 +++
 arch/arm64/include/asm/kvm_host.h    |  3 +++
 arch/arm64/kvm/hyp/exception.c       |  3 ++-
 arch/arm64/kvm/mmu.c                 | 37 +++++++++++++++++++++++++++-
 arch/arm64/kvm/sys_regs.c            |  3 +++
 include/uapi/linux/kvm.h             |  1 +
 6 files changed, 48 insertions(+), 2 deletions(-)
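
For context, a minimal user-space sketch of how a VMM might probe and enable
this capability. It assumes the later patch exposes KVM_CAP_ARM_MTE as a
VM-scoped capability via KVM_ENABLE_CAP; that is an assumption, not something
this patch wires up:

	/*
	 * Hypothetical VMM snippet: check for KVM_CAP_ARM_MTE and enable it
	 * on the VM fd before creating vCPUs. The VM-scoped KVM_ENABLE_CAP
	 * path is assumed here and is not part of this patch.
	 */
	#include <linux/kvm.h>
	#include <string.h>
	#include <sys/ioctl.h>

	static int enable_mte(int vm_fd)
	{
		struct kvm_enable_cap cap;

		if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_MTE) <= 0)
			return -1;	/* MTE not supported by this kernel/CPU */

		memset(&cap, 0, sizeof(cap));
		cap.cap = KVM_CAP_ARM_MTE;
		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);	/* 0 on success */
	}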

Comments

Marc Zyngier May 17, 2021, 4:45 p.m. UTC | #1
On Mon, 17 May 2021 13:32:35 +0100,
Steven Price <steven.price@arm.com> wrote:
> 
> Add a new VM feature 'KVM_ARM_CAP_MTE' which enables memory tagging
> for a VM. This will expose the feature to the guest and automatically
> tag memory pages touched by the VM as PG_mte_tagged (and clear the tag
> storage) to ensure that the guest cannot see stale tags, and so that
> the tags are correctly saved/restored across swap.
> 
> Actually exposing the new capability to user space happens in a later
> patch.

uber nit in $SUBJECT: "KVM: arm64:" is the preferred prefix (just like
patches 7 and 8).

> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
>  arch/arm64/include/asm/kvm_emulate.h |  3 +++
>  arch/arm64/include/asm/kvm_host.h    |  3 +++
>  arch/arm64/kvm/hyp/exception.c       |  3 ++-
>  arch/arm64/kvm/mmu.c                 | 37 +++++++++++++++++++++++++++-
>  arch/arm64/kvm/sys_regs.c            |  3 +++
>  include/uapi/linux/kvm.h             |  1 +
>  6 files changed, 48 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index f612c090f2e4..6bf776c2399c 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -84,6 +84,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
>  	if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
>  	    vcpu_el1_is_32bit(vcpu))
>  		vcpu->arch.hcr_el2 |= HCR_TID2;
> +
> +	if (kvm_has_mte(vcpu->kvm))
> +		vcpu->arch.hcr_el2 |= HCR_ATA;
>  }
>  
>  static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 7cd7d5c8c4bc..afaa5333f0e4 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -132,6 +132,8 @@ struct kvm_arch {
>  
>  	u8 pfr0_csv2;
>  	u8 pfr0_csv3;
> +	/* Memory Tagging Extension enabled for the guest */
> +	bool mte_enabled;
>  };
>  
>  struct kvm_vcpu_fault_info {
> @@ -769,6 +771,7 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
>  #define kvm_arm_vcpu_sve_finalized(vcpu) \
>  	((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
>  
> +#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
>  #define kvm_vcpu_has_pmu(vcpu)					\
>  	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
>  
> diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
> index 73629094f903..56426565600c 100644
> --- a/arch/arm64/kvm/hyp/exception.c
> +++ b/arch/arm64/kvm/hyp/exception.c
> @@ -112,7 +112,8 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
>  	new |= (old & PSR_C_BIT);
>  	new |= (old & PSR_V_BIT);
>  
> -	// TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
> +	if (kvm_has_mte(vcpu->kvm))
> +		new |= PSR_TCO_BIT;
>  
>  	new |= (old & PSR_DIT_BIT);
>  
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index c5d1f3c87dbd..8660f6a03f51 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -822,6 +822,31 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
>  	return PAGE_SIZE;
>  }
>  
> +static int sanitise_mte_tags(struct kvm *kvm, unsigned long size,
> +			     kvm_pfn_t pfn)

Nit: please order the parameters as address, then size.

> +{
> +	if (kvm_has_mte(kvm)) {
> +		/*
> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
> +		 * the VM will be able to see the page's tags and therefore
> +		 * they must be initialised first. If PG_mte_tagged is set,
> +		 * tags have already been initialised.
> +		 */
> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
> +		struct page *page = pfn_to_online_page(pfn);
> +
> +		if (!page)
> +			return -EFAULT;

Under which circumstances can this happen? We already have done a GUP
on the page, so I really can't see how the page can vanish from under
our feet.

> +
> +		for (i = 0; i < nr_pages; i++, page++) {
> +			if (!test_and_set_bit(PG_mte_tagged, &page->flags))
> +				mte_clear_page_tags(page_address(page));

You seem to be doing this irrespective of the VMA being created with
> PROT_MTE. This is fine from a guest perspective (all its memory should
be MTE capable). However, I can't see any guarantee that the VMM will
actually allocate memslots with PROT_MTE.

Aren't we missing some sanity checks at memslot registration time?

> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>  			  unsigned long fault_status)
> @@ -971,8 +996,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (writable)
>  		prot |= KVM_PGTABLE_PROT_W;
>  
> -	if (fault_status != FSC_PERM && !device)
> +	if (fault_status != FSC_PERM && !device) {
> +		ret = sanitise_mte_tags(kvm, vma_pagesize, pfn);
> +		if (ret)
> +			goto out_unlock;
> +
>  		clean_dcache_guest_page(pfn, vma_pagesize);
> +	}
>  
>  	if (exec_fault) {
>  		prot |= KVM_PGTABLE_PROT_X;
> @@ -1168,12 +1198,17 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>  {
>  	kvm_pfn_t pfn = pte_pfn(range->pte);
> +	int ret;
>  
>  	if (!kvm->arch.mmu.pgt)
>  		return 0;
>  
>  	WARN_ON(range->end - range->start != 1);
>  
> +	ret = sanitise_mte_tags(kvm, PAGE_SIZE, pfn);
> +	if (ret)
> +		return ret;

Notice the change in return type?

> +
>  	/*
>  	 * We've moved a page around, probably through CoW, so let's treat it
>  	 * just like a translation fault and clean the cache to the PoC.
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 76ea2800c33e..24a844cb79ca 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -1047,6 +1047,9 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
>  		break;
>  	case SYS_ID_AA64PFR1_EL1:
>  		val &= ~FEATURE(ID_AA64PFR1_MTE);
> +		if (kvm_has_mte(vcpu->kvm))
> +			val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE),
> +					  ID_AA64PFR1_MTE);

Shouldn't this be consistent with what the HW is capable of
(i.e. FEAT_MTE3 if available), and extracted from the sanitised view
of the feature set?

>  		break;
>  	case SYS_ID_AA64ISAR1_EL1:
>  		if (!vcpu_has_ptrauth(vcpu))
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 3fd9a7e9d90c..8c95ba0fadda 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1082,6 +1082,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_SGX_ATTRIBUTE 196
>  #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
>  #define KVM_CAP_PTP_KVM 198
> +#define KVM_CAP_ARM_MTE 199
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  

Thanks,

	M.
Steven Price May 19, 2021, 10:48 a.m. UTC | #2
On 17/05/2021 17:45, Marc Zyngier wrote:
> On Mon, 17 May 2021 13:32:35 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> Add a new VM feature 'KVM_ARM_CAP_MTE' which enables memory tagging
>> for a VM. This will expose the feature to the guest and automatically
>> tag memory pages touched by the VM as PG_mte_tagged (and clear the tag
>> storage) to ensure that the guest cannot see stale tags, and so that
>> the tags are correctly saved/restored across swap.
>>
>> Actually exposing the new capability to user space happens in a later
>> patch.
> 
> uber nit in $SUBJECT: "KVM: arm64:" is the preferred prefix (just like
> patches 7 and 8).

Good spot - I obviously got carried away with the "arm64:" prefix ;)

>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>>  arch/arm64/include/asm/kvm_emulate.h |  3 +++
>>  arch/arm64/include/asm/kvm_host.h    |  3 +++
>>  arch/arm64/kvm/hyp/exception.c       |  3 ++-
>>  arch/arm64/kvm/mmu.c                 | 37 +++++++++++++++++++++++++++-
>>  arch/arm64/kvm/sys_regs.c            |  3 +++
>>  include/uapi/linux/kvm.h             |  1 +
>>  6 files changed, 48 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index f612c090f2e4..6bf776c2399c 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -84,6 +84,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
>>  	if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
>>  	    vcpu_el1_is_32bit(vcpu))
>>  		vcpu->arch.hcr_el2 |= HCR_TID2;
>> +
>> +	if (kvm_has_mte(vcpu->kvm))
>> +		vcpu->arch.hcr_el2 |= HCR_ATA;
>>  }
>>  
>>  static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 7cd7d5c8c4bc..afaa5333f0e4 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -132,6 +132,8 @@ struct kvm_arch {
>>  
>>  	u8 pfr0_csv2;
>>  	u8 pfr0_csv3;
>> +	/* Memory Tagging Extension enabled for the guest */
>> +	bool mte_enabled;
>>  };
>>  
>>  struct kvm_vcpu_fault_info {
>> @@ -769,6 +771,7 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
>>  #define kvm_arm_vcpu_sve_finalized(vcpu) \
>>  	((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
>>  
>> +#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
>>  #define kvm_vcpu_has_pmu(vcpu)					\
>>  	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
>>  
>> diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
>> index 73629094f903..56426565600c 100644
>> --- a/arch/arm64/kvm/hyp/exception.c
>> +++ b/arch/arm64/kvm/hyp/exception.c
>> @@ -112,7 +112,8 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
>>  	new |= (old & PSR_C_BIT);
>>  	new |= (old & PSR_V_BIT);
>>  
>> -	// TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
>> +	if (kvm_has_mte(vcpu->kvm))
>> +		new |= PSR_TCO_BIT;
>>  
>>  	new |= (old & PSR_DIT_BIT);
>>  
>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>> index c5d1f3c87dbd..8660f6a03f51 100644
>> --- a/arch/arm64/kvm/mmu.c
>> +++ b/arch/arm64/kvm/mmu.c
>> @@ -822,6 +822,31 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
>>  	return PAGE_SIZE;
>>  }
>>  
>> +static int sanitise_mte_tags(struct kvm *kvm, unsigned long size,
>> +			     kvm_pfn_t pfn)
> 
> Nit: please order the parameters as address, then size.

Sure

>> +{
>> +	if (kvm_has_mte(kvm)) {
>> +		/*
>> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
>> +		 * the VM will be able to see the page's tags and therefore
>> +		 * they must be initialised first. If PG_mte_tagged is set,
>> +		 * tags have already been initialised.
>> +		 */
>> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
>> +		struct page *page = pfn_to_online_page(pfn);
>> +
>> +		if (!page)
>> +			return -EFAULT;
> 
> Under which circumstances can this happen? We already have done a GUP
> on the page, so I really can't see how the page can vanish from under
> our feet.

It's less about the page vanishing and more that pfn_to_online_page()
will reject some pages. Specifically in this case we want to reject any
sort of device memory (e.g. graphics card memory or other memory on the
end of a bus like PCIe) as it is unlikely to support MTE.

>> +
>> +		for (i = 0; i < nr_pages; i++, page++) {
>> +			if (!test_and_set_bit(PG_mte_tagged, &page->flags))
>> +				mte_clear_page_tags(page_address(page));
> 
> You seem to be doing this irrespective of the VMA being created with
> PROT_MTE. This is fine from a guest perspective (all its memory should
> be MTE capable). However, I can't see any guarantee that the VMM will
> actually allocate memslots with PROT_MTE.
> 
> Aren't we missing some sanity checks at memslot registration time?

I've taken the decision not to require that the VMM allocates with
PROT_MTE, there are two main reasons for this:

 1. The VMM generally doesn't actually want a PROT_MTE mapping as the
tags from the guest are almost certainly wrong for most usages (e.g.
device emulation). So a PROT_MTE mapping actively gets in the way of the
VMM using MTE for its own purposes. However this then leads to the
requirement for the new ioctl in patch 7.

 2. Because the VMM can change the pages in a memslot at any time and
KVM relies on mmu notifiers to spot the change, it's hard and ugly to
enforce that the memslot VMAs keep having the PROT_MTE flag. When I
tried this it meant we'd discover that a page doesn't have the MTE flag
at fault time and have no other option than to kill the VM at that time.

So the model is that non-PROT_MTE memory can be supplied to the memslots
and KVM will automatically upgrade it to PG_mte_tagged if you supply it
to a VM with MTE enabled. This makes the VMM implementation easier for
most cases, and the new ioctl helps for migration. I think the kernel
code is tidier too.

Of course even better would be a stage 2 flag to control MTE
availability on a page-by-page basis, but that doesn't exist in the
architecture at the moment.

>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>>  			  unsigned long fault_status)
>> @@ -971,8 +996,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	if (writable)
>>  		prot |= KVM_PGTABLE_PROT_W;
>>  
>> -	if (fault_status != FSC_PERM && !device)
>> +	if (fault_status != FSC_PERM && !device) {
>> +		ret = sanitise_mte_tags(kvm, vma_pagesize, pfn);
>> +		if (ret)
>> +			goto out_unlock;
>> +
>>  		clean_dcache_guest_page(pfn, vma_pagesize);
>> +	}
>>  
>>  	if (exec_fault) {
>>  		prot |= KVM_PGTABLE_PROT_X;
>> @@ -1168,12 +1198,17 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>>  {
>>  	kvm_pfn_t pfn = pte_pfn(range->pte);
>> +	int ret;
>>  
>>  	if (!kvm->arch.mmu.pgt)
>>  		return 0;
>>  
>>  	WARN_ON(range->end - range->start != 1);
>>  
>> +	ret = sanitise_mte_tags(kvm, PAGE_SIZE, pfn);
>> +	if (ret)
>> +		return ret;
> 
> Notice the change in return type?

I do now - I was tricked by the use of '0' as false. Looks like false
('0') is actually the correct return here to avoid an unnecessary
kvm_flush_remote_tlbs().

>> +
>>  	/*
>>  	 * We've moved a page around, probably through CoW, so let's treat it
>>  	 * just like a translation fault and clean the cache to the PoC.
>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> index 76ea2800c33e..24a844cb79ca 100644
>> --- a/arch/arm64/kvm/sys_regs.c
>> +++ b/arch/arm64/kvm/sys_regs.c
>> @@ -1047,6 +1047,9 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
>>  		break;
>>  	case SYS_ID_AA64PFR1_EL1:
>>  		val &= ~FEATURE(ID_AA64PFR1_MTE);
>> +		if (kvm_has_mte(vcpu->kvm))
>> +			val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE),
>> +					  ID_AA64PFR1_MTE);
> 
> Shouldn't this be consistent with what the HW is capable of
> (i.e. FEAT_MTE3 if available), and extracted from the sanitised view
> of the feature set?

Yes - however at the moment our sanitised view is either FEAT_MTE2 or
nothing:

	{
		.desc = "Memory Tagging Extension",
		.capability = ARM64_MTE,
		.type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
		.matches = has_cpuid_feature,
		.sys_reg = SYS_ID_AA64PFR1_EL1,
		.field_pos = ID_AA64PFR1_MTE_SHIFT,
		.min_field_value = ID_AA64PFR1_MTE,
		.sign = FTR_UNSIGNED,
		.cpu_enable = cpu_enable_mte,
	},

When host support for FEAT_MTE3 is added then the KVM code will need
revisiting to expose that down to the guest safely (AFAICS there's
nothing extra to do here, but I haven't tested any of the MTE3
features). I don't think we want to expose newer versions to the guest
than the host is aware of. (Or indeed expose FEAT_MTE if the host has
MTE disabled because Linux requires at least FEAT_MTE2).

Thanks,

Steve

>>  		break;
>>  	case SYS_ID_AA64ISAR1_EL1:
>>  		if (!vcpu_has_ptrauth(vcpu))
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 3fd9a7e9d90c..8c95ba0fadda 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -1082,6 +1082,7 @@ struct kvm_ppc_resize_hpt {
>>  #define KVM_CAP_SGX_ATTRIBUTE 196
>>  #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
>>  #define KVM_CAP_PTP_KVM 198
>> +#define KVM_CAP_ARM_MTE 199
>>  
>>  #ifdef KVM_CAP_IRQ_ROUTING
>>  
> 
> Thanks,
> 
> 	M.
>
Marc Zyngier May 20, 2021, 8:51 a.m. UTC | #3
On Wed, 19 May 2021 11:48:21 +0100,
Steven Price <steven.price@arm.com> wrote:
> 
> On 17/05/2021 17:45, Marc Zyngier wrote:
> > On Mon, 17 May 2021 13:32:35 +0100,
> > Steven Price <steven.price@arm.com> wrote:
> >>
> >> Add a new VM feature 'KVM_ARM_CAP_MTE' which enables memory tagging
> >> for a VM. This will expose the feature to the guest and automatically
> >> tag memory pages touched by the VM as PG_mte_tagged (and clear the tag
> >> storage) to ensure that the guest cannot see stale tags, and so that
> >> the tags are correctly saved/restored across swap.
> >>
> >> Actually exposing the new capability to user space happens in a later
> >> patch.
> > 
> > uber nit in $SUBJECT: "KVM: arm64:" is the preferred prefix (just like
> > patches 7 and 8).
> 
> Good spot - I obviously got carried away with the "arm64:" prefix ;)
> 
> >>
> >> Signed-off-by: Steven Price <steven.price@arm.com>
> >> ---
> >>  arch/arm64/include/asm/kvm_emulate.h |  3 +++
> >>  arch/arm64/include/asm/kvm_host.h    |  3 +++
> >>  arch/arm64/kvm/hyp/exception.c       |  3 ++-
> >>  arch/arm64/kvm/mmu.c                 | 37 +++++++++++++++++++++++++++-
> >>  arch/arm64/kvm/sys_regs.c            |  3 +++
> >>  include/uapi/linux/kvm.h             |  1 +
> >>  6 files changed, 48 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> >> index f612c090f2e4..6bf776c2399c 100644
> >> --- a/arch/arm64/include/asm/kvm_emulate.h
> >> +++ b/arch/arm64/include/asm/kvm_emulate.h
> >> @@ -84,6 +84,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
> >>  	if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
> >>  	    vcpu_el1_is_32bit(vcpu))
> >>  		vcpu->arch.hcr_el2 |= HCR_TID2;
> >> +
> >> +	if (kvm_has_mte(vcpu->kvm))
> >> +		vcpu->arch.hcr_el2 |= HCR_ATA;
> >>  }
> >>  
> >>  static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
> >> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> >> index 7cd7d5c8c4bc..afaa5333f0e4 100644
> >> --- a/arch/arm64/include/asm/kvm_host.h
> >> +++ b/arch/arm64/include/asm/kvm_host.h
> >> @@ -132,6 +132,8 @@ struct kvm_arch {
> >>  
> >>  	u8 pfr0_csv2;
> >>  	u8 pfr0_csv3;
> >> +	/* Memory Tagging Extension enabled for the guest */
> >> +	bool mte_enabled;
> >>  };
> >>  
> >>  struct kvm_vcpu_fault_info {
> >> @@ -769,6 +771,7 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
> >>  #define kvm_arm_vcpu_sve_finalized(vcpu) \
> >>  	((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
> >>  
> >> +#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
> >>  #define kvm_vcpu_has_pmu(vcpu)					\
> >>  	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
> >>  
> >> diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
> >> index 73629094f903..56426565600c 100644
> >> --- a/arch/arm64/kvm/hyp/exception.c
> >> +++ b/arch/arm64/kvm/hyp/exception.c
> >> @@ -112,7 +112,8 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
> >>  	new |= (old & PSR_C_BIT);
> >>  	new |= (old & PSR_V_BIT);
> >>  
> >> -	// TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
> >> +	if (kvm_has_mte(vcpu->kvm))
> >> +		new |= PSR_TCO_BIT;
> >>  
> >>  	new |= (old & PSR_DIT_BIT);
> >>  
> >> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> >> index c5d1f3c87dbd..8660f6a03f51 100644
> >> --- a/arch/arm64/kvm/mmu.c
> >> +++ b/arch/arm64/kvm/mmu.c
> >> @@ -822,6 +822,31 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
> >>  	return PAGE_SIZE;
> >>  }
> >>  
> >> +static int sanitise_mte_tags(struct kvm *kvm, unsigned long size,
> >> +			     kvm_pfn_t pfn)
> > 
> > Nit: please order the parameters as address, then size.
> 
> Sure
> 
> >> +{
> >> +	if (kvm_has_mte(kvm)) {
> >> +		/*
> >> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
> >> +		 * the VM will be able to see the page's tags and therefore
> >> +		 * they must be initialised first. If PG_mte_tagged is set,
> >> +		 * tags have already been initialised.
> >> +		 */
> >> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
> >> +		struct page *page = pfn_to_online_page(pfn);
> >> +
> >> +		if (!page)
> >> +			return -EFAULT;
> > 
> > Under which circumstances can this happen? We already have done a GUP
> > on the page, so I really can't see how the page can vanish from under
> > our feet.
> 
> It's less about the page vanishing and more that pfn_to_online_page()
> will reject some pages. Specifically in this case we want to reject any
> sort of device memory (e.g. graphics card memory or other memory on the
> end of a bus like PCIe) as it is unlikely to support MTE.

OK. We really never should see this error as we check for device
mappings right before calling this, but I guess it doesn't hurt.

> 
> >> +
> >> +		for (i = 0; i < nr_pages; i++, page++) {
> >> +			if (!test_and_set_bit(PG_mte_tagged, &page->flags))
> >> +				mte_clear_page_tags(page_address(page));
> > 
> > You seem to be doing this irrespective of the VMA being created with
> > PROT_MTE. This is fine from a guest perspective (all its memory should
> > be MTE capable). However, I can't see any guarantee that the VMM will
> > actually allocate memslots with PROT_MTE.
> > 
> > Aren't we missing some sanity checks at memslot registration time?
> 
> I've taken the decision not to require that the VMM allocates with
> PROT_MTE, there are two main reasons for this:
> 
>  1. The VMM generally doesn't actually want a PROT_MTE mapping as the
> tags from the guest are almost certainly wrong for most usages (e.g.
> device emulation). So a PROT_MTE mapping actively gets in the way of the
> VMM using MTE for its own purposes. However this then leads to the
> requirement for the new ioctl in patch 7.
> 
>  2. Because the VMM can change the pages in a memslot at any time and
> KVM relies on mmu notifiers to spot the change, it's hard and ugly to
> enforce that the memslot VMAs keep having the PROT_MTE flag. When I
> tried this it meant we'd discover that a page doesn't have the MTE flag
> at fault time and have no other option than to kill the VM at that time.
> 
> So the model is that non-PROT_MTE memory can be supplied to the memslots
> and KVM will automatically upgrade it to PG_mte_tagged if you supply it
> to a VM with MTE enabled. This makes the VMM implementation easier for
> most cases, and the new ioctl helps for migration. I think the kernel
> code is tidier too.

OK, I see your point. I guess we rely on the implicit requirement that
all the available memory is MTE-capable, although I'm willing to bet
that someone will eventually break this requirement.

> Of course even better would be a stage 2 flag to control MTE
> availability on a page-by-page basis, but that doesn't exist in the
> architecture at the moment.

Nah, that'd be way too good. Let's not do that.

> 
> >> +		}
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  			  struct kvm_memory_slot *memslot, unsigned long hva,
> >>  			  unsigned long fault_status)
> >> @@ -971,8 +996,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  	if (writable)
> >>  		prot |= KVM_PGTABLE_PROT_W;
> >>  
> >> -	if (fault_status != FSC_PERM && !device)
> >> +	if (fault_status != FSC_PERM && !device) {
> >> +		ret = sanitise_mte_tags(kvm, vma_pagesize, pfn);
> >> +		if (ret)
> >> +			goto out_unlock;
> >> +
> >>  		clean_dcache_guest_page(pfn, vma_pagesize);
> >> +	}
> >>  
> >>  	if (exec_fault) {
> >>  		prot |= KVM_PGTABLE_PROT_X;
> >> @@ -1168,12 +1198,17 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> >>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> >>  {
> >>  	kvm_pfn_t pfn = pte_pfn(range->pte);
> >> +	int ret;
> >>  
> >>  	if (!kvm->arch.mmu.pgt)
> >>  		return 0;
> >>  
> >>  	WARN_ON(range->end - range->start != 1);
> >>  
> >> +	ret = sanitise_mte_tags(kvm, PAGE_SIZE, pfn);
> >> +	if (ret)
> >> +		return ret;
> > 
> > Notice the change in return type?
> 
> I do now - I was tricked by the use of '0' as false. Looks like false
> ('0') is actually the correct return here to avoid an unnecessary
> kvm_flush_remote_tlbs().

Yup. BTW, the return values have been fixed to proper boolean types in
the latest set of fixes.

> 
> >> +
> >>  	/*
> >>  	 * We've moved a page around, probably through CoW, so let's treat it
> >>  	 * just like a translation fault and clean the cache to the PoC.
> >> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> >> index 76ea2800c33e..24a844cb79ca 100644
> >> --- a/arch/arm64/kvm/sys_regs.c
> >> +++ b/arch/arm64/kvm/sys_regs.c
> >> @@ -1047,6 +1047,9 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
> >>  		break;
> >>  	case SYS_ID_AA64PFR1_EL1:
> >>  		val &= ~FEATURE(ID_AA64PFR1_MTE);
> >> +		if (kvm_has_mte(vcpu->kvm))
> >> +			val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE),
> >> +					  ID_AA64PFR1_MTE);
> > 
> > Shouldn't this be consistent with what the HW is capable of
> > (i.e. FEAT_MTE3 if available), and extracted from the sanitised view
> > of the feature set?
> 
> Yes - however at the moment our sanitised view is either FEAT_MTE2 or
> nothing:
> 
> 	{
> 		.desc = "Memory Tagging Extension",
> 		.capability = ARM64_MTE,
> 		.type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
> 		.matches = has_cpuid_feature,
> 		.sys_reg = SYS_ID_AA64PFR1_EL1,
> 		.field_pos = ID_AA64PFR1_MTE_SHIFT,
> 		.min_field_value = ID_AA64PFR1_MTE,
> 		.sign = FTR_UNSIGNED,
> 		.cpu_enable = cpu_enable_mte,
> 	},
> 
> When host support for FEAT_MTE3 is added then the KVM code will need
> revisiting to expose that down to the guest safely (AFAICS there's
> nothing extra to do here, but I haven't tested any of the MTE3
> features). I don't think we want to expose newer versions to the guest
> than the host is aware of. (Or indeed expose FEAT_MTE if the host has
> MTE disabled because Linux requires at least FEAT_MTE2).

What I was suggesting is to have something like this:

     pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
     mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT);
     val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte);

which does the trick nicely, and doesn't expose more than the host
supports.

Thanks,

	M.
Catalin Marinas May 20, 2021, 11:54 a.m. UTC | #4
On Mon, May 17, 2021 at 01:32:35PM +0100, Steven Price wrote:
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index c5d1f3c87dbd..8660f6a03f51 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -822,6 +822,31 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
>  	return PAGE_SIZE;
>  }
>  
> +static int sanitise_mte_tags(struct kvm *kvm, unsigned long size,
> +			     kvm_pfn_t pfn)
> +{
> +	if (kvm_has_mte(kvm)) {
> +		/*
> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
> +		 * the VM will be able to see the page's tags and therefore
> +		 * they must be initialised first. If PG_mte_tagged is set,
> +		 * tags have already been initialised.
> +		 */
> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
> +		struct page *page = pfn_to_online_page(pfn);
> +
> +		if (!page)
> +			return -EFAULT;

IIRC we ended up with pfn_to_online_page() to reject ZONE_DEVICE pages
that may be mapped into a guest and we have no idea whether they support
MTE. It may be worth adding a comment, otherwise, as Marc said, the page
wouldn't disappear.

> +
> +		for (i = 0; i < nr_pages; i++, page++) {
> +			if (!test_and_set_bit(PG_mte_tagged, &page->flags))
> +				mte_clear_page_tags(page_address(page));

We started the page->flags thread and ended up fixing it for the host
set_pte_at() as per the first patch:

https://lore.kernel.org/r/c3293d47-a5f2-ea4a-6730-f5cae26d8a7e@arm.com

Now, can we have a race between the stage 2 kvm_set_spte_gfn() and a
stage 1 set_pte_at()? Only the latter takes a lock. Or between two
kvm_set_spte_gfn() in different VMs? I think in the above thread we
concluded that there's only a problem if the page is shared between
multiple VMMs (MAP_SHARED). How realistic is this and what's the
workaround?

Either way, I think it's worth adding a comment here on the race on
page->flags as it looks strange that here it's just a test_and_set_bit()
while set_pte_at() uses a spinlock.
Steven Price May 20, 2021, 2:46 p.m. UTC | #5
On 20/05/2021 09:51, Marc Zyngier wrote:
> On Wed, 19 May 2021 11:48:21 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> On 17/05/2021 17:45, Marc Zyngier wrote:
>>> On Mon, 17 May 2021 13:32:35 +0100,
>>> Steven Price <steven.price@arm.com> wrote:
[...]
>>>> +		}
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>>>>  			  unsigned long fault_status)
>>>> @@ -971,8 +996,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	if (writable)
>>>>  		prot |= KVM_PGTABLE_PROT_W;
>>>>  
>>>> -	if (fault_status != FSC_PERM && !device)
>>>> +	if (fault_status != FSC_PERM && !device) {
>>>> +		ret = sanitise_mte_tags(kvm, vma_pagesize, pfn);
>>>> +		if (ret)
>>>> +			goto out_unlock;
>>>> +
>>>>  		clean_dcache_guest_page(pfn, vma_pagesize);
>>>> +	}
>>>>  
>>>>  	if (exec_fault) {
>>>>  		prot |= KVM_PGTABLE_PROT_X;
>>>> @@ -1168,12 +1198,17 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>>>>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>>>>  {
>>>>  	kvm_pfn_t pfn = pte_pfn(range->pte);
>>>> +	int ret;
>>>>  
>>>>  	if (!kvm->arch.mmu.pgt)
>>>>  		return 0;
>>>>  
>>>>  	WARN_ON(range->end - range->start != 1);
>>>>  
>>>> +	ret = sanitise_mte_tags(kvm, PAGE_SIZE, pfn);
>>>> +	if (ret)
>>>> +		return ret;
>>>
>>> Notice the change in return type?
>>
>> I do now - I was tricked by the use of '0' as false. Looks like false
>> ('0') is actually the correct return here to avoid an unnecessary
>> kvm_flush_remote_tlbs().
> 
> Yup. BTW, the return values have been fixed to proper boolean types in
> the latest set of fixes.

Thanks for the heads up - I'll return 'false' to avoid regressing that.
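
For illustration only, the kvm_set_spte_gfn() hunk might then read roughly as
below once the boolean conversion has landed; this is a sketch against an
assumed respin, not the actual patch:

	bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
	{
		kvm_pfn_t pfn = pte_pfn(range->pte);

		if (!kvm->arch.mmu.pgt)
			return false;

		WARN_ON(range->end - range->start != 1);

		/*
		 * Returning false skips the TLB flush in the caller: nothing
		 * was mapped at stage 2 if the tags could not be sanitised.
		 */
		if (sanitise_mte_tags(kvm, PAGE_SIZE, pfn))
			return false;

		/* ... remainder as before: clean the cache and map the page ... */
	}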

>>
>>>> +
>>>>  	/*
>>>>  	 * We've moved a page around, probably through CoW, so let's treat it
>>>>  	 * just like a translation fault and clean the cache to the PoC.
>>>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>>>> index 76ea2800c33e..24a844cb79ca 100644
>>>> --- a/arch/arm64/kvm/sys_regs.c
>>>> +++ b/arch/arm64/kvm/sys_regs.c
>>>> @@ -1047,6 +1047,9 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
>>>>  		break;
>>>>  	case SYS_ID_AA64PFR1_EL1:
>>>>  		val &= ~FEATURE(ID_AA64PFR1_MTE);
>>>> +		if (kvm_has_mte(vcpu->kvm))
>>>> +			val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE),
>>>> +					  ID_AA64PFR1_MTE);
>>>
>>> Shouldn't this be consistent with what the HW is capable of
>>> (i.e. FEAT_MTE3 if available), and extracted from the sanitised view
>>> of the feature set?
>>
>> Yes - however at the moment our sanitised view is either FEAT_MTE2 or
>> nothing:
>>
>> 	{
>> 		.desc = "Memory Tagging Extension",
>> 		.capability = ARM64_MTE,
>> 		.type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
>> 		.matches = has_cpuid_feature,
>> 		.sys_reg = SYS_ID_AA64PFR1_EL1,
>> 		.field_pos = ID_AA64PFR1_MTE_SHIFT,
>> 		.min_field_value = ID_AA64PFR1_MTE,
>> 		.sign = FTR_UNSIGNED,
>> 		.cpu_enable = cpu_enable_mte,
>> 	},
>>
>> When host support for FEAT_MTE3 is added then the KVM code will need
>> revisiting to expose that down to the guest safely (AFAICS there's
>> nothing extra to do here, but I haven't tested any of the MTE3
>> features). I don't think we want to expose newer versions to the guest
>> than the host is aware of. (Or indeed expose FEAT_MTE if the host has
>> MTE disabled because Linux requires at least FEAT_MTE2).
> 
> What I was suggesting is to have something like this:
> 
>      pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
>      mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT);
>      val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte);
> 
> which does the trick nicely, and doesn't expose more than the host
> supports.

Ok, I have to admit to not fully understanding the sanitised register
code - but wouldn't this expose higher MTE values if all CPUs support
it, even though the host doesn't know what a hypothetical 'MTE4' adds?
Or is there some magic in the sanitising that caps the value to what the
host knows about?

Thanks,

Steve
Steven Price May 20, 2021, 3:05 p.m. UTC | #6
On 20/05/2021 12:54, Catalin Marinas wrote:
> On Mon, May 17, 2021 at 01:32:35PM +0100, Steven Price wrote:
>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>> index c5d1f3c87dbd..8660f6a03f51 100644
>> --- a/arch/arm64/kvm/mmu.c
>> +++ b/arch/arm64/kvm/mmu.c
>> @@ -822,6 +822,31 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
>>  	return PAGE_SIZE;
>>  }
>>  
>> +static int sanitise_mte_tags(struct kvm *kvm, unsigned long size,
>> +			     kvm_pfn_t pfn)
>> +{
>> +	if (kvm_has_mte(kvm)) {
>> +		/*
>> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
>> +		 * the VM will be able to see the page's tags and therefore
>> +		 * they must be initialised first. If PG_mte_tagged is set,
>> +		 * tags have already been initialised.
>> +		 */
>> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
>> +		struct page *page = pfn_to_online_page(pfn);
>> +
>> +		if (!page)
>> +			return -EFAULT;
> 
> IIRC we ended up with pfn_to_online_page() to reject ZONE_DEVICE pages
> that may be mapped into a guest and we have no idea whether they support
> MTE. It may be worth adding a comment, otherwise, as Marc said, the page
> wouldn't disappear.

I'll add a comment.

>> +
>> +		for (i = 0; i < nr_pages; i++, page++) {
>> +			if (!test_and_set_bit(PG_mte_tagged, &page->flags))
>> +				mte_clear_page_tags(page_address(page));
> 
> We started the page->flags thread and ended up fixing it for the host
> set_pte_at() as per the first patch:
> 
> https://lore.kernel.org/r/c3293d47-a5f2-ea4a-6730-f5cae26d8a7e@arm.com
> 
> Now, can we have a race between the stage 2 kvm_set_spte_gfn() and a
> stage 1 set_pte_at()? Only the latter takes a lock. Or between two
> kvm_set_spte_gfn() in different VMs? I think in the above thread we
> concluded that there's only a problem if the page is shared between
> multiple VMMs (MAP_SHARED). How realistic is this and what's the
> workaround?
> 
> Either way, I think it's worth adding a comment here on the race on
> page->flags as it looks strange that here it's just a test_and_set_bit()
> while set_pte_at() uses a spinlock.
> 

Very good point! I should have thought about that. I think splitting the
test_and_set_bit() in two (as with the cache flush) is sufficient. While
there technically still is a race which could lead to user space tags
being clobbered:

a) It's very odd for a VMM to be doing an mprotect() after the fact to
add PROT_MTE, or to be sharing the memory with another process which
sets PROT_MTE.

b) The window for the race is incredibly small and the VMM (generally)
needs to be robust against the guest changing tags anyway.

But I'll add a comment here as well:

	/*
	 * There is a potential race between sanitising the
	 * flags here and user space using mprotect() to add
	 * PROT_MTE to access the tags, however by splitting
	 * the test/set the only risk is user space tags
	 * being overwritten by the mte_clear_page_tags() call.
	 */
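
For reference, the split described above might look roughly like this (a
sketch only; the exact form is left to the next revision):

	for (i = 0; i < nr_pages; i++, page++) {
		/*
		 * Test and set are deliberately split: if user space races
		 * in with mprotect(PROT_MTE), the worst case is that its
		 * tags are overwritten by the clear below.
		 */
		if (!test_bit(PG_mte_tagged, &page->flags)) {
			mte_clear_page_tags(page_address(page));
			set_bit(PG_mte_tagged, &page->flags);
		}
	}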

Thanks,

Steve
Catalin Marinas May 20, 2021, 5:50 p.m. UTC | #7
On Thu, May 20, 2021 at 04:05:46PM +0100, Steven Price wrote:
> On 20/05/2021 12:54, Catalin Marinas wrote:
> > On Mon, May 17, 2021 at 01:32:35PM +0100, Steven Price wrote:
> >> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> >> index c5d1f3c87dbd..8660f6a03f51 100644
> >> --- a/arch/arm64/kvm/mmu.c
> >> +++ b/arch/arm64/kvm/mmu.c
> >> @@ -822,6 +822,31 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
> >>  	return PAGE_SIZE;
> >>  }
> >>  
> >> +static int sanitise_mte_tags(struct kvm *kvm, unsigned long size,
> >> +			     kvm_pfn_t pfn)
> >> +{
> >> +	if (kvm_has_mte(kvm)) {
> >> +		/*
> >> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
> >> +		 * the VM will be able to see the page's tags and therefore
> >> +		 * they must be initialised first. If PG_mte_tagged is set,
> >> +		 * tags have already been initialised.
> >> +		 */
> >> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
> >> +		struct page *page = pfn_to_online_page(pfn);
> >> +
> >> +		if (!page)
> >> +			return -EFAULT;
> > 
> > IIRC we ended up with pfn_to_online_page() to reject ZONE_DEVICE pages
> > that may be mapped into a guest and we have no idea whether they support
> > MTE. It may be worth adding a comment, otherwise, as Marc said, the page
> > wouldn't disappear.
> 
> I'll add a comment.
> 
> >> +
> >> +		for (i = 0; i < nr_pages; i++, page++) {
> >> +			if (!test_and_set_bit(PG_mte_tagged, &page->flags))
> >> +				mte_clear_page_tags(page_address(page));
> > 
> > We started the page->flags thread and ended up fixing it for the host
> > set_pte_at() as per the first patch:
> > 
> > https://lore.kernel.org/r/c3293d47-a5f2-ea4a-6730-f5cae26d8a7e@arm.com
> > 
> > Now, can we have a race between the stage 2 kvm_set_spte_gfn() and a
> > stage 1 set_pte_at()? Only the latter takes a lock. Or between two
> > kvm_set_spte_gfn() in different VMs? I think in the above thread we
> > concluded that there's only a problem if the page is shared between
> > multiple VMMs (MAP_SHARED). How realistic is this and what's the
> > workaround?
> > 
> > Either way, I think it's worth adding a comment here on the race on
> > page->flags as it looks strange that here it's just a test_and_set_bit()
> > while set_pte_at() uses a spinlock.
> > 
> 
> Very good point! I should have thought about that. I think splitting the
> test_and_set_bit() in two (as with the cache flush) is sufficient. While
> there technically still is a race which could lead to user space tags
> being clobbered:
> 
> a) It's very odd for a VMM to be doing an mprotect() after the fact to
> add PROT_MTE, or to be sharing the memory with another process which
> sets PROT_MTE.
> 
> b) The window for the race is incredibly small and the VMM (generally)
> needs to be robust against the guest changing tags anyway.
> 
> But I'll add a comment here as well:
> 
> 	/*
> 	 * There is a potential race between sanitising the
> 	 * flags here and user space using mprotect() to add
> 	 * PROT_MTE to access the tags, however by splitting
> 	 * the test/set the only risk is user space tags
> 	 * being overwritten by the mte_clear_page_tags() call.
> 	 */

I think (well, I haven't re-checked), an mprotect() in the VMM ends up
calling set_pte_at_notify() which would call kvm_set_spte_gfn() and that
will map the page in the guest. So the problem only appears between
different VMMs sharing the same page. In principle they can be
MAP_PRIVATE but they'd be CoW so the race wouldn't matter. So it's left
with MAP_SHARED between multiple VMMs.

I think we should just state that this is unsafe and they can delete
each-others tags. If we are really worried, we can export that lock you
added in mte.c.
Steven Price May 21, 2021, 9:28 a.m. UTC | #8
On 20/05/2021 18:50, Catalin Marinas wrote:
> On Thu, May 20, 2021 at 04:05:46PM +0100, Steven Price wrote:
>> On 20/05/2021 12:54, Catalin Marinas wrote:
>>> On Mon, May 17, 2021 at 01:32:35PM +0100, Steven Price wrote:
>>>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>>>> index c5d1f3c87dbd..8660f6a03f51 100644
>>>> --- a/arch/arm64/kvm/mmu.c
>>>> +++ b/arch/arm64/kvm/mmu.c
>>>> @@ -822,6 +822,31 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
>>>>  	return PAGE_SIZE;
>>>>  }
>>>>  
>>>> +static int sanitise_mte_tags(struct kvm *kvm, unsigned long size,
>>>> +			     kvm_pfn_t pfn)
>>>> +{
>>>> +	if (kvm_has_mte(kvm)) {
>>>> +		/*
>>>> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
>>>> +		 * the VM will be able to see the page's tags and therefore
>>>> +		 * they must be initialised first. If PG_mte_tagged is set,
>>>> +		 * tags have already been initialised.
>>>> +		 */
>>>> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
>>>> +		struct page *page = pfn_to_online_page(pfn);
>>>> +
>>>> +		if (!page)
>>>> +			return -EFAULT;
>>>
>>> IIRC we ended up with pfn_to_online_page() to reject ZONE_DEVICE pages
>>> that may be mapped into a guest and we have no idea whether they support
>>> MTE. It may be worth adding a comment, otherwise, as Marc said, the page
>>> wouldn't disappear.
>>
>> I'll add a comment.
>>
>>>> +
>>>> +		for (i = 0; i < nr_pages; i++, page++) {
>>>> +			if (!test_and_set_bit(PG_mte_tagged, &page->flags))
>>>> +				mte_clear_page_tags(page_address(page));
>>>
>>> We started the page->flags thread and ended up fixing it for the host
>>> set_pte_at() as per the first patch:
>>>
>>> https://lore.kernel.org/r/c3293d47-a5f2-ea4a-6730-f5cae26d8a7e@arm.com
>>>
>>> Now, can we have a race between the stage 2 kvm_set_spte_gfn() and a
>>> stage 1 set_pte_at()? Only the latter takes a lock. Or between two
>>> kvm_set_spte_gfn() in different VMs? I think in the above thread we
>>> concluded that there's only a problem if the page is shared between
>>> multiple VMMs (MAP_SHARED). How realistic is this and what's the
>>> workaround?
>>>
>>> Either way, I think it's worth adding a comment here on the race on
>>> page->flags as it looks strange that here it's just a test_and_set_bit()
>>> while set_pte_at() uses a spinlock.
>>>
>>
>> Very good point! I should have thought about that. I think splitting the
>> test_and_set_bit() in two (as with the cache flush) is sufficient. While
>> there technically still is a race which could lead to user space tags
>> being clobbered:
>>
>> a) It's very odd for a VMM to be doing an mprotect() after the fact to
>> add PROT_MTE, or to be sharing the memory with another process which
>> sets PROT_MTE.
>>
>> b) The window for the race is incredibly small and the VMM (generally)
>> needs to be robust against the guest changing tags anyway.
>>
>> But I'll add a comment here as well:
>>
>> 	/*
>> 	 * There is a potential race between sanitising the
>> 	 * flags here and user space using mprotect() to add
>> 	 * PROT_MTE to access the tags, however by splitting
>> 	 * the test/set the only risk is user space tags
>> 	 * being overwritten by the mte_clear_page_tags() call.
>> 	 */
> 
> I think (well, I haven't re-checked), an mprotect() in the VMM ends up
> calling set_pte_at_notify() which would call kvm_set_spte_gfn() and that
> will map the page in the guest. So the problem only appears between
> different VMMs sharing the same page. In principle they can be
> MAP_PRIVATE but they'd be CoW so the race wouldn't matter. So it's left
> with MAP_SHARED between multiple VMMs.

mprotect.c only has a call to set_pte_at(), not set_pte_at_notify(). And
AFAICT the MMU notifiers are called to invalidate only in
change_pmd_range(). So the stage 2 mappings would be invalidated rather
than populated. However I believe this should cause synchronisation
because of the KVM mmu_lock. So from my reading you are right that an
mprotect() can't race.

MAP_SHARED between multiple VMs is then the only potential problem.

> I think we should just state that this is unsafe and they can delete
> each-others tags. If we are really worried, we can export that lock you
> added in mte.c.
> 

I'll just update the comment for now.

Thanks,

Steve

Patch

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index f612c090f2e4..6bf776c2399c 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -84,6 +84,9 @@  static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 	if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
 	    vcpu_el1_is_32bit(vcpu))
 		vcpu->arch.hcr_el2 |= HCR_TID2;
+
+	if (kvm_has_mte(vcpu->kvm))
+		vcpu->arch.hcr_el2 |= HCR_ATA;
 }
 
 static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7cd7d5c8c4bc..afaa5333f0e4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -132,6 +132,8 @@  struct kvm_arch {
 
 	u8 pfr0_csv2;
 	u8 pfr0_csv3;
+	/* Memory Tagging Extension enabled for the guest */
+	bool mte_enabled;
 };
 
 struct kvm_vcpu_fault_info {
@@ -769,6 +771,7 @@  bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
 #define kvm_arm_vcpu_sve_finalized(vcpu) \
 	((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
 
+#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
 #define kvm_vcpu_has_pmu(vcpu)					\
 	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
 
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 73629094f903..56426565600c 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -112,7 +112,8 @@  static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
 	new |= (old & PSR_C_BIT);
 	new |= (old & PSR_V_BIT);
 
-	// TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
+	if (kvm_has_mte(vcpu->kvm))
+		new |= PSR_TCO_BIT;
 
 	new |= (old & PSR_DIT_BIT);
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c5d1f3c87dbd..8660f6a03f51 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -822,6 +822,31 @@  transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
 	return PAGE_SIZE;
 }
 
+static int sanitise_mte_tags(struct kvm *kvm, unsigned long size,
+			     kvm_pfn_t pfn)
+{
+	if (kvm_has_mte(kvm)) {
+		/*
+		 * The page will be mapped in stage 2 as Normal Cacheable, so
+		 * the VM will be able to see the page's tags and therefore
+		 * they must be initialised first. If PG_mte_tagged is set,
+		 * tags have already been initialised.
+		 */
+		unsigned long i, nr_pages = size >> PAGE_SHIFT;
+		struct page *page = pfn_to_online_page(pfn);
+
+		if (!page)
+			return -EFAULT;
+
+		for (i = 0; i < nr_pages; i++, page++) {
+			if (!test_and_set_bit(PG_mte_tagged, &page->flags))
+				mte_clear_page_tags(page_address(page));
+		}
+	}
+
+	return 0;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -971,8 +996,13 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (writable)
 		prot |= KVM_PGTABLE_PROT_W;
 
-	if (fault_status != FSC_PERM && !device)
+	if (fault_status != FSC_PERM && !device) {
+		ret = sanitise_mte_tags(kvm, vma_pagesize, pfn);
+		if (ret)
+			goto out_unlock;
+
 		clean_dcache_guest_page(pfn, vma_pagesize);
+	}
 
 	if (exec_fault) {
 		prot |= KVM_PGTABLE_PROT_X;
@@ -1168,12 +1198,17 @@  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	kvm_pfn_t pfn = pte_pfn(range->pte);
+	int ret;
 
 	if (!kvm->arch.mmu.pgt)
 		return 0;
 
 	WARN_ON(range->end - range->start != 1);
 
+	ret = sanitise_mte_tags(kvm, PAGE_SIZE, pfn);
+	if (ret)
+		return ret;
+
 	/*
 	 * We've moved a page around, probably through CoW, so let's treat it
 	 * just like a translation fault and clean the cache to the PoC.
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 76ea2800c33e..24a844cb79ca 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1047,6 +1047,9 @@  static u64 read_id_reg(const struct kvm_vcpu *vcpu,
 		break;
 	case SYS_ID_AA64PFR1_EL1:
 		val &= ~FEATURE(ID_AA64PFR1_MTE);
+		if (kvm_has_mte(vcpu->kvm))
+			val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE),
+					  ID_AA64PFR1_MTE);
 		break;
 	case SYS_ID_AA64ISAR1_EL1:
 		if (!vcpu_has_ptrauth(vcpu))
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3fd9a7e9d90c..8c95ba0fadda 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1082,6 +1082,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SGX_ATTRIBUTE 196
 #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
 #define KVM_CAP_PTP_KVM 198
+#define KVM_CAP_ARM_MTE 199
 
 #ifdef KVM_CAP_IRQ_ROUTING