
[kernel,3/9] KVM: PPC: Use preregistered memory API to access TCE list

Message ID 1457322077-26640-4-git-send-email-aik@ozlabs.ru
State Changes Requested

Commit Message

Alexey Kardashevskiy March 7, 2016, 3:41 a.m. UTC
VFIO on sPAPR already implements guest memory pre-registration
when the entire guest RAM gets pinned. This can be used to translate
the physical address of a guest page containing the TCE list
from H_PUT_TCE_INDIRECT.

This makes use of the pre-registered memory API to access TCE list
pages in order to avoid unnecessary locking on the KVM memory
reverse map.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++-------
 1 file changed, 70 insertions(+), 16 deletions(-)

Comments

David Gibson March 7, 2016, 6 a.m. UTC | #1
On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote:
> VFIO on sPAPR already implements guest memory pre-registration
> when the entire guest RAM gets pinned. This can be used to translate
> the physical address of a guest page containing the TCE list
> from H_PUT_TCE_INDIRECT.
> 
> This makes use of the pre-registrered memory API to access TCE list
> pages in order to avoid unnecessary locking on the KVM memory
> reverse map.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Ok.. so, what's the benefit of not having to lock the rmap?

> ---
>  arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++-------
>  1 file changed, 70 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 44be73e..af155f6 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
>  EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
>  
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu)
> +{
> +	struct task_struct *task;
> +
> +	task = vcpu->arch.run_task;
> +	if (unlikely(!task || !task->mm))
> +		return NULL;
> +
> +	return &task->mm->context;
> +}
> +
> +static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu)
> +{
> +	mm_context_t *mm = kvmppc_mm_context(vcpu);
> +
> +	if (unlikely(!mm))
> +		return false;
> +
> +	return mm_iommu_preregistered(mm);
> +}
> +
> +static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup(
> +		struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size)
> +{
> +	mm_context_t *mm = kvmppc_mm_context(vcpu);
> +
> +	if (unlikely(!mm))
> +		return NULL;
> +
> +	return mm_iommu_lookup_rm(mm, ua, size);
> +}
> +
>  long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  		      unsigned long ioba, unsigned long tce)
>  {
> @@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  	if (ret != H_SUCCESS)
>  		return ret;
>  
> -	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
> -		return H_TOO_HARD;
> +	if (kvmppc_preregistered(vcpu)) {
> +		/*
> +		 * We get here if guest memory was pre-registered which
> +		 * is normally VFIO case and gpa->hpa translation does not
> +		 * depend on hpt.
> +		 */
> +		struct mm_iommu_table_group_mem_t *mem;
>  
> -	rmap = (void *) vmalloc_to_phys(rmap);
> +		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
> +			return H_TOO_HARD;
>  
> -	/*
> -	 * Synchronize with the MMU notifier callbacks in
> -	 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
> -	 * While we have the rmap lock, code running on other CPUs
> -	 * cannot finish unmapping the host real page that backs
> -	 * this guest real page, so we are OK to access the host
> -	 * real page.
> -	 */
> -	lock_rmap(rmap);
> -	if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
> -		ret = H_TOO_HARD;
> -		goto unlock_exit;
> +		mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K);
> +		if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces))
> +			return H_TOO_HARD;
> +	} else {
> +		/*
> +		 * This is emulated devices case.
> +		 * We do not require memory to be preregistered in this case
> +		 * so lock rmap and do __find_linux_pte_or_hugepte().
> +		 */
> +		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
> +			return H_TOO_HARD;
> +
> +		rmap = (void *) vmalloc_to_phys(rmap);
> +
> +		/*
> +		 * Synchronize with the MMU notifier callbacks in
> +		 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
> +		 * While we have the rmap lock, code running on other CPUs
> +		 * cannot finish unmapping the host real page that backs
> +		 * this guest real page, so we are OK to access the host
> +		 * real page.
> +		 */
> +		lock_rmap(rmap);
> +		if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
> +			ret = H_TOO_HARD;
> +			goto unlock_exit;
> +		}
>  	}
>  
>  	for (i = 0; i < npages; ++i) {
> @@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  	}
>  
>  unlock_exit:
> -	unlock_rmap(rmap);
> +	if (rmap)

I don't see where rmap is initialized to NULL in the case where it's
not being used.

> +		unlock_rmap(rmap);
>  
>  	return ret;
>  }
Alexey Kardashevskiy March 8, 2016, 5:47 a.m. UTC | #2
On 03/07/2016 05:00 PM, David Gibson wrote:
> On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote:
>> VFIO on sPAPR already implements guest memory pre-registration
>> when the entire guest RAM gets pinned. This can be used to translate
>> the physical address of a guest page containing the TCE list
>> from H_PUT_TCE_INDIRECT.
>>
>> This makes use of the pre-registrered memory API to access TCE list
>> pages in order to avoid unnecessary locking on the KVM memory
>> reverse map.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> Ok.. so, what's the benefit of not having to lock the rmap?

Less locking -> less racing == good, no?


>
>> ---
>>   arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++-------
>>   1 file changed, 70 insertions(+), 16 deletions(-)
>>
>> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
>> index 44be73e..af155f6 100644
>> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
>> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
>> @@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
>>   EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
>>
>>   #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>> +static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu)
>> +{
>> +	struct task_struct *task;
>> +
>> +	task = vcpu->arch.run_task;
>> +	if (unlikely(!task || !task->mm))
>> +		return NULL;
>> +
>> +	return &task->mm->context;
>> +}
>> +
>> +static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu)
>> +{
>> +	mm_context_t *mm = kvmppc_mm_context(vcpu);
>> +
>> +	if (unlikely(!mm))
>> +		return false;
>> +
>> +	return mm_iommu_preregistered(mm);
>> +}
>> +
>> +static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup(
>> +		struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size)
>> +{
>> +	mm_context_t *mm = kvmppc_mm_context(vcpu);
>> +
>> +	if (unlikely(!mm))
>> +		return NULL;
>> +
>> +	return mm_iommu_lookup_rm(mm, ua, size);
>> +}
>> +
>>   long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>>   		      unsigned long ioba, unsigned long tce)
>>   {
>> @@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>>   	if (ret != H_SUCCESS)
>>   		return ret;
>>
>> -	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
>> -		return H_TOO_HARD;
>> +	if (kvmppc_preregistered(vcpu)) {
>> +		/*
>> +		 * We get here if guest memory was pre-registered which
>> +		 * is normally VFIO case and gpa->hpa translation does not
>> +		 * depend on hpt.
>> +		 */
>> +		struct mm_iommu_table_group_mem_t *mem;
>>
>> -	rmap = (void *) vmalloc_to_phys(rmap);
>> +		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
>> +			return H_TOO_HARD;
>>
>> -	/*
>> -	 * Synchronize with the MMU notifier callbacks in
>> -	 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
>> -	 * While we have the rmap lock, code running on other CPUs
>> -	 * cannot finish unmapping the host real page that backs
>> -	 * this guest real page, so we are OK to access the host
>> -	 * real page.
>> -	 */
>> -	lock_rmap(rmap);
>> -	if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
>> -		ret = H_TOO_HARD;
>> -		goto unlock_exit;
>> +		mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K);
>> +		if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces))
>> +			return H_TOO_HARD;
>> +	} else {
>> +		/*
>> +		 * This is emulated devices case.
>> +		 * We do not require memory to be preregistered in this case
>> +		 * so lock rmap and do __find_linux_pte_or_hugepte().
>> +		 */
>> +		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
>> +			return H_TOO_HARD;
>> +
>> +		rmap = (void *) vmalloc_to_phys(rmap);
>> +
>> +		/*
>> +		 * Synchronize with the MMU notifier callbacks in
>> +		 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
>> +		 * While we have the rmap lock, code running on other CPUs
>> +		 * cannot finish unmapping the host real page that backs
>> +		 * this guest real page, so we are OK to access the host
>> +		 * real page.
>> +		 */
>> +		lock_rmap(rmap);
>> +		if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
>> +			ret = H_TOO_HARD;
>> +			goto unlock_exit;
>> +		}
>>   	}
>>
>>   	for (i = 0; i < npages; ++i) {
>> @@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>>   	}
>>
>>   unlock_exit:
>> -	unlock_rmap(rmap);
>> +	if (rmap)
>
> I don't see where rmap is initialized to NULL in the case where it's
> not being used.

@rmap is not new to this function, and it has always been initialized to 
NULL as it was returned via a pointer from kvmppc_gpa_to_ua().


>
>> +		unlock_rmap(rmap);
>>
>>   	return ret;
>>   }
>
David Gibson March 8, 2016, 6:30 a.m. UTC | #3
On Tue, Mar 08, 2016 at 04:47:20PM +1100, Alexey Kardashevskiy wrote:
> On 03/07/2016 05:00 PM, David Gibson wrote:
> >On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote:
> >>VFIO on sPAPR already implements guest memory pre-registration
> >>when the entire guest RAM gets pinned. This can be used to translate
> >>the physical address of a guest page containing the TCE list
> >>from H_PUT_TCE_INDIRECT.
> >>
> >>This makes use of the pre-registrered memory API to access TCE list
> >>pages in order to avoid unnecessary locking on the KVM memory
> >>reverse map.
> >>
> >>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >
> >Ok.. so, what's the benefit of not having to lock the rmap?
> 
> Less locking -> less racing == good, no?

Well.. maybe.  The increased difficulty in verifying that the code is
correct isn't always a good price to pay.

> >>---
> >>  arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++-------
> >>  1 file changed, 70 insertions(+), 16 deletions(-)
> >>
> >>diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> >>index 44be73e..af155f6 100644
> >>--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> >>+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> >>@@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
> >>  EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
> >>
> >>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> >>+static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu)
> >>+{
> >>+	struct task_struct *task;
> >>+
> >>+	task = vcpu->arch.run_task;
> >>+	if (unlikely(!task || !task->mm))
> >>+		return NULL;
> >>+
> >>+	return &task->mm->context;
> >>+}
> >>+
> >>+static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu)
> >>+{
> >>+	mm_context_t *mm = kvmppc_mm_context(vcpu);
> >>+
> >>+	if (unlikely(!mm))
> >>+		return false;
> >>+
> >>+	return mm_iommu_preregistered(mm);
> >>+}
> >>+
> >>+static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup(
> >>+		struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size)
> >>+{
> >>+	mm_context_t *mm = kvmppc_mm_context(vcpu);
> >>+
> >>+	if (unlikely(!mm))
> >>+		return NULL;
> >>+
> >>+	return mm_iommu_lookup_rm(mm, ua, size);
> >>+}
> >>+
> >>  long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
> >>  		      unsigned long ioba, unsigned long tce)
> >>  {
> >>@@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> >>  	if (ret != H_SUCCESS)
> >>  		return ret;
> >>
> >>-	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
> >>-		return H_TOO_HARD;
> >>+	if (kvmppc_preregistered(vcpu)) {
> >>+		/*
> >>+		 * We get here if guest memory was pre-registered which
> >>+		 * is normally VFIO case and gpa->hpa translation does not
> >>+		 * depend on hpt.
> >>+		 */
> >>+		struct mm_iommu_table_group_mem_t *mem;
> >>
> >>-	rmap = (void *) vmalloc_to_phys(rmap);
> >>+		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
> >>+			return H_TOO_HARD;
> >>
> >>-	/*
> >>-	 * Synchronize with the MMU notifier callbacks in
> >>-	 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
> >>-	 * While we have the rmap lock, code running on other CPUs
> >>-	 * cannot finish unmapping the host real page that backs
> >>-	 * this guest real page, so we are OK to access the host
> >>-	 * real page.
> >>-	 */
> >>-	lock_rmap(rmap);
> >>-	if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
> >>-		ret = H_TOO_HARD;
> >>-		goto unlock_exit;
> >>+		mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K);
> >>+		if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces))
> >>+			return H_TOO_HARD;
> >>+	} else {
> >>+		/*
> >>+		 * This is emulated devices case.
> >>+		 * We do not require memory to be preregistered in this case
> >>+		 * so lock rmap and do __find_linux_pte_or_hugepte().
> >>+		 */
> >>+		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
> >>+			return H_TOO_HARD;
> >>+
> >>+		rmap = (void *) vmalloc_to_phys(rmap);
> >>+
> >>+		/*
> >>+		 * Synchronize with the MMU notifier callbacks in
> >>+		 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
> >>+		 * While we have the rmap lock, code running on other CPUs
> >>+		 * cannot finish unmapping the host real page that backs
> >>+		 * this guest real page, so we are OK to access the host
> >>+		 * real page.
> >>+		 */
> >>+		lock_rmap(rmap);
> >>+		if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
> >>+			ret = H_TOO_HARD;
> >>+			goto unlock_exit;
> >>+		}
> >>  	}
> >>
> >>  	for (i = 0; i < npages; ++i) {
> >>@@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> >>  	}
> >>
> >>  unlock_exit:
> >>-	unlock_rmap(rmap);
> >>+	if (rmap)
> >
> >I don't see where rmap is initialized to NULL in the case where it's
> >not being used.
> 
> @rmap is not new to this function, and it has always been initialized to
> NULL as it was returned via a pointer from kvmppc_gpa_to_ua().

This comment confuses me.  Looking closer at the code I see you're
right, and it's initialized to NULL where defined, which I missed.

But that has nothing to do with being returned by pointer from
kvmppc_gpa_to_ua(), since one of your branches in the new code no
longer passes &rmap to that function.
Alexey Kardashevskiy March 9, 2016, 8:55 a.m. UTC | #4
On 03/08/2016 05:30 PM, David Gibson wrote:
> On Tue, Mar 08, 2016 at 04:47:20PM +1100, Alexey Kardashevskiy wrote:
>> On 03/07/2016 05:00 PM, David Gibson wrote:
>>> On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote:
>>>> VFIO on sPAPR already implements guest memory pre-registration
>>>> when the entire guest RAM gets pinned. This can be used to translate
>>>> the physical address of a guest page containing the TCE list
>>>> from H_PUT_TCE_INDIRECT.
>>>>
>>>> This makes use of the pre-registrered memory API to access TCE list
>>>> pages in order to avoid unnecessary locking on the KVM memory
>>>> reverse map.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>
>>> Ok.. so, what's the benefit of not having to lock the rmap?
>>
>> Less locking -> less racing == good, no?
>
> Well.. maybe.  The increased difficulty in verifying that the code is
> correct isn't always a good price to pay.
>
>>>> ---
>>>>   arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++-------
>>>>   1 file changed, 70 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
>>>> index 44be73e..af155f6 100644
>>>> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
>>>> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
>>>> @@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
>>>>   EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
>>>>
>>>>   #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>>>> +static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	struct task_struct *task;
>>>> +
>>>> +	task = vcpu->arch.run_task;
>>>> +	if (unlikely(!task || !task->mm))
>>>> +		return NULL;
>>>> +
>>>> +	return &task->mm->context;
>>>> +}
>>>> +
>>>> +static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	mm_context_t *mm = kvmppc_mm_context(vcpu);
>>>> +
>>>> +	if (unlikely(!mm))
>>>> +		return false;
>>>> +
>>>> +	return mm_iommu_preregistered(mm);
>>>> +}
>>>> +
>>>> +static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup(
>>>> +		struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size)
>>>> +{
>>>> +	mm_context_t *mm = kvmppc_mm_context(vcpu);
>>>> +
>>>> +	if (unlikely(!mm))
>>>> +		return NULL;
>>>> +
>>>> +	return mm_iommu_lookup_rm(mm, ua, size);
>>>> +}
>>>> +
>>>>   long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>>>>   		      unsigned long ioba, unsigned long tce)
>>>>   {
>>>> @@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>>>>   	if (ret != H_SUCCESS)
>>>>   		return ret;
>>>>
>>>> -	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
>>>> -		return H_TOO_HARD;
>>>> +	if (kvmppc_preregistered(vcpu)) {
>>>> +		/*
>>>> +		 * We get here if guest memory was pre-registered which
>>>> +		 * is normally VFIO case and gpa->hpa translation does not
>>>> +		 * depend on hpt.
>>>> +		 */
>>>> +		struct mm_iommu_table_group_mem_t *mem;
>>>>
>>>> -	rmap = (void *) vmalloc_to_phys(rmap);
>>>> +		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
>>>> +			return H_TOO_HARD;
>>>>
>>>> -	/*
>>>> -	 * Synchronize with the MMU notifier callbacks in
>>>> -	 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
>>>> -	 * While we have the rmap lock, code running on other CPUs
>>>> -	 * cannot finish unmapping the host real page that backs
>>>> -	 * this guest real page, so we are OK to access the host
>>>> -	 * real page.
>>>> -	 */
>>>> -	lock_rmap(rmap);
>>>> -	if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
>>>> -		ret = H_TOO_HARD;
>>>> -		goto unlock_exit;
>>>> +		mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K);
>>>> +		if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces))
>>>> +			return H_TOO_HARD;
>>>> +	} else {
>>>> +		/*
>>>> +		 * This is emulated devices case.
>>>> +		 * We do not require memory to be preregistered in this case
>>>> +		 * so lock rmap and do __find_linux_pte_or_hugepte().
>>>> +		 */
>>>> +		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
>>>> +			return H_TOO_HARD;
>>>> +
>>>> +		rmap = (void *) vmalloc_to_phys(rmap);
>>>> +
>>>> +		/*
>>>> +		 * Synchronize with the MMU notifier callbacks in
>>>> +		 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
>>>> +		 * While we have the rmap lock, code running on other CPUs
>>>> +		 * cannot finish unmapping the host real page that backs
>>>> +		 * this guest real page, so we are OK to access the host
>>>> +		 * real page.
>>>> +		 */
>>>> +		lock_rmap(rmap);
>>>> +		if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
>>>> +			ret = H_TOO_HARD;
>>>> +			goto unlock_exit;
>>>> +		}
>>>>   	}
>>>>
>>>>   	for (i = 0; i < npages; ++i) {
>>>> @@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>>>>   	}
>>>>
>>>>   unlock_exit:
>>>> -	unlock_rmap(rmap);
>>>> +	if (rmap)
>>>
>>> I don't see where rmap is initialized to NULL in the case where it's
>>> not being used.
>>
>> @rmap is not new to this function, and it has always been initialized to
>> NULL as it was returned via a pointer from kvmppc_gpa_to_ua().
>
> This comment confuses me.  Looking closer at the code I see you're
> right, and it's initialized to NULL where defined, which I missed.
>
> But that has nothing to do with being returned by pointer from
> kvmppc_gpa_to_ua(), since one of your branches in the new code no
> longer passes &rmap to that function.


So? The code is still correct - the "preregistered branch" does not touch 
NULL pointer, it remains NULL and unlock_rmap() is not called. I agree the 
patch is not the easiest to read but how can I improve it to get your "rb"? 
Replace "if(rmap)" with "if (kvmppc_preregistered(vcpu))"? Move that loop 
between lock_rmap/unlock_rmap to a helper? kvmppc_rm_h_put_tce_indirect() 
is not big enough to justify splitting, the comments inside it are though...
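
For illustration only — not from the posted series — a sketch of one possible helper along the lines Alexey floats, here pulling the rmap-locked lookup of the TCE list page out of kvmppc_rm_h_put_tce_indirect(); the helper name and signature are invented, while the calls inside mirror the existing code:

/*
 * Sketch only, not part of the posted patch: the rmap-locked
 * ua->hpa lookup of the TCE list page pulled into a helper, so the
 * caller reads as a plain if (preregistered) / else split.
 */
static long kvmppc_rm_tce_list_via_rmap(struct kvm_vcpu *vcpu,
		unsigned long tce_list, unsigned long *tces,
		unsigned long **prmap)
{
	unsigned long ua = 0;
	unsigned long *rmap;

	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
		return H_TOO_HARD;

	rmap = (void *) vmalloc_to_phys(rmap);

	/* Hold the rmap lock so the backing host page cannot be unmapped. */
	lock_rmap(rmap);
	if (kvmppc_rm_ua_to_hpa(vcpu, ua, tces)) {
		unlock_rmap(rmap);
		return H_TOO_HARD;
	}

	/* Caller copies the TCE list, then unlocks *prmap. */
	*prmap = rmap;
	return H_SUCCESS;
}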
David Gibson March 9, 2016, 11:46 p.m. UTC | #5
On Wed, Mar 09, 2016 at 07:55:53PM +1100, Alexey Kardashevskiy wrote:
> On 03/08/2016 05:30 PM, David Gibson wrote:
> >On Tue, Mar 08, 2016 at 04:47:20PM +1100, Alexey Kardashevskiy wrote:
> >>On 03/07/2016 05:00 PM, David Gibson wrote:
> >>>On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote:
> >>>>VFIO on sPAPR already implements guest memory pre-registration
> >>>>when the entire guest RAM gets pinned. This can be used to translate
> >>>>the physical address of a guest page containing the TCE list
> >>>>from H_PUT_TCE_INDIRECT.
> >>>>
> >>>>This makes use of the pre-registrered memory API to access TCE list
> >>>>pages in order to avoid unnecessary locking on the KVM memory
> >>>>reverse map.
> >>>>
> >>>>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>
> >>>Ok.. so, what's the benefit of not having to lock the rmap?
> >>
> >>Less locking -> less racing == good, no?
> >
> >Well.. maybe.  The increased difficulty in verifying that the code is
> >correct isn't always a good price to pay.
> >
> >>>>---
> >>>>  arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++-------
> >>>>  1 file changed, 70 insertions(+), 16 deletions(-)
> >>>>
> >>>>diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> >>>>index 44be73e..af155f6 100644
> >>>>--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> >>>>+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> >>>>@@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
> >>>>  EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
> >>>>
> >>>>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> >>>>+static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu)
> >>>>+{
> >>>>+	struct task_struct *task;
> >>>>+
> >>>>+	task = vcpu->arch.run_task;
> >>>>+	if (unlikely(!task || !task->mm))
> >>>>+		return NULL;
> >>>>+
> >>>>+	return &task->mm->context;
> >>>>+}
> >>>>+
> >>>>+static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu)
> >>>>+{
> >>>>+	mm_context_t *mm = kvmppc_mm_context(vcpu);
> >>>>+
> >>>>+	if (unlikely(!mm))
> >>>>+		return false;
> >>>>+
> >>>>+	return mm_iommu_preregistered(mm);
> >>>>+}
> >>>>+
> >>>>+static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup(
> >>>>+		struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size)
> >>>>+{
> >>>>+	mm_context_t *mm = kvmppc_mm_context(vcpu);
> >>>>+
> >>>>+	if (unlikely(!mm))
> >>>>+		return NULL;
> >>>>+
> >>>>+	return mm_iommu_lookup_rm(mm, ua, size);
> >>>>+}
> >>>>+
> >>>>  long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
> >>>>  		      unsigned long ioba, unsigned long tce)
> >>>>  {
> >>>>@@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> >>>>  	if (ret != H_SUCCESS)
> >>>>  		return ret;
> >>>>
> >>>>-	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
> >>>>-		return H_TOO_HARD;
> >>>>+	if (kvmppc_preregistered(vcpu)) {
> >>>>+		/*
> >>>>+		 * We get here if guest memory was pre-registered which
> >>>>+		 * is normally VFIO case and gpa->hpa translation does not
> >>>>+		 * depend on hpt.
> >>>>+		 */
> >>>>+		struct mm_iommu_table_group_mem_t *mem;
> >>>>
> >>>>-	rmap = (void *) vmalloc_to_phys(rmap);
> >>>>+		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
> >>>>+			return H_TOO_HARD;
> >>>>
> >>>>-	/*
> >>>>-	 * Synchronize with the MMU notifier callbacks in
> >>>>-	 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
> >>>>-	 * While we have the rmap lock, code running on other CPUs
> >>>>-	 * cannot finish unmapping the host real page that backs
> >>>>-	 * this guest real page, so we are OK to access the host
> >>>>-	 * real page.
> >>>>-	 */
> >>>>-	lock_rmap(rmap);
> >>>>-	if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
> >>>>-		ret = H_TOO_HARD;
> >>>>-		goto unlock_exit;
> >>>>+		mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K);
> >>>>+		if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces))
> >>>>+			return H_TOO_HARD;
> >>>>+	} else {
> >>>>+		/*
> >>>>+		 * This is emulated devices case.
> >>>>+		 * We do not require memory to be preregistered in this case
> >>>>+		 * so lock rmap and do __find_linux_pte_or_hugepte().
> >>>>+		 */
> >>>>+		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
> >>>>+			return H_TOO_HARD;
> >>>>+
> >>>>+		rmap = (void *) vmalloc_to_phys(rmap);
> >>>>+
> >>>>+		/*
> >>>>+		 * Synchronize with the MMU notifier callbacks in
> >>>>+		 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
> >>>>+		 * While we have the rmap lock, code running on other CPUs
> >>>>+		 * cannot finish unmapping the host real page that backs
> >>>>+		 * this guest real page, so we are OK to access the host
> >>>>+		 * real page.
> >>>>+		 */
> >>>>+		lock_rmap(rmap);
> >>>>+		if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
> >>>>+			ret = H_TOO_HARD;
> >>>>+			goto unlock_exit;
> >>>>+		}
> >>>>  	}
> >>>>
> >>>>  	for (i = 0; i < npages; ++i) {
> >>>>@@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> >>>>  	}
> >>>>
> >>>>  unlock_exit:
> >>>>-	unlock_rmap(rmap);
> >>>>+	if (rmap)
> >>>
> >>>I don't see where rmap is initialized to NULL in the case where it's
> >>>not being used.
> >>
> >>@rmap is not new to this function, and it has always been initialized to
> >>NULL as it was returned via a pointer from kvmppc_gpa_to_ua().
> >
> >This comment confuses me.  Looking closer at the code I see you're
> >right, and it's initialized to NULL where defined, which I missed.
> >
> >But that has nothing to do with being returned by pointer from
> >kvmppc_gpa_to_ua(), since one of your branches in the new code no
> >longer passes &rmap to that function.
> 
> 
> So? The code is still correct - the "preregistered branch" does not touch
> NULL pointer, it remains NULL and unlock_rmap() is not called. I agree the
> patch is not the easiest to read but how can I improve it to get your "rb"?
> Replace "if(rmap)" with "if (kvmppc_preregistered(vcpu))"? Move that loop
> between lock_rmap/unlock_rmap to a helper? kvmppc_rm_h_put_tce_indirect() is
> not big enough to justify splitting, the comments inside it are though...

Sorry, I wasn't clear.  I no longer have a specific objection.  I left
out the R-b, since there have been enough other comments on the series
that I was expecting a respin, so I was planning to re-review in the
context of an updated series.
Paul Mackerras March 10, 2016, 8:33 a.m. UTC | #6
On Mon, Mar 07, 2016 at 05:00:14PM +1100, David Gibson wrote:
> On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote:
> > VFIO on sPAPR already implements guest memory pre-registration
> > when the entire guest RAM gets pinned. This can be used to translate
> > the physical address of a guest page containing the TCE list
> > from H_PUT_TCE_INDIRECT.
> > 
> > This makes use of the pre-registrered memory API to access TCE list
> > pages in order to avoid unnecessary locking on the KVM memory
> > reverse map.
> > 
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> 
> Ok.. so, what's the benefit of not having to lock the rmap?

It's not primarily about locking or not locking the rmap.  The point
is that when memory is pre-registered, we know that all of guest
memory is pinned and we have a flat array mapping GPA to HPA.  It's
simpler and quicker to index into that array (even with looking up the
kernel page tables in vmalloc_to_phys) than it is to find the memslot,
lock the rmap entry, look up the user page tables, and unlock the rmap
entry.  We were only locking the rmap entry to stop the page being
unmapped and reallocated to something else, but if memory is
pre-registered, it's all pinned, so it can't be reallocated.

Paul.
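
To make that concrete, here is a rough sketch of what the pre-registered ua->hpa translation reduces to — illustrative only, with the mm_iommu_table_group_mem_t field names (ua, entries, hpas[]) assumed rather than quoted from the kernel: a bounds check plus an index into the pinned array, with no memslot walk and no rmap lock.

/*
 * Illustrative sketch only; field names are assumed.  With
 * pre-registered (pinned) memory, translation is an array index
 * plus the in-page offset.
 */
static long ua_to_hpa_sketch(struct mm_iommu_table_group_mem_t *mem,
		unsigned long ua, unsigned long *hpa)
{
	const long entry = (ua - mem->ua) >> PAGE_SHIFT;

	if (entry >= mem->entries)
		return -EFAULT;

	*hpa = mem->hpas[entry] | (ua & ~PAGE_MASK);
	return 0;
}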




David Gibson March 10, 2016, 11:42 p.m. UTC | #7
On Thu, Mar 10, 2016 at 07:33:05PM +1100, Paul Mackerras wrote:
> On Mon, Mar 07, 2016 at 05:00:14PM +1100, David Gibson wrote:
> > On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote:
> > > VFIO on sPAPR already implements guest memory pre-registration
> > > when the entire guest RAM gets pinned. This can be used to translate
> > > the physical address of a guest page containing the TCE list
> > > from H_PUT_TCE_INDIRECT.
> > > 
> > > This makes use of the pre-registrered memory API to access TCE list
> > > pages in order to avoid unnecessary locking on the KVM memory
> > > reverse map.
> > > 
> > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > 
> > Ok.. so, what's the benefit of not having to lock the rmap?
> 
> It's not primarily about locking or not locking the rmap.  The point
> is that when memory is pre-registered, we know that all of guest
> memory is pinned and we have a flat array mapping GPA to HPA.  It's
> simpler and quicker to index into that array (even with looking up the
> kernel page tables in vmalloc_to_phys) than it is to find the memslot,
> lock the rmap entry, look up the user page tables, and unlock the rmap
> entry.  We were only locking the rmap entry to stop the page being
> unmapped and reallocated to something else, but if memory is
> pre-registered, it's all pinned, so it can't be reallocated.

Ok, that makes sense.

Alexey, can you fold some of that rationale into the commit message?

Patch

diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 44be73e..af155f6 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -180,6 +180,38 @@  long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
 EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu)
+{
+	struct task_struct *task;
+
+	task = vcpu->arch.run_task;
+	if (unlikely(!task || !task->mm))
+		return NULL;
+
+	return &task->mm->context;
+}
+
+static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu)
+{
+	mm_context_t *mm = kvmppc_mm_context(vcpu);
+
+	if (unlikely(!mm))
+		return false;
+
+	return mm_iommu_preregistered(mm);
+}
+
+static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup(
+		struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size)
+{
+	mm_context_t *mm = kvmppc_mm_context(vcpu);
+
+	if (unlikely(!mm))
+		return NULL;
+
+	return mm_iommu_lookup_rm(mm, ua, size);
+}
+
 long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 		      unsigned long ioba, unsigned long tce)
 {
@@ -261,23 +293,44 @@  long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	if (ret != H_SUCCESS)
 		return ret;
 
-	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
-		return H_TOO_HARD;
+	if (kvmppc_preregistered(vcpu)) {
+		/*
+		 * We get here if guest memory was pre-registered which
+		 * is normally VFIO case and gpa->hpa translation does not
+		 * depend on hpt.
+		 */
+		struct mm_iommu_table_group_mem_t *mem;
 
-	rmap = (void *) vmalloc_to_phys(rmap);
+		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
+			return H_TOO_HARD;
 
-	/*
-	 * Synchronize with the MMU notifier callbacks in
-	 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
-	 * While we have the rmap lock, code running on other CPUs
-	 * cannot finish unmapping the host real page that backs
-	 * this guest real page, so we are OK to access the host
-	 * real page.
-	 */
-	lock_rmap(rmap);
-	if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
-		ret = H_TOO_HARD;
-		goto unlock_exit;
+		mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K);
+		if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces))
+			return H_TOO_HARD;
+	} else {
+		/*
+		 * This is emulated devices case.
+		 * We do not require memory to be preregistered in this case
+		 * so lock rmap and do __find_linux_pte_or_hugepte().
+		 */
+		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+			return H_TOO_HARD;
+
+		rmap = (void *) vmalloc_to_phys(rmap);
+
+		/*
+		 * Synchronize with the MMU notifier callbacks in
+		 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
+		 * While we have the rmap lock, code running on other CPUs
+		 * cannot finish unmapping the host real page that backs
+		 * this guest real page, so we are OK to access the host
+		 * real page.
+		 */
+		lock_rmap(rmap);
+		if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+			ret = H_TOO_HARD;
+			goto unlock_exit;
+		}
 	}
 
 	for (i = 0; i < npages; ++i) {
@@ -291,7 +344,8 @@  long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	}
 
 unlock_exit:
-	unlock_rmap(rmap);
+	if (rmap)
+		unlock_rmap(rmap);
 
 	return ret;
 }