Message ID | 1457322077-26640-4-git-send-email-aik@ozlabs.ru |
---|---|
State | Changes Requested |
Headers | show |
On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote: > VFIO on sPAPR already implements guest memory pre-registration > when the entire guest RAM gets pinned. This can be used to translate > the physical address of a guest page containing the TCE list > from H_PUT_TCE_INDIRECT. > > This makes use of the pre-registrered memory API to access TCE list > pages in order to avoid unnecessary locking on the KVM memory > reverse map. > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Ok.. so, what's the benefit of not having to lock the rmap? > --- > arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++------- > 1 file changed, 70 insertions(+), 16 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c > index 44be73e..af155f6 100644 > --- a/arch/powerpc/kvm/book3s_64_vio_hv.c > +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c > @@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, > EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); > > #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > +static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu) > +{ > + struct task_struct *task; > + > + task = vcpu->arch.run_task; > + if (unlikely(!task || !task->mm)) > + return NULL; > + > + return &task->mm->context; > +} > + > +static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu) > +{ > + mm_context_t *mm = kvmppc_mm_context(vcpu); > + > + if (unlikely(!mm)) > + return false; > + > + return mm_iommu_preregistered(mm); > +} > + > +static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup( > + struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size) > +{ > + mm_context_t *mm = kvmppc_mm_context(vcpu); > + > + if (unlikely(!mm)) > + return NULL; > + > + return mm_iommu_lookup_rm(mm, ua, size); > +} > + > long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, > unsigned long ioba, unsigned long tce) > { > @@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, > if (ret != H_SUCCESS) > return ret; > > - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) > - return H_TOO_HARD; > + if (kvmppc_preregistered(vcpu)) { > + /* > + * We get here if guest memory was pre-registered which > + * is normally VFIO case and gpa->hpa translation does not > + * depend on hpt. > + */ > + struct mm_iommu_table_group_mem_t *mem; > > - rmap = (void *) vmalloc_to_phys(rmap); > + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) > + return H_TOO_HARD; > > - /* > - * Synchronize with the MMU notifier callbacks in > - * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). > - * While we have the rmap lock, code running on other CPUs > - * cannot finish unmapping the host real page that backs > - * this guest real page, so we are OK to access the host > - * real page. > - */ > - lock_rmap(rmap); > - if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { > - ret = H_TOO_HARD; > - goto unlock_exit; > + mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K); > + if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces)) > + return H_TOO_HARD; > + } else { > + /* > + * This is emulated devices case. > + * We do not require memory to be preregistered in this case > + * so lock rmap and do __find_linux_pte_or_hugepte(). > + */ > + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) > + return H_TOO_HARD; > + > + rmap = (void *) vmalloc_to_phys(rmap); > + > + /* > + * Synchronize with the MMU notifier callbacks in > + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). > + * While we have the rmap lock, code running on other CPUs > + * cannot finish unmapping the host real page that backs > + * this guest real page, so we are OK to access the host > + * real page. > + */ > + lock_rmap(rmap); > + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { > + ret = H_TOO_HARD; > + goto unlock_exit; > + } > } > > for (i = 0; i < npages; ++i) { > @@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, > } > > unlock_exit: > - unlock_rmap(rmap); > + if (rmap) I don't see where rmap is initialized to NULL in the case where it's not being used. > + unlock_rmap(rmap); > > return ret; > }
On 03/07/2016 05:00 PM, David Gibson wrote: > On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote: >> VFIO on sPAPR already implements guest memory pre-registration >> when the entire guest RAM gets pinned. This can be used to translate >> the physical address of a guest page containing the TCE list >> from H_PUT_TCE_INDIRECT. >> >> This makes use of the pre-registrered memory API to access TCE list >> pages in order to avoid unnecessary locking on the KVM memory >> reverse map. >> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > > Ok.. so, what's the benefit of not having to lock the rmap? Less locking -> less racing == good, no? > >> --- >> arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++------- >> 1 file changed, 70 insertions(+), 16 deletions(-) >> >> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c >> index 44be73e..af155f6 100644 >> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c >> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c >> @@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, >> EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); >> >> #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE >> +static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu) >> +{ >> + struct task_struct *task; >> + >> + task = vcpu->arch.run_task; >> + if (unlikely(!task || !task->mm)) >> + return NULL; >> + >> + return &task->mm->context; >> +} >> + >> +static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu) >> +{ >> + mm_context_t *mm = kvmppc_mm_context(vcpu); >> + >> + if (unlikely(!mm)) >> + return false; >> + >> + return mm_iommu_preregistered(mm); >> +} >> + >> +static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup( >> + struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size) >> +{ >> + mm_context_t *mm = kvmppc_mm_context(vcpu); >> + >> + if (unlikely(!mm)) >> + return NULL; >> + >> + return mm_iommu_lookup_rm(mm, ua, size); >> +} >> + >> long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, >> unsigned long ioba, unsigned long tce) >> { >> @@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, >> if (ret != H_SUCCESS) >> return ret; >> >> - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) >> - return H_TOO_HARD; >> + if (kvmppc_preregistered(vcpu)) { >> + /* >> + * We get here if guest memory was pre-registered which >> + * is normally VFIO case and gpa->hpa translation does not >> + * depend on hpt. >> + */ >> + struct mm_iommu_table_group_mem_t *mem; >> >> - rmap = (void *) vmalloc_to_phys(rmap); >> + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) >> + return H_TOO_HARD; >> >> - /* >> - * Synchronize with the MMU notifier callbacks in >> - * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). >> - * While we have the rmap lock, code running on other CPUs >> - * cannot finish unmapping the host real page that backs >> - * this guest real page, so we are OK to access the host >> - * real page. >> - */ >> - lock_rmap(rmap); >> - if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { >> - ret = H_TOO_HARD; >> - goto unlock_exit; >> + mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K); >> + if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces)) >> + return H_TOO_HARD; >> + } else { >> + /* >> + * This is emulated devices case. >> + * We do not require memory to be preregistered in this case >> + * so lock rmap and do __find_linux_pte_or_hugepte(). >> + */ >> + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) >> + return H_TOO_HARD; >> + >> + rmap = (void *) vmalloc_to_phys(rmap); >> + >> + /* >> + * Synchronize with the MMU notifier callbacks in >> + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). >> + * While we have the rmap lock, code running on other CPUs >> + * cannot finish unmapping the host real page that backs >> + * this guest real page, so we are OK to access the host >> + * real page. >> + */ >> + lock_rmap(rmap); >> + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { >> + ret = H_TOO_HARD; >> + goto unlock_exit; >> + } >> } >> >> for (i = 0; i < npages; ++i) { >> @@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, >> } >> >> unlock_exit: >> - unlock_rmap(rmap); >> + if (rmap) > > I don't see where rmap is initialized to NULL in the case where it's > not being used. @rmap is not new to this function, and it has always been initialized to NULL as it was returned via a pointer from kvmppc_gpa_to_ua(). > >> + unlock_rmap(rmap); >> >> return ret; >> } >
On Tue, Mar 08, 2016 at 04:47:20PM +1100, Alexey Kardashevskiy wrote: > On 03/07/2016 05:00 PM, David Gibson wrote: > >On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote: > >>VFIO on sPAPR already implements guest memory pre-registration > >>when the entire guest RAM gets pinned. This can be used to translate > >>the physical address of a guest page containing the TCE list > >>from H_PUT_TCE_INDIRECT. > >> > >>This makes use of the pre-registrered memory API to access TCE list > >>pages in order to avoid unnecessary locking on the KVM memory > >>reverse map. > >> > >>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > > > >Ok.. so, what's the benefit of not having to lock the rmap? > > Less locking -> less racing == good, no? Well.. maybe. The increased difficulty in verifying that the code is correct isn't always a good price to pay. > >>--- > >> arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++------- > >> 1 file changed, 70 insertions(+), 16 deletions(-) > >> > >>diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c > >>index 44be73e..af155f6 100644 > >>--- a/arch/powerpc/kvm/book3s_64_vio_hv.c > >>+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c > >>@@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, > >> EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); > >> > >> #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > >>+static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu) > >>+{ > >>+ struct task_struct *task; > >>+ > >>+ task = vcpu->arch.run_task; > >>+ if (unlikely(!task || !task->mm)) > >>+ return NULL; > >>+ > >>+ return &task->mm->context; > >>+} > >>+ > >>+static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu) > >>+{ > >>+ mm_context_t *mm = kvmppc_mm_context(vcpu); > >>+ > >>+ if (unlikely(!mm)) > >>+ return false; > >>+ > >>+ return mm_iommu_preregistered(mm); > >>+} > >>+ > >>+static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup( > >>+ struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size) > >>+{ > >>+ mm_context_t *mm = kvmppc_mm_context(vcpu); > >>+ > >>+ if (unlikely(!mm)) > >>+ return NULL; > >>+ > >>+ return mm_iommu_lookup_rm(mm, ua, size); > >>+} > >>+ > >> long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, > >> unsigned long ioba, unsigned long tce) > >> { > >>@@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, > >> if (ret != H_SUCCESS) > >> return ret; > >> > >>- if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) > >>- return H_TOO_HARD; > >>+ if (kvmppc_preregistered(vcpu)) { > >>+ /* > >>+ * We get here if guest memory was pre-registered which > >>+ * is normally VFIO case and gpa->hpa translation does not > >>+ * depend on hpt. > >>+ */ > >>+ struct mm_iommu_table_group_mem_t *mem; > >> > >>- rmap = (void *) vmalloc_to_phys(rmap); > >>+ if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) > >>+ return H_TOO_HARD; > >> > >>- /* > >>- * Synchronize with the MMU notifier callbacks in > >>- * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). > >>- * While we have the rmap lock, code running on other CPUs > >>- * cannot finish unmapping the host real page that backs > >>- * this guest real page, so we are OK to access the host > >>- * real page. > >>- */ > >>- lock_rmap(rmap); > >>- if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { > >>- ret = H_TOO_HARD; > >>- goto unlock_exit; > >>+ mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K); > >>+ if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces)) > >>+ return H_TOO_HARD; > >>+ } else { > >>+ /* > >>+ * This is emulated devices case. > >>+ * We do not require memory to be preregistered in this case > >>+ * so lock rmap and do __find_linux_pte_or_hugepte(). > >>+ */ > >>+ if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) > >>+ return H_TOO_HARD; > >>+ > >>+ rmap = (void *) vmalloc_to_phys(rmap); > >>+ > >>+ /* > >>+ * Synchronize with the MMU notifier callbacks in > >>+ * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). > >>+ * While we have the rmap lock, code running on other CPUs > >>+ * cannot finish unmapping the host real page that backs > >>+ * this guest real page, so we are OK to access the host > >>+ * real page. > >>+ */ > >>+ lock_rmap(rmap); > >>+ if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { > >>+ ret = H_TOO_HARD; > >>+ goto unlock_exit; > >>+ } > >> } > >> > >> for (i = 0; i < npages; ++i) { > >>@@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, > >> } > >> > >> unlock_exit: > >>- unlock_rmap(rmap); > >>+ if (rmap) > > > >I don't see where rmap is initialized to NULL in the case where it's > >not being used. > > @rmap is not new to this function, and it has always been initialized to > NULL as it was returned via a pointer from kvmppc_gpa_to_ua(). This comment confuses me. Looking closer at the code I see you're right, and it's initialized to NULL where defined, which I missed. But that has nothing to do with being returned by pointer from kvmppc_gpa_to_ua(), since one of your branches in the new code no longer passes &rmap to that function.
On 03/08/2016 05:30 PM, David Gibson wrote: > On Tue, Mar 08, 2016 at 04:47:20PM +1100, Alexey Kardashevskiy wrote: >> On 03/07/2016 05:00 PM, David Gibson wrote: >>> On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote: >>>> VFIO on sPAPR already implements guest memory pre-registration >>>> when the entire guest RAM gets pinned. This can be used to translate >>>> the physical address of a guest page containing the TCE list >>> >from H_PUT_TCE_INDIRECT. >>>> >>>> This makes use of the pre-registrered memory API to access TCE list >>>> pages in order to avoid unnecessary locking on the KVM memory >>>> reverse map. >>>> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> >>> >>> Ok.. so, what's the benefit of not having to lock the rmap? >> >> Less locking -> less racing == good, no? > > Well.. maybe. The increased difficulty in verifying that the code is > correct isn't always a good price to pay. > >>>> --- >>>> arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++------- >>>> 1 file changed, 70 insertions(+), 16 deletions(-) >>>> >>>> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c >>>> index 44be73e..af155f6 100644 >>>> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c >>>> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c >>>> @@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, >>>> EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); >>>> >>>> #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE >>>> +static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu) >>>> +{ >>>> + struct task_struct *task; >>>> + >>>> + task = vcpu->arch.run_task; >>>> + if (unlikely(!task || !task->mm)) >>>> + return NULL; >>>> + >>>> + return &task->mm->context; >>>> +} >>>> + >>>> +static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu) >>>> +{ >>>> + mm_context_t *mm = kvmppc_mm_context(vcpu); >>>> + >>>> + if (unlikely(!mm)) >>>> + return false; >>>> + >>>> + return mm_iommu_preregistered(mm); >>>> +} >>>> + >>>> +static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup( >>>> + struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size) >>>> +{ >>>> + mm_context_t *mm = kvmppc_mm_context(vcpu); >>>> + >>>> + if (unlikely(!mm)) >>>> + return NULL; >>>> + >>>> + return mm_iommu_lookup_rm(mm, ua, size); >>>> +} >>>> + >>>> long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, >>>> unsigned long ioba, unsigned long tce) >>>> { >>>> @@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, >>>> if (ret != H_SUCCESS) >>>> return ret; >>>> >>>> - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) >>>> - return H_TOO_HARD; >>>> + if (kvmppc_preregistered(vcpu)) { >>>> + /* >>>> + * We get here if guest memory was pre-registered which >>>> + * is normally VFIO case and gpa->hpa translation does not >>>> + * depend on hpt. >>>> + */ >>>> + struct mm_iommu_table_group_mem_t *mem; >>>> >>>> - rmap = (void *) vmalloc_to_phys(rmap); >>>> + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) >>>> + return H_TOO_HARD; >>>> >>>> - /* >>>> - * Synchronize with the MMU notifier callbacks in >>>> - * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). >>>> - * While we have the rmap lock, code running on other CPUs >>>> - * cannot finish unmapping the host real page that backs >>>> - * this guest real page, so we are OK to access the host >>>> - * real page. >>>> - */ >>>> - lock_rmap(rmap); >>>> - if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { >>>> - ret = H_TOO_HARD; >>>> - goto unlock_exit; >>>> + mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K); >>>> + if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces)) >>>> + return H_TOO_HARD; >>>> + } else { >>>> + /* >>>> + * This is emulated devices case. >>>> + * We do not require memory to be preregistered in this case >>>> + * so lock rmap and do __find_linux_pte_or_hugepte(). >>>> + */ >>>> + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) >>>> + return H_TOO_HARD; >>>> + >>>> + rmap = (void *) vmalloc_to_phys(rmap); >>>> + >>>> + /* >>>> + * Synchronize with the MMU notifier callbacks in >>>> + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). >>>> + * While we have the rmap lock, code running on other CPUs >>>> + * cannot finish unmapping the host real page that backs >>>> + * this guest real page, so we are OK to access the host >>>> + * real page. >>>> + */ >>>> + lock_rmap(rmap); >>>> + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { >>>> + ret = H_TOO_HARD; >>>> + goto unlock_exit; >>>> + } >>>> } >>>> >>>> for (i = 0; i < npages; ++i) { >>>> @@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, >>>> } >>>> >>>> unlock_exit: >>>> - unlock_rmap(rmap); >>>> + if (rmap) >>> >>> I don't see where rmap is initialized to NULL in the case where it's >>> not being used. >> >> @rmap is not new to this function, and it has always been initialized to >> NULL as it was returned via a pointer from kvmppc_gpa_to_ua(). > > This comment confuses me. Looking closer at the code I see you're > right, and it's initialized to NULL where defined, which I missed. > > But that has nothing to do with being returned by pointer from > kvmppc_gpa_to_ua(), since one of your branches in the new code no > longer passes &rmap to that function. So? The code is still correct - the "preregistered branch" does not touch NULL pointer, it remains NULL and unlock_rmap() is not called. I agree the patch is not the easiest to read but how can I improve it to get your "rb"? Replace "if(rmap)" with "if (kvmppc_preregistered(vcpu))"? Move that loop between lock_rmap/unlock_rmap to a helper? kvmppc_rm_h_put_tce_indirect() is not big enough to justify splitting, the comments inside it are though...
On Wed, Mar 09, 2016 at 07:55:53PM +1100, Alexey Kardashevskiy wrote: > On 03/08/2016 05:30 PM, David Gibson wrote: > >On Tue, Mar 08, 2016 at 04:47:20PM +1100, Alexey Kardashevskiy wrote: > >>On 03/07/2016 05:00 PM, David Gibson wrote: > >>>On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote: > >>>>VFIO on sPAPR already implements guest memory pre-registration > >>>>when the entire guest RAM gets pinned. This can be used to translate > >>>>the physical address of a guest page containing the TCE list > >>>>from H_PUT_TCE_INDIRECT. > >>>> > >>>>This makes use of the pre-registrered memory API to access TCE list > >>>>pages in order to avoid unnecessary locking on the KVM memory > >>>>reverse map. > >>>> > >>>>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > >>> > >>>Ok.. so, what's the benefit of not having to lock the rmap? > >> > >>Less locking -> less racing == good, no? > > > >Well.. maybe. The increased difficulty in verifying that the code is > >correct isn't always a good price to pay. > > > >>>>--- > >>>> arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++------- > >>>> 1 file changed, 70 insertions(+), 16 deletions(-) > >>>> > >>>>diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c > >>>>index 44be73e..af155f6 100644 > >>>>--- a/arch/powerpc/kvm/book3s_64_vio_hv.c > >>>>+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c > >>>>@@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, > >>>> EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); > >>>> > >>>> #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > >>>>+static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu) > >>>>+{ > >>>>+ struct task_struct *task; > >>>>+ > >>>>+ task = vcpu->arch.run_task; > >>>>+ if (unlikely(!task || !task->mm)) > >>>>+ return NULL; > >>>>+ > >>>>+ return &task->mm->context; > >>>>+} > >>>>+ > >>>>+static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu) > >>>>+{ > >>>>+ mm_context_t *mm = kvmppc_mm_context(vcpu); > >>>>+ > >>>>+ if (unlikely(!mm)) > >>>>+ return false; > >>>>+ > >>>>+ return mm_iommu_preregistered(mm); > >>>>+} > >>>>+ > >>>>+static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup( > >>>>+ struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size) > >>>>+{ > >>>>+ mm_context_t *mm = kvmppc_mm_context(vcpu); > >>>>+ > >>>>+ if (unlikely(!mm)) > >>>>+ return NULL; > >>>>+ > >>>>+ return mm_iommu_lookup_rm(mm, ua, size); > >>>>+} > >>>>+ > >>>> long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, > >>>> unsigned long ioba, unsigned long tce) > >>>> { > >>>>@@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, > >>>> if (ret != H_SUCCESS) > >>>> return ret; > >>>> > >>>>- if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) > >>>>- return H_TOO_HARD; > >>>>+ if (kvmppc_preregistered(vcpu)) { > >>>>+ /* > >>>>+ * We get here if guest memory was pre-registered which > >>>>+ * is normally VFIO case and gpa->hpa translation does not > >>>>+ * depend on hpt. > >>>>+ */ > >>>>+ struct mm_iommu_table_group_mem_t *mem; > >>>> > >>>>- rmap = (void *) vmalloc_to_phys(rmap); > >>>>+ if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) > >>>>+ return H_TOO_HARD; > >>>> > >>>>- /* > >>>>- * Synchronize with the MMU notifier callbacks in > >>>>- * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). > >>>>- * While we have the rmap lock, code running on other CPUs > >>>>- * cannot finish unmapping the host real page that backs > >>>>- * this guest real page, so we are OK to access the host > >>>>- * real page. > >>>>- */ > >>>>- lock_rmap(rmap); > >>>>- if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { > >>>>- ret = H_TOO_HARD; > >>>>- goto unlock_exit; > >>>>+ mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K); > >>>>+ if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces)) > >>>>+ return H_TOO_HARD; > >>>>+ } else { > >>>>+ /* > >>>>+ * This is emulated devices case. > >>>>+ * We do not require memory to be preregistered in this case > >>>>+ * so lock rmap and do __find_linux_pte_or_hugepte(). > >>>>+ */ > >>>>+ if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) > >>>>+ return H_TOO_HARD; > >>>>+ > >>>>+ rmap = (void *) vmalloc_to_phys(rmap); > >>>>+ > >>>>+ /* > >>>>+ * Synchronize with the MMU notifier callbacks in > >>>>+ * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). > >>>>+ * While we have the rmap lock, code running on other CPUs > >>>>+ * cannot finish unmapping the host real page that backs > >>>>+ * this guest real page, so we are OK to access the host > >>>>+ * real page. > >>>>+ */ > >>>>+ lock_rmap(rmap); > >>>>+ if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { > >>>>+ ret = H_TOO_HARD; > >>>>+ goto unlock_exit; > >>>>+ } > >>>> } > >>>> > >>>> for (i = 0; i < npages; ++i) { > >>>>@@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, > >>>> } > >>>> > >>>> unlock_exit: > >>>>- unlock_rmap(rmap); > >>>>+ if (rmap) > >>> > >>>I don't see where rmap is initialized to NULL in the case where it's > >>>not being used. > >> > >>@rmap is not new to this function, and it has always been initialized to > >>NULL as it was returned via a pointer from kvmppc_gpa_to_ua(). > > > >This comment confuses me. Looking closer at the code I see you're > >right, and it's initialized to NULL where defined, which I missed. > > > >But that has nothing to do with being returned by pointer from > >kvmppc_gpa_to_ua(), since one of your branches in the new code no > >longer passes &rmap to that function. > > > So? The code is still correct - the "preregistered branch" does not touch > NULL pointer, it remains NULL and unlock_rmap() is not called. I agree the > patch is not the easiest to read but how can I improve it to get your "rb"? > Replace "if(rmap)" with "if (kvmppc_preregistered(vcpu))"? Move that loop > between lock_rmap/unlock_rmap to a helper? kvmppc_rm_h_put_tce_indirect() is > not big enough to justify splitting, the comments inside it are though... Sorry, I wasn't clear. I no longer have a specific objection. I left out the R-b, since there have been enough other comments on the series that I was expecting a respin, so I was planning to re-review in the context of an updated series.
On Mon, Mar 07, 2016 at 05:00:14PM +1100, David Gibson wrote: > On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote: > > VFIO on sPAPR already implements guest memory pre-registration > > when the entire guest RAM gets pinned. This can be used to translate > > the physical address of a guest page containing the TCE list > > from H_PUT_TCE_INDIRECT. > > > > This makes use of the pre-registrered memory API to access TCE list > > pages in order to avoid unnecessary locking on the KVM memory > > reverse map. > > > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > > Ok.. so, what's the benefit of not having to lock the rmap? It's not primarily about locking or not locking the rmap. The point is that when memory is pre-registered, we know that all of guest memory is pinned and we have a flat array mapping GPA to HPA. It's simpler and quicker to index into that array (even with looking up the kernel page tables in vmalloc_to_phys) than it is to find the memslot, lock the rmap entry, look up the user page tables, and unlock the rmap entry. We were only locking the rmap entry to stop the page being unmapped and reallocated to something else, but if memory is pre-registered, it's all pinned, so it can't be reallocated. Paul. -- To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Mar 10, 2016 at 07:33:05PM +1100, Paul Mackerras wrote: > On Mon, Mar 07, 2016 at 05:00:14PM +1100, David Gibson wrote: > > On Mon, Mar 07, 2016 at 02:41:11PM +1100, Alexey Kardashevskiy wrote: > > > VFIO on sPAPR already implements guest memory pre-registration > > > when the entire guest RAM gets pinned. This can be used to translate > > > the physical address of a guest page containing the TCE list > > > from H_PUT_TCE_INDIRECT. > > > > > > This makes use of the pre-registrered memory API to access TCE list > > > pages in order to avoid unnecessary locking on the KVM memory > > > reverse map. > > > > > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > > > > Ok.. so, what's the benefit of not having to lock the rmap? > > It's not primarily about locking or not locking the rmap. The point > is that when memory is pre-registered, we know that all of guest > memory is pinned and we have a flat array mapping GPA to HPA. It's > simpler and quicker to index into that array (even with looking up the > kernel page tables in vmalloc_to_phys) than it is to find the memslot, > lock the rmap entry, look up the user page tables, and unlock the rmap > entry. We were only locking the rmap entry to stop the page being > unmapped and reallocated to something else, but if memory is > pre-registered, it's all pinned, so it can't be reallocated. Ok, that makes sense. Alexey, can you fold some of that rationale into the commit message?
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 44be73e..af155f6 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -180,6 +180,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static mm_context_t *kvmppc_mm_context(struct kvm_vcpu *vcpu) +{ + struct task_struct *task; + + task = vcpu->arch.run_task; + if (unlikely(!task || !task->mm)) + return NULL; + + return &task->mm->context; +} + +static inline bool kvmppc_preregistered(struct kvm_vcpu *vcpu) +{ + mm_context_t *mm = kvmppc_mm_context(vcpu); + + if (unlikely(!mm)) + return false; + + return mm_iommu_preregistered(mm); +} + +static struct mm_iommu_table_group_mem_t *kvmppc_rm_iommu_lookup( + struct kvm_vcpu *vcpu, unsigned long ua, unsigned long size) +{ + mm_context_t *mm = kvmppc_mm_context(vcpu); + + if (unlikely(!mm)) + return NULL; + + return mm_iommu_lookup_rm(mm, ua, size); +} + long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { @@ -261,23 +293,44 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) return ret; - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) - return H_TOO_HARD; + if (kvmppc_preregistered(vcpu)) { + /* + * We get here if guest memory was pre-registered which + * is normally VFIO case and gpa->hpa translation does not + * depend on hpt. + */ + struct mm_iommu_table_group_mem_t *mem; - rmap = (void *) vmalloc_to_phys(rmap); + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) + return H_TOO_HARD; - /* - * Synchronize with the MMU notifier callbacks in - * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). - * While we have the rmap lock, code running on other CPUs - * cannot finish unmapping the host real page that backs - * this guest real page, so we are OK to access the host - * real page. - */ - lock_rmap(rmap); - if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { - ret = H_TOO_HARD; - goto unlock_exit; + mem = kvmppc_rm_iommu_lookup(vcpu, ua, IOMMU_PAGE_SIZE_4K); + if (!mem || mm_iommu_rm_ua_to_hpa(mem, ua, &tces)) + return H_TOO_HARD; + } else { + /* + * This is emulated devices case. + * We do not require memory to be preregistered in this case + * so lock rmap and do __find_linux_pte_or_hugepte(). + */ + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + return H_TOO_HARD; + + rmap = (void *) vmalloc_to_phys(rmap); + + /* + * Synchronize with the MMU notifier callbacks in + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * While we have the rmap lock, code running on other CPUs + * cannot finish unmapping the host real page that backs + * this guest real page, so we are OK to access the host + * real page. + */ + lock_rmap(rmap); + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { + ret = H_TOO_HARD; + goto unlock_exit; + } } for (i = 0; i < npages; ++i) { @@ -291,7 +344,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, } unlock_exit: - unlock_rmap(rmap); + if (rmap) + unlock_rmap(rmap); return ret; }
VFIO on sPAPR already implements guest memory pre-registration when the entire guest RAM gets pinned. This can be used to translate the physical address of a guest page containing the TCE list from H_PUT_TCE_INDIRECT. This makes use of the pre-registrered memory API to access TCE list pages in order to avoid unnecessary locking on the KVM memory reverse map. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> --- arch/powerpc/kvm/book3s_64_vio_hv.c | 86 ++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 16 deletions(-)