diff mbox

[4/7] KVM: PPC: Book3S HV: Fix dirty map for hugepages

Message ID 1400919721-14264-5-git-send-email-paulus@samba.org
State New, archived
Headers show

Commit Message

Paul Mackerras May 24, 2014, 8:21 a.m. UTC
From: Alexey Kardashevskiy <aik@ozlabs.ru>

The dirty map that we construct for the KVM_GET_DIRTY_LOG ioctl has
one bit per system page (4K/64K).  Currently, we only set one bit in
the map for each HPT entry with the Change bit set, even if the HPT is
for a large page (e.g., 16MB).  Userspace then considers only the
first system page dirty, though in fact the guest may have modified
anywhere in the large page.

To fix this, we make kvm_test_clear_dirty() return the actual number
of pages that are dirty.  In kvmppc_hv_get_dirty_log() we then set
that many bits in the dirty map.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

Comments

Alexander Graf May 25, 2014, 10 a.m. UTC | #1
On 24.05.14 10:21, Paul Mackerras wrote:
> From: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> The dirty map that we construct for the KVM_GET_DIRTY_LOG ioctl has
> one bit per system page (4K/64K).  Currently, we only set one bit in
> the map for each HPT entry with the Change bit set, even if the HPT is
> for a large page (e.g., 16MB).  Userspace then considers only the
> first system page dirty, though in fact the guest may have modified
> anywhere in the large page.
>
> To fix this, we make kvm_test_clear_dirty() return the actual number
> of pages that are dirty.  In kvmppc_hv_get_dirty_log() we then set
> that many bits in the dirty map.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
>   arch/powerpc/kvm/book3s_64_mmu_hv.c | 14 ++++++++++----
>   1 file changed, 10 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index f32896f..d605dc24 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -1053,6 +1053,7 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
>   {
>   	struct revmap_entry *rev = kvm->arch.revmap;
>   	unsigned long head, i, j;
> +	unsigned long n;
>   	unsigned long *hptep;
>   	int ret = 0;
>   
> @@ -1095,7 +1096,10 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
>   				rev[i].guest_rpte |= HPTE_R_C;
>   				note_hpte_modification(kvm, &rev[i]);
>   			}
> -			ret = 1;
> +			n = hpte_page_size(hptep[0], hptep[1]);
> +			n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +			if (n > ret)
> +				ret = n;

Please document in the function header that the return value is the 
number of pages that are dirty. Alternatively rename the function.

>   		}
>   		hptep[0] &= ~HPTE_V_HVLOCK;
>   	} while ((i = j) != head);
> @@ -1125,15 +1129,17 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
>   long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
>   			     unsigned long *map)
>   {
> -	unsigned long i;
> +	unsigned long i, j;
>   	unsigned long *rmapp;
>   	struct kvm_vcpu *vcpu;
>   
>   	preempt_disable();
>   	rmapp = memslot->arch.rmap;
>   	for (i = 0; i < memslot->npages; ++i) {
> -		if (kvm_test_clear_dirty(kvm, rmapp) && map)
> -			__set_bit_le(i, map);
> +		int ret = kvm_test_clear_dirty(kvm, rmapp);

Please give this one a better name. npages maybe?

> +		if (ret && map)
> +			for (j = i - (i % ret); ret; ++j, --ret)

Why do we go down from the index?


Alex

> +				__set_bit_le(j, map);
>   		++rmapp;
>   	}
>   

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras May 26, 2014, 3:30 a.m. UTC | #2
On Sun, May 25, 2014 at 12:00:48PM +0200, Alexander Graf wrote:
> 
> Please document in the function header that the return value is the number
> of pages that are dirty. Alternatively rename the function.

OK.

> >  	for (i = 0; i < memslot->npages; ++i) {
> >-		if (kvm_test_clear_dirty(kvm, rmapp) && map)
> >-			__set_bit_le(i, map);
> >+		int ret = kvm_test_clear_dirty(kvm, rmapp);
> 
> Please give this one a better name. npages maybe?

OK.

> >+		if (ret && map)
> >+			for (j = i - (i % ret); ret; ++j, --ret)
> 
> Why do we go down from the index?

When a large page is faulted in by kvmppc_book3s_hv_page_fault(), we
put the guest HPTE in the rmap chain corresponding to the guest
physical address that the guest was trying to access, not the GPA of
the start of the large page.  So we need i - (i % ret) to get the
index of the first system page in the large page.

However, on reflection I think that we should always put the guest
HPTE in the rmap chain corresponding to the base address of the large
page.  In that case i % ret would always be zero.  I'll repost.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index f32896f..d605dc24 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1053,6 +1053,7 @@  static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
 {
 	struct revmap_entry *rev = kvm->arch.revmap;
 	unsigned long head, i, j;
+	unsigned long n;
 	unsigned long *hptep;
 	int ret = 0;
 
@@ -1095,7 +1096,10 @@  static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
 				rev[i].guest_rpte |= HPTE_R_C;
 				note_hpte_modification(kvm, &rev[i]);
 			}
-			ret = 1;
+			n = hpte_page_size(hptep[0], hptep[1]);
+			n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
+			if (n > ret)
+				ret = n;
 		}
 		hptep[0] &= ~HPTE_V_HVLOCK;
 	} while ((i = j) != head);
@@ -1125,15 +1129,17 @@  static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
 long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			     unsigned long *map)
 {
-	unsigned long i;
+	unsigned long i, j;
 	unsigned long *rmapp;
 	struct kvm_vcpu *vcpu;
 
 	preempt_disable();
 	rmapp = memslot->arch.rmap;
 	for (i = 0; i < memslot->npages; ++i) {
-		if (kvm_test_clear_dirty(kvm, rmapp) && map)
-			__set_bit_le(i, map);
+		int ret = kvm_test_clear_dirty(kvm, rmapp);
+		if (ret && map)
+			for (j = i - (i % ret); ret; ++j, --ret)
+				__set_bit_le(j, map);
 		++rmapp;
 	}