Patchwork [4/4] kvm: powerpc: set cache coherency only for RAM pages

Submitter Bharat Bhushan
Date July 30, 2013, 4:22 p.m.
Message ID <6A3DF150A5B70D4F9B66A25E3F7C888D070EBC3F@039-SN2MPN1-013.039d.mgd.msft.net>
Permalink /patch/263442/
State New

Comments

Bharat Bhushan - July 30, 2013, 4:22 p.m.
> -----Original Message-----
> From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]
> Sent: Saturday, July 27, 2013 3:57 AM
> To: Bhushan Bharat-R65777
> Cc: Alexander Graf; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood Scott-B07421
> Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
>
> On Fri, 2013-07-26 at 15:03 +0000, Bhushan Bharat-R65777 wrote:
> > Will searching the Linux PTE not be overkill?
>
> That's the best approach. Also, we are already searching it to resolve the
> page fault. That does mean we search twice, but on the other hand it also
> means it's hot in the cache.



Below is an early git diff (not a proper cleanup patch), to make sure this is what we want on PowerPC and to gather early feedback. I also ran some benchmarks to understand the overhead, if any.

Using kvm_is_mmio_pfn() (what the current patch does):
Real: 0m46.616s + 0m49.517s + 0m49.510s + 0m46.936s + 0m46.889s + 0m46.684s = Avg: 47.692s
User: 0m31.636s + 0m31.816s + 0m31.456s + 0m31.752s + 0m32.028s + 0m31.848s = Avg: 31.756s
Sys:  0m11.596s + 0m11.868s + 0m12.244s + 0m11.672s + 0m11.356s + 0m11.432s = Avg: 11.695s

Using kernel page table search (the changes below):
Real: 0m46.431s + 0m50.269s + 0m46.724s + 0m46.645s + 0m46.670s + 0m50.259s = Avg: 47.833s
User: 0m31.568s + 0m31.816s + 0m31.444s + 0m31.808s + 0m31.312s + 0m31.740s = Avg: 31.614s
Sys:  0m11.516s + 0m12.060s + 0m11.872s + 0m11.476s + 0m12.000s + 0m12.152s = Avg: 11.846s
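
(Each "Avg" is the arithmetic mean of the six runs; for the first Real row, for example: (46.616 + 49.517 + 49.510 + 46.936 + 46.889 + 46.684) / 6 ≈ 47.692s. The two approaches differ by about 0.14s of real time, roughly 0.3%, so the extra page-table walk appears to be in the noise.)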



> Cheers,
> Ben
Bharat Bhushan - July 31, 2013, 5:23 a.m.
> -----Original Message-----
> From: Wood Scott-B07421
> Sent: Wednesday, July 31, 2013 12:19 AM
> To: Bhushan Bharat-R65777
> Cc: Benjamin Herrenschmidt; Alexander Graf; kvm-ppc@vger.kernel.org;
> kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Wood Scott-B07421
> Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
> 
> On 07/30/2013 11:22:54 AM, Bhushan Bharat-R65777 wrote:
> > diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
> > index 5cbdc8f..a48c13f 100644
> > --- a/arch/powerpc/kvm/e500_mmu_host.c
> > +++ b/arch/powerpc/kvm/e500_mmu_host.c
> > @@ -40,6 +40,84 @@
> >
> >  static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
> >
> > +/*
> > + * find_linux_pte returns the address of a linux pte for a given
> > + * effective address and directory.  If not found, it returns zero.
> > + */
> > +static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
> > +{
> > +        pgd_t *pg;
> > +        pud_t *pu;
> > +        pmd_t *pm;
> > +        pte_t *pt = NULL;
> > +
> > +        pg = pgdir + pgd_index(ea);
> > +        if (!pgd_none(*pg)) {
> > +                pu = pud_offset(pg, ea);
> > +                if (!pud_none(*pu)) {
> > +                        pm = pmd_offset(pu, ea);
> > +                        if (pmd_present(*pm))
> > +                                pt = pte_offset_kernel(pm, ea);
> > +                }
> > +        }
> > +        return pt;
> > +}
> 
> How is this specific to KVM or e500?
> 
> > +#ifdef CONFIG_HUGETLB_PAGE
> > +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
> > +                                 unsigned *shift);
> > +#else
> > +static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
> > +                                               unsigned *shift)
> > +{
> > +        if (shift)
> > +                *shift = 0;
> > +        return find_linux_pte(pgdir, ea);
> > +}
> > +#endif /* !CONFIG_HUGETLB_PAGE */
> 
> This is already declared in asm/pgtable.h.  If we need a non-hugepage
> alternative, that should also go in asm/pgtable.h.
> 
> > +/*
> > + * Lock and read a linux PTE.  If it's present and writable, atomically
> > + * set dirty and referenced bits and return the PTE, otherwise return 0.
> > + */
> > +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
> > +{
> > +       pte_t pte = pte_val(*p);
> > +
> > +       if (pte_present(pte)) {
> > +               pte = pte_mkyoung(pte);
> > +               if (writing && pte_write(pte))
> > +                       pte = pte_mkdirty(pte);
> > +       }
> > +
> > +       *p = pte;
> > +
> > +       return pte;
> > +}
> > +
> > +static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
> > +                             int writing, unsigned long *pte_sizep)
> > +{
> > +       pte_t *ptep;
> > +       unsigned long ps = *pte_sizep;
> > +       unsigned int shift;
> > +
> > +       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
> > +       if (!ptep)
> > +               return __pte(0);
> > +       if (shift)
> > +               *pte_sizep = 1ul << shift;
> > +       else
> > +               *pte_sizep = PAGE_SIZE;
> > +
> > +       if (ps > *pte_sizep)
> > +               return __pte(0);
> > +       if (!pte_present(*ptep))
> > +               return __pte(0);
> > +
> > +       return kvmppc_read_update_linux_pte(ptep, writing);
> > +}
> > +
> 
> None of this belongs in this file either.
> 
> > @@ -326,8 +405,8 @@ static void kvmppc_e500_setup_stlbe(
> >
> >         /* Force IPROT=0 for all guest mappings. */
> >         stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
> > -       stlbe->mas2 = (gvaddr & MAS2_EPN) |
> > -                     e500_shadow_mas2_attrib(gtlbe->mas2, pfn);
> > +       stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_WIMGE_MASK);
> > +//                   e500_shadow_mas2_attrib(gtlbe->mas2, pfn);
> 
> MAS2_E and MAS2_G should be safe to come from the guest.

This is handled when setting WIMGE in ref->flags.
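
For reference, these are the relevant lines of the diff below: the guest's E and G attribute bits (via MAS2_ATTRIB_MASK) are merged into ref->flags alongside the WIMG bits read from the Linux PTE, and the combined set is later copied into MAS2:

        ref->flags |= E500_TLB_VALID;
        ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg;
        ...
        stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_WIMGE_MASK);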

> 
> How does this work for TLB1?  One ref corresponds to one guest entry, which may
> correspond to multiple host entries, potentially each with different WIM
> settings.

Yes, one ref corresponds to one guest entry. To see how this works when one guest TLB1 entry maps to many host TLB0/1 entries:
On a guest tlbwe, KVM sets up the guest TLB entry and pre-maps one host TLB entry (out of the many), and ref (ref->pfn etc.) points to that pre-mapped entry for the guest entry.
When a later guest TLB miss hits the same guest TLB entry but demands another host TLB entry, we overwrite ref (ref->pfn etc.) to point to the new host mapping for the same guest mapping.
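
A rough sketch of that flow, as comments (not actual code):

        /*
         * guest tlbwe
         *   -> KVM writes the guest TLB1 entry
         *   -> kvmppc_e500_shadow_map() pre-maps ONE host entry;
         *      kvmppc_e500_ref_setup() records its pfn/WIMG in ref
         *
         * later guest TLB miss on the same guest entry, other page
         *   -> kvmppc_e500_shadow_map() runs again; ref->pfn and
         *      ref->flags are simply overwritten with the new host
         *      mapping for the same guest mapping
         */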

> 
> >         stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
> >                         e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
> >
> > @@ -346,6 +425,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
> >         unsigned long hva;
> >         int pfnmap = 0;
> >         int tsize = BOOK3E_PAGESZ_4K;
> > +       pte_t pte;
> > +       int wimg = 0;
> >
> >         /*
> >          * Translate guest physical to true physical, acquiring
> > @@ -451,6 +532,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
> >
> >         if (likely(!pfnmap)) {
> >                 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
> > +               pgd_t *pgdir;
> > +
> >                 pfn = gfn_to_pfn_memslot(slot, gfn);
> >                 if (is_error_noslot_pfn(pfn)) {
> >                         printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
> > @@ -461,9 +544,15 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
> >                 /* Align guest and physical address to page map boundaries */
> >                 pfn &= ~(tsize_pages - 1);
> >                 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
> > +               pgdir = vcpu_e500->vcpu.arch.pgdir;
> > +               pte = lookup_linux_pte(pgdir, hva, 1, &tsize_pages);
> > +               if (pte_present(pte))
> > +                       wimg = (pte >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
> > +               else
> > +                       wimg = MAS2_I | MAS2_G;
> 
> If the PTE is not present, then we can't map it, right?

Right, we should return an error :)

-Bharat

>  So why I+G?
> 
> -Scott
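
A minimal sketch of that corrected error path (hypothetical, not part of the posted diff; it assumes kvmppc_e500_shadow_map() propagates the failure the same way it does for gfn_to_pfn_memslot() errors):

                pte = lookup_linux_pte(pgdir, hva, 1, &tsize_pages);
                if (!pte_present(pte)) {
                        /* No Linux mapping: refuse to map instead of forcing I+G */
                        kvm_release_pfn_clean(pfn);
                        return -EINVAL;
                }
                wimg = (pte >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;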


Patch

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 3328353..d6d0dac 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -532,6 +532,7 @@ struct kvm_vcpu_arch {
        u32 epr;
        u32 crit_save;
        struct kvmppc_booke_debug_reg dbg_reg;
+       pgd_t *pgdir;
 #endif
        gpa_t paddr_accessed;
        gva_t vaddr_accessed;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 17722d8..ebcccc2 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -697,7 +697,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 #endif

        kvmppc_fix_ee_before_entry();
-
+       vcpu->arch.pgdir = current->mm->pgd;
        ret = __kvmppc_vcpu_run(kvm_run, vcpu);

        /* No need for kvm_guest_exit. It's done in handle_exit.
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 4fd9650..fc4b2f6 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -31,11 +31,13 @@ enum vcpu_ftr {
 #define E500_TLB_NUM   2

 /* entry is mapped somewhere in host TLB */
-#define E500_TLB_VALID         (1 << 0)
+#define E500_TLB_VALID         (1 << 31)
 /* TLB1 entry is mapped by host TLB1, tracked by bitmaps */
-#define E500_TLB_BITMAP                (1 << 1)
+#define E500_TLB_BITMAP                (1 << 30)
 /* TLB1 entry is mapped by host TLB0 */
-#define E500_TLB_TLB0          (1 << 2)
+#define E500_TLB_TLB0          (1 << 29)
+/* Lower 5 bits have WIMGE value */
+#define E500_TLB_WIMGE_MASK    (0x1f)

 struct tlbe_ref {
        pfn_t pfn;              /* valid only for TLB0, except briefly */
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 5cbdc8f..a48c13f 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -40,6 +40,84 @@

 static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];

+/*
+ * find_linux_pte returns the address of a linux pte for a given
+ * effective address and directory.  If not found, it returns zero.
+ */
+static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
+{
+        pgd_t *pg;
+        pud_t *pu;
+        pmd_t *pm;
+        pte_t *pt = NULL;
+
+        pg = pgdir + pgd_index(ea);
+        if (!pgd_none(*pg)) {
+                pu = pud_offset(pg, ea);
+                if (!pud_none(*pu)) {
+                        pm = pmd_offset(pu, ea);
+                        if (pmd_present(*pm))
+                                pt = pte_offset_kernel(pm, ea);
+                }
+        }
+        return pt;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+                                 unsigned *shift);
+#else
+static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+                                               unsigned *shift)
+{
+        if (shift)
+                *shift = 0;
+        return find_linux_pte(pgdir, ea);
+}
+#endif /* !CONFIG_HUGETLB_PAGE */
+
+/*
+ * Lock and read a linux PTE.  If it's present and writable, atomically
+ * set dirty and referenced bits and return the PTE, otherwise return 0.
+ */
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
+{
+       pte_t pte = pte_val(*p);
+
+       if (pte_present(pte)) {
+               pte = pte_mkyoung(pte);
+               if (writing && pte_write(pte))
+                       pte = pte_mkdirty(pte);
+       }
+
+       *p = pte;
+
+       return pte;
+}
+
+static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
+                             int writing, unsigned long *pte_sizep)
+{
+       pte_t *ptep;
+       unsigned long ps = *pte_sizep;
+       unsigned int shift;
+
+       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
+       if (!ptep)
+               return __pte(0);
+       if (shift)
+               *pte_sizep = 1ul << shift;
+       else
+               *pte_sizep = PAGE_SIZE;
+
+       if (ps > *pte_sizep)
+               return __pte(0);
+       if (!pte_present(*ptep))
+               return __pte(0);
+
+       return kvmppc_read_update_linux_pte(ptep, writing);
+}
+
 static inline unsigned int tlb1_max_shadow_size(void)
 {
        /* reserve one entry for magic page */
@@ -262,10 +340,11 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)

 static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
                                         struct kvm_book3e_206_tlb_entry *gtlbe,
-                                        pfn_t pfn)
+                                        pfn_t pfn, int wimg)
 {
        ref->pfn = pfn;
        ref->flags |= E500_TLB_VALID;
+       ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg;

        if (tlbe_is_writable(gtlbe))
                kvm_set_pfn_dirty(pfn);
@@ -326,8 +405,8 @@ static void kvmppc_e500_setup_stlbe(

        /* Force IPROT=0 for all guest mappings. */
        stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
-       stlbe->mas2 = (gvaddr & MAS2_EPN) |
-                     e500_shadow_mas2_attrib(gtlbe->mas2, pfn);
+       stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_WIMGE_MASK);
+//                   e500_shadow_mas2_attrib(gtlbe->mas2, pfn);
        stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
                        e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);

@@ -346,6 +425,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
        unsigned long hva;
        int pfnmap = 0;
        int tsize = BOOK3E_PAGESZ_4K;
+       pte_t pte;
+       int wimg = 0;

        /*
         * Translate guest physical to true physical, acquiring
@@ -451,6 +532,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,

        if (likely(!pfnmap)) {
                unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
+               pgd_t *pgdir;
+
                pfn = gfn_to_pfn_memslot(slot, gfn);
                if (is_error_noslot_pfn(pfn)) {
                        printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
@@ -461,9 +544,15 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
                /* Align guest and physical address to page map boundaries */
                pfn &= ~(tsize_pages - 1);
                gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+               pgdir = vcpu_e500->vcpu.arch.pgdir;
+               pte = lookup_linux_pte(pgdir, hva, 1, &tsize_pages);
+               if (pte_present(pte))
+                       wimg = (pte >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
+               else
+                       wimg = MAS2_I | MAS2_G;
        }

-       kvmppc_e500_ref_setup(ref, gtlbe, pfn);
+       kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);

        kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
                                ref, gvaddr, stlbe);