Patchwork [5/6,v2] kvm: powerpc: booke: Add linux pte lookup like booke3s

login
register
mail settings
Submitter Bharat Bhushan
Date Aug. 5, 2013, 2:27 p.m.
Message ID <6A3DF150A5B70D4F9B66A25E3F7C888D070FAD49@039-SN2MPN1-012.039d.mgd.msft.net>
Download mbox | patch
Permalink /patch/264693/
State New
Headers show

Comments

Bharat Bhushan - Aug. 5, 2013, 2:27 p.m.
> -----Original Message-----

> From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]

> Sent: Saturday, August 03, 2013 9:54 AM

> To: Bhushan Bharat-R65777

> Cc: Wood Scott-B07421; agraf@suse.de; kvm-ppc@vger.kernel.org;

> kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org

> Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like

> booke3s

> 

> On Sat, 2013-08-03 at 02:58 +0000, Bhushan Bharat-R65777 wrote:

> > One of the problem I saw was that if I put this code in

> > asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and other

> > friend function (on which this code depends) are defined in pgtable.h.

> > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h before it

> > defines pte_present() and friends functions.

> >

> > Ok I move wove this in asm/pgtable*.h, initially I fought with myself

> > to take this code in pgtable* but finally end up doing here (got

> > biased by book3s :)).

> 

> Is there a reason why these routines can not be completely generic in pgtable.h

> ?


How about the generic function:
Scott Wood - Aug. 5, 2013, 7:19 p.m.
On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote:
> 
> > -----Original Message-----
> > From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]
> > Sent: Saturday, August 03, 2013 9:54 AM
> > To: Bhushan Bharat-R65777
> > Cc: Wood Scott-B07421; agraf@suse.de; kvm-ppc@vger.kernel.org;
> > kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org
> > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like
> > booke3s
> > 
> > On Sat, 2013-08-03 at 02:58 +0000, Bhushan Bharat-R65777 wrote:
> > > One of the problem I saw was that if I put this code in
> > > asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and other
> > > friend function (on which this code depends) are defined in pgtable.h.
> > > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h before it
> > > defines pte_present() and friends functions.
> > >
> > > Ok I move wove this in asm/pgtable*.h, initially I fought with myself
> > > to take this code in pgtable* but finally end up doing here (got
> > > biased by book3s :)).
> > 
> > Is there a reason why these routines can not be completely generic in pgtable.h
> > ?
> 
> How about the generic function:
> 
> diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
> index d257d98..21daf28 100644
> --- a/arch/powerpc/include/asm/pgtable-ppc64.h
> +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
> @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct mm_struct *mm,
>         return old;
>  }
> 
> +static inline unsigned long pte_read(pte_t *p)
> +{
> +#ifdef PTE_ATOMIC_UPDATES
> +       pte_t pte;
> +       pte_t tmp;
> +       __asm__ __volatile__ (
> +       "1:     ldarx   %0,0,%3\n"
> +       "       andi.   %1,%0,%4\n"
> +       "       bne-    1b\n"
> +       "       ori     %1,%0,%4\n"
> +       "       stdcx.  %1,0,%3\n"
> +       "       bne-    1b"
> +       : "=&r" (pte), "=&r" (tmp), "=m" (*p)
> +       : "r" (p), "i" (_PAGE_BUSY)
> +       : "cc");
> +
> +       return pte;
> +#else  
> +       return pte_val(*p);
> +#endif
> +#endif
> +}
>  static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
>                                               unsigned long addr, pte_t *ptep)

Please leave a blank line between functions.

>  {
> diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
> index 690c8c2..dad712c 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -254,6 +254,45 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
>  }
>  #endif /* !CONFIG_HUGETLB_PAGE */
> 
> +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
> +                                    int writing, unsigned long *pte_sizep)

The name implies that it just reads the PTE.  Setting accessed/dirty
shouldn't be an undocumented side-effect.  Why can't the caller do that
(or a different function that the caller calls afterward if desired)?  

Though even then you have the undocumented side effect of locking the
PTE on certain targets.

> +{
> +       pte_t *ptep;
> +       pte_t pte;
> +       unsigned long ps = *pte_sizep;
> +       unsigned int shift;
> +
> +       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
> +       if (!ptep)
> +               return __pte(0);
> +       if (shift)
> +               *pte_sizep = 1ul << shift;
> +       else
> +               *pte_sizep = PAGE_SIZE;
> +
> +       if (ps > *pte_sizep)
> +               return __pte(0);
> +
> +       if (!pte_present(*ptep))
> +               return __pte(0);
> +
> +#ifdef CONFIG_PPC64
> +       /* Lock PTE (set _PAGE_BUSY) and read */
> +       pte = pte_read(ptep);
> +#else
> +       pte = pte_val(*ptep);
> +#endif

What about 32-bit platforms that need atomic PTEs?

-Scott



--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan - Aug. 6, 2013, 1:12 a.m.
> -----Original Message-----

> From: Wood Scott-B07421

> Sent: Tuesday, August 06, 2013 12:49 AM

> To: Bhushan Bharat-R65777

> Cc: Benjamin Herrenschmidt; Wood Scott-B07421; agraf@suse.de; kvm-

> ppc@vger.kernel.org; kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org

> Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like

> booke3s

> 

> On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote:

> >

> > > -----Original Message-----

> > > From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]

> > > Sent: Saturday, August 03, 2013 9:54 AM

> > > To: Bhushan Bharat-R65777

> > > Cc: Wood Scott-B07421; agraf@suse.de; kvm-ppc@vger.kernel.org;

> > > kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org

> > > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte

> > > lookup like booke3s

> > >

> > > On Sat, 2013-08-03 at 02:58 +0000, Bhushan Bharat-R65777 wrote:

> > > > One of the problem I saw was that if I put this code in

> > > > asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and other

> > > > friend function (on which this code depends) are defined in pgtable.h.

> > > > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h

> > > > before it defines pte_present() and friends functions.

> > > >

> > > > Ok I move wove this in asm/pgtable*.h, initially I fought with

> > > > myself to take this code in pgtable* but finally end up doing here

> > > > (got biased by book3s :)).

> > >

> > > Is there a reason why these routines can not be completely generic

> > > in pgtable.h ?

> >

> > How about the generic function:

> >

> > diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h

> > b/arch/powerpc/include/asm/pgtable-ppc64.h

> > index d257d98..21daf28 100644

> > --- a/arch/powerpc/include/asm/pgtable-ppc64.h

> > +++ b/arch/powerpc/include/asm/pgtable-ppc64.h

> > @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct mm_struct

> *mm,

> >         return old;

> >  }

> >

> > +static inline unsigned long pte_read(pte_t *p) { #ifdef

> > +PTE_ATOMIC_UPDATES

> > +       pte_t pte;

> > +       pte_t tmp;

> > +       __asm__ __volatile__ (

> > +       "1:     ldarx   %0,0,%3\n"

> > +       "       andi.   %1,%0,%4\n"

> > +       "       bne-    1b\n"

> > +       "       ori     %1,%0,%4\n"

> > +       "       stdcx.  %1,0,%3\n"

> > +       "       bne-    1b"

> > +       : "=&r" (pte), "=&r" (tmp), "=m" (*p)

> > +       : "r" (p), "i" (_PAGE_BUSY)

> > +       : "cc");

> > +

> > +       return pte;

> > +#else

> > +       return pte_val(*p);

> > +#endif

> > +#endif

> > +}

> >  static inline int __ptep_test_and_clear_young(struct mm_struct *mm,

> >                                               unsigned long addr,

> > pte_t *ptep)

> 

> Please leave a blank line between functions.

> 

> >  {

> > diff --git a/arch/powerpc/include/asm/pgtable.h

> > b/arch/powerpc/include/asm/pgtable.h

> > index 690c8c2..dad712c 100644

> > --- a/arch/powerpc/include/asm/pgtable.h

> > +++ b/arch/powerpc/include/asm/pgtable.h

> > @@ -254,6 +254,45 @@ static inline pte_t

> > *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,  }  #endif

> > /* !CONFIG_HUGETLB_PAGE */

> >

> > +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,

> > +                                    int writing, unsigned long

> > +*pte_sizep)

> 

> The name implies that it just reads the PTE.  Setting accessed/dirty shouldn't

> be an undocumented side-effect.


Ok, will rename and document.

> Why can't the caller do that (or a different

> function that the caller calls afterward if desired)?


The current implementation in book3s is;
 1) find a pte/hugepte
 2) return null if pte not present
 3) take _PAGE_BUSY lock
 4) set accessed/dirty
 5) clear _PAGE_BUSY.

What I tried was 
1) find a pte/hugepte
2) return null if pte not present
3) return pte (not take lock by not setting _PAGE_BUSY)

4) then user calls  __ptep_set_access_flags() to atomic update the dirty/accessed flags in pte.

- but the benchmark results were not good
- Also can there be race as we do not take lock in step 3 and update in step 4 ?
  
> 

> Though even then you have the undocumented side effect of locking the PTE on

> certain targets.

> 

> > +{

> > +       pte_t *ptep;

> > +       pte_t pte;

> > +       unsigned long ps = *pte_sizep;

> > +       unsigned int shift;

> > +

> > +       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);

> > +       if (!ptep)

> > +               return __pte(0);

> > +       if (shift)

> > +               *pte_sizep = 1ul << shift;

> > +       else

> > +               *pte_sizep = PAGE_SIZE;

> > +

> > +       if (ps > *pte_sizep)

> > +               return __pte(0);

> > +

> > +       if (!pte_present(*ptep))

> > +               return __pte(0);

> > +

> > +#ifdef CONFIG_PPC64

> > +       /* Lock PTE (set _PAGE_BUSY) and read */

> > +       pte = pte_read(ptep);

> > +#else

> > +       pte = pte_val(*ptep);

> > +#endif

> 

> What about 32-bit platforms that need atomic PTEs?


I called __ptep_set_access_flags() for both 32/64bit (for 64bit I was not calling pte_read()), which handles atomic updates. Somehow the benchmark result were not good, will try again.

Thanks
-Bharat
> 

> -Scott

>
Bharat Bhushan - Aug. 6, 2013, 7:02 a.m.
> -----Original Message-----

> From: Bhushan Bharat-R65777

> Sent: Tuesday, August 06, 2013 6:42 AM

> To: Wood Scott-B07421

> Cc: Benjamin Herrenschmidt; agraf@suse.de; kvm-ppc@vger.kernel.org;

> kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org

> Subject: RE: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like

> booke3s

> 

> 

> 

> > -----Original Message-----

> > From: Wood Scott-B07421

> > Sent: Tuesday, August 06, 2013 12:49 AM

> > To: Bhushan Bharat-R65777

> > Cc: Benjamin Herrenschmidt; Wood Scott-B07421; agraf@suse.de; kvm-

> > ppc@vger.kernel.org; kvm@vger.kernel.org;

> > linuxppc-dev@lists.ozlabs.org

> > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup

> > like booke3s

> >

> > On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote:

> > >

> > > > -----Original Message-----

> > > > From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]

> > > > Sent: Saturday, August 03, 2013 9:54 AM

> > > > To: Bhushan Bharat-R65777

> > > > Cc: Wood Scott-B07421; agraf@suse.de; kvm-ppc@vger.kernel.org;

> > > > kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org

> > > > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte

> > > > lookup like booke3s

> > > >

> > > > On Sat, 2013-08-03 at 02:58 +0000, Bhushan Bharat-R65777 wrote:

> > > > > One of the problem I saw was that if I put this code in

> > > > > asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and

> > > > > other friend function (on which this code depends) are defined in

> pgtable.h.

> > > > > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h

> > > > > before it defines pte_present() and friends functions.

> > > > >

> > > > > Ok I move wove this in asm/pgtable*.h, initially I fought with

> > > > > myself to take this code in pgtable* but finally end up doing

> > > > > here (got biased by book3s :)).

> > > >

> > > > Is there a reason why these routines can not be completely generic

> > > > in pgtable.h ?

> > >

> > > How about the generic function:

> > >

> > > diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h

> > > b/arch/powerpc/include/asm/pgtable-ppc64.h

> > > index d257d98..21daf28 100644

> > > --- a/arch/powerpc/include/asm/pgtable-ppc64.h

> > > +++ b/arch/powerpc/include/asm/pgtable-ppc64.h

> > > @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct

> > > mm_struct

> > *mm,

> > >         return old;

> > >  }

> > >

> > > +static inline unsigned long pte_read(pte_t *p) { #ifdef

> > > +PTE_ATOMIC_UPDATES

> > > +       pte_t pte;

> > > +       pte_t tmp;

> > > +       __asm__ __volatile__ (

> > > +       "1:     ldarx   %0,0,%3\n"

> > > +       "       andi.   %1,%0,%4\n"

> > > +       "       bne-    1b\n"

> > > +       "       ori     %1,%0,%4\n"

> > > +       "       stdcx.  %1,0,%3\n"

> > > +       "       bne-    1b"

> > > +       : "=&r" (pte), "=&r" (tmp), "=m" (*p)

> > > +       : "r" (p), "i" (_PAGE_BUSY)

> > > +       : "cc");

> > > +

> > > +       return pte;

> > > +#else

> > > +       return pte_val(*p);

> > > +#endif

> > > +#endif

> > > +}

> > >  static inline int __ptep_test_and_clear_young(struct mm_struct *mm,

> > >                                               unsigned long addr,

> > > pte_t *ptep)

> >

> > Please leave a blank line between functions.

> >

> > >  {

> > > diff --git a/arch/powerpc/include/asm/pgtable.h

> > > b/arch/powerpc/include/asm/pgtable.h

> > > index 690c8c2..dad712c 100644

> > > --- a/arch/powerpc/include/asm/pgtable.h

> > > +++ b/arch/powerpc/include/asm/pgtable.h

> > > @@ -254,6 +254,45 @@ static inline pte_t

> > > *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,  }

> > > #endif

> > > /* !CONFIG_HUGETLB_PAGE */

> > >

> > > +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,

> > > +                                    int writing, unsigned long

> > > +*pte_sizep)

> >

> > The name implies that it just reads the PTE.  Setting accessed/dirty

> > shouldn't be an undocumented side-effect.

> 

> Ok, will rename and document.

> 

> > Why can't the caller do that (or a different function that the caller

> > calls afterward if desired)?

> 

> The current implementation in book3s is;

>  1) find a pte/hugepte

>  2) return null if pte not present

>  3) take _PAGE_BUSY lock

>  4) set accessed/dirty

>  5) clear _PAGE_BUSY.

> 

> What I tried was

> 1) find a pte/hugepte

> 2) return null if pte not present

> 3) return pte (not take lock by not setting _PAGE_BUSY)

> 

> 4) then user calls  __ptep_set_access_flags() to atomic update the

> dirty/accessed flags in pte.

> 

> - but the benchmark results were not good

> - Also can there be race as we do not take lock in step 3 and update in step 4 ?

> 

> >

> > Though even then you have the undocumented side effect of locking the

> > PTE on certain targets.

> >

> > > +{

> > > +       pte_t *ptep;

> > > +       pte_t pte;

> > > +       unsigned long ps = *pte_sizep;

> > > +       unsigned int shift;

> > > +

> > > +       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);

> > > +       if (!ptep)

> > > +               return __pte(0);

> > > +       if (shift)

> > > +               *pte_sizep = 1ul << shift;

> > > +       else

> > > +               *pte_sizep = PAGE_SIZE;

> > > +

> > > +       if (ps > *pte_sizep)

> > > +               return __pte(0);

> > > +

> > > +       if (!pte_present(*ptep))

> > > +               return __pte(0);

> > > +

> > > +#ifdef CONFIG_PPC64

> > > +       /* Lock PTE (set _PAGE_BUSY) and read */

> > > +       pte = pte_read(ptep);

> > > +#else

> > > +       pte = pte_val(*ptep);

> > > +#endif

> >

> > What about 32-bit platforms that need atomic PTEs?

> 

> I called __ptep_set_access_flags() for both 32/64bit (for 64bit I was not

> calling pte_read()), which handles atomic updates. Somehow the benchmark result

> were not good, will try again.


Hi Pauls,

I am trying to me the Linux pte search and update generic so that this can be used for powerpc as well.

I am not sure which of the below two should be ok, please help

--------------------------- (1) This --------------------------
/*
 * Lock and read a linux PTE.  If it's present and writable, atomically
 * set dirty and referenced bits and return the PTE, otherwise return 0.
 */
static inline atomic_read_update_pte_flags(pte_t *ptep, int writing)
{
        pte_t pte;
#ifdef CONFIG_PPC64
        /* Lock PTE (set _PAGE_BUSY) and read
         * XXX: NOTE: Some Architectures (book3e) does not have pte Locking,
         * so not generic in that sense for all architecture
         */
        pte = pte_read(ptep);
#else
        /* XXX: do not see any read locking on 32 bit architecture */
        pte = pte_val(*ptep);
#endif
        if (pte_present(pte)) {
                pte = pte_mkyoung(pte);
                if (writing && pte_write(pte))
                        pte = pte_mkdirty(pte);
        }

        *ptep = __pte(pte); /* 64bit: Also unlock pte (clear _PAGE_BUSY) */

        return pte;
}


--------------------- (2) OR THIS ----------------------------------

/*
 * Read a linux PTE.  If it's present and writable, atomically
 * set dirty and referenced bits and return the PTE, otherwise return 0.
 */
static inline atomic_read_update_pte_flags(pte_t *ptep, int writing)
{
        pte_t pte;
        pte = pte_val(*ptep);
        if (pte_present(pte)) {
                pte = pte_mkyoung(pte);
                if (writing && pte_write(pte))
                        pte = pte_mkdirty(pte);
        }

        __ptep_set_access_flags(ptep, pte);

        return pte_val(*ptep);
}

Thanks
-Bharat

> 

> Thanks

> -Bharat

> >

> > -Scott

> >
Paul Mackerras - Aug. 7, 2013, 12:24 a.m.
On Tue, Aug 06, 2013 at 07:02:48AM +0000, Bhushan Bharat-R65777 wrote:
> 
> I am trying to me the Linux pte search and update generic so that this can be used for powerpc as well.
> 
> I am not sure which of the below two should be ok, please help

Given that the BookE code uses gfn_to_pfn_memslot() to get the host
pfn, and then kvm_set_pfn_dirty(pfn) on pages that you're going to let
the guest write to, I don't think you need to set the dirty and/or
accessed bits in the Linux PTE explicitly.  If you care about the
WIMGE bits you can do find_linux_pte_or_hugepte() and just look at the
PTE, but you really should be using mmu_notifier_retry() to guard
against concurrent changes to the Linux PTE.  See the HV KVM code or
patch 21 of my recent series to see how it's used.  You probably
should be calling kvm_set_pfn_accessed() as well.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood - Aug. 7, 2013, 1:11 a.m.
On Wed, 2013-08-07 at 10:24 +1000, Paul Mackerras wrote:
> On Tue, Aug 06, 2013 at 07:02:48AM +0000, Bhushan Bharat-R65777 wrote:
> > 
> > I am trying to me the Linux pte search and update generic so that this can be used for powerpc as well.
> > 
> > I am not sure which of the below two should be ok, please help
> 
> Given that the BookE code uses gfn_to_pfn_memslot() to get the host
> pfn, and then kvm_set_pfn_dirty(pfn) on pages that you're going to let
> the guest write to, I don't think you need to set the dirty and/or
> accessed bits in the Linux PTE explicitly.  If you care about the
> WIMGE bits you can do find_linux_pte_or_hugepte() and just look at the
> PTE, but you really should be using mmu_notifier_retry() to guard
> against concurrent changes to the Linux PTE.  See the HV KVM code or
> patch 21 of my recent series to see how it's used. 

Hmm... we only get a callback on invalidate_range_start(), not
invalidate_range_end() (and even if we did get a callback for the
latter, it'd probably be racy).  So we may have a problem here
regardless of getting WIMG from the PTE, unless it's guaranteed that
hva_to_pfn() will fail after invalidate_range_start().

>  You probably should be calling kvm_set_pfn_accessed() as well.

Yeah...  I think it'll only affect the quality of page-out decisions (as
opposed to corruption and such), but still it should be fixed.

-Scott



--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras - Aug. 7, 2013, 1:47 a.m.
On Tue, Aug 06, 2013 at 08:11:34PM -0500, Scott Wood wrote:
> On Wed, 2013-08-07 at 10:24 +1000, Paul Mackerras wrote:
> > On Tue, Aug 06, 2013 at 07:02:48AM +0000, Bhushan Bharat-R65777 wrote:
> > > 
> > > I am trying to me the Linux pte search and update generic so that this can be used for powerpc as well.
> > > 
> > > I am not sure which of the below two should be ok, please help
> > 
> > Given that the BookE code uses gfn_to_pfn_memslot() to get the host
> > pfn, and then kvm_set_pfn_dirty(pfn) on pages that you're going to let
> > the guest write to, I don't think you need to set the dirty and/or
> > accessed bits in the Linux PTE explicitly.  If you care about the
> > WIMGE bits you can do find_linux_pte_or_hugepte() and just look at the
> > PTE, but you really should be using mmu_notifier_retry() to guard
> > against concurrent changes to the Linux PTE.  See the HV KVM code or
> > patch 21 of my recent series to see how it's used. 
> 
> Hmm... we only get a callback on invalidate_range_start(), not
> invalidate_range_end() (and even if we did get a callback for the
> latter, it'd probably be racy).  So we may have a problem here
> regardless of getting WIMG from the PTE, unless it's guaranteed that
> hva_to_pfn() will fail after invalidate_range_start().

No, it's not guaranteed.  You have to use mmu_notifier_retry().  It
tells you if either (a) some sort of invalidation has happened since
you snapshotted kvm->mmu_notifier_seq, or (b) an
invalidate_range_start...end sequence is currently in progress.  In
either case you should discard any PTE or pfn information you
collected and retry.

> >  You probably should be calling kvm_set_pfn_accessed() as well.
> 
> Yeah...  I think it'll only affect the quality of page-out decisions (as
> opposed to corruption and such), but still it should be fixed.

Right.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h

index d257d98..21daf28 100644

--- a/arch/powerpc/include/asm/pgtable-ppc64.h

+++ b/arch/powerpc/include/asm/pgtable-ppc64.h

@@ -221,6 +221,27 @@  static inline unsigned long pte_update(struct mm_struct *mm,

        return old;
 }

+static inline unsigned long pte_read(pte_t *p)

+{

+#ifdef PTE_ATOMIC_UPDATES

+       pte_t pte;

+       pte_t tmp;

+       __asm__ __volatile__ (

+       "1:     ldarx   %0,0,%3\n"

+       "       andi.   %1,%0,%4\n"

+       "       bne-    1b\n"

+       "       ori     %1,%0,%4\n"

+       "       stdcx.  %1,0,%3\n"

+       "       bne-    1b"

+       : "=&r" (pte), "=&r" (tmp), "=m" (*p)

+       : "r" (p), "i" (_PAGE_BUSY)

+       : "cc");

+

+       return pte;

+#else  

+       return pte_val(*p);

+#endif

+#endif

+}

 static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
                                              unsigned long addr, pte_t *ptep)
 {
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h

index 690c8c2..dad712c 100644

--- a/arch/powerpc/include/asm/pgtable.h

+++ b/arch/powerpc/include/asm/pgtable.h

@@ -254,6 +254,45 @@  static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,

 }
 #endif /* !CONFIG_HUGETLB_PAGE */

+static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,

+                                    int writing, unsigned long *pte_sizep)

+{

+       pte_t *ptep;

+       pte_t pte;

+       unsigned long ps = *pte_sizep;

+       unsigned int shift;

+

+       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);

+       if (!ptep)

+               return __pte(0);

+       if (shift)

+               *pte_sizep = 1ul << shift;

+       else

+               *pte_sizep = PAGE_SIZE;

+

+       if (ps > *pte_sizep)

+               return __pte(0);

+

+       if (!pte_present(*ptep))

+               return __pte(0);

+

+#ifdef CONFIG_PPC64

+       /* Lock PTE (set _PAGE_BUSY) and read */

+       pte = pte_read(ptep);

+#else

+       pte = pte_val(*ptep);

+#endif

+       if (pte_present(pte)) {

+               pte = pte_mkyoung(pte);

+               if (writing && pte_write(pte))

+                       pte = pte_mkdirty(pte);

+       }

+

+       *ptep = __pte(pte); /* 64bit: Also unlock pte (clear _PAGE_BUSY) */

+

+       return pte;

+}

+

 #endif /* __ASSEMBLY__ */

 #endif /* __KERNEL__ */