Message ID:   20201126121121.164675154@infradead.org
State:        Not Applicable
Delegated to: David Miller
Series:       perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
On Thu, Nov 26, 2020 at 01:01:17PM +0100, Peter Zijlstra wrote:
> The (new) page-table walker in arch_perf_get_page_size() is broken in
> various ways. Specifically, while it is used in a lockless manner, it
> doesn't depend on CONFIG_HAVE_FAST_GUP, nor uses the proper _lockless
> offset methods, nor is it careful to read each entry only once.
>
> Also the hugetlb support is broken due to calling pte_page() without
> first checking pte_special().
>
> Rewrite the whole thing to be a proper lockless page-table walker and
> employ the new pXX_leaf_size() pgtable functions to determine the
> pagetable size without looking at the page-frames.
>
> Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
> Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: Kan Liang <kan.liang@linux.intel.com>
> ---
>  arch/arm64/include/asm/pgtable.h    |    3 +
>  arch/sparc/include/asm/pgtable_64.h |   13 ++++
>  arch/sparc/mm/hugetlbpage.c         |   19 ++--
>  include/linux/pgtable.h             |   16 +++++
>  kernel/events/core.c                |  102 +++++++++++++-----------------------
>  5 files changed, 82 insertions(+), 71 deletions(-)

This diffstat doesn't match the patch in this email ...

> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -52,6 +52,7 @@
>  #include <linux/mount.h>
>  #include <linux/min_heap.h>
>  #include <linux/highmem.h>
> +#include <linux/pgtable.h>
>
>  #include "internal.h"
>
> @@ -7001,90 +7001,62 @@ static u64 perf_virt_to_phys(u64 virt)
>  	return phys_addr;
>  }
>
> -#ifdef CONFIG_MMU
> -
>  /*
> - * Return the MMU page size of a given virtual address.
> - *
> - * This generic implementation handles page-table aligned huge pages, as well
> - * as non-page-table aligned hugetlbfs compound pages.
> - *
> - * If an architecture supports and uses non-page-table aligned pages in their
> - * kernel mapping it will need to provide it's own implementation of this
> - * function.
> + * Return the pagetable size of a given virtual address.
>   */
> -__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> +static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
>  {
> -	struct page *page;
> -	pgd_t *pgd;
> -	p4d_t *p4d;
> -	pud_t *pud;
> -	pmd_t *pmd;
> -	pte_t *pte;
> +	u64 size = 0;
>
> -	pgd = pgd_offset(mm, addr);
> -	if (pgd_none(*pgd))
> -		return 0;
> +#ifdef CONFIG_HAVE_FAST_GUP
> +	pgd_t *pgdp, pgd;
> +	p4d_t *p4dp, p4d;
> +	pud_t *pudp, pud;
> +	pmd_t *pmdp, pmd;
> +	pte_t *ptep, pte;
>
> -	p4d = p4d_offset(pgd, addr);
> -	if (!p4d_present(*p4d))
> +	pgdp = pgd_offset(mm, addr);
> +	pgd = READ_ONCE(*pgdp);
> +	if (pgd_none(pgd))
>  		return 0;
>
> -	if (p4d_leaf(*p4d))
> -		return 1ULL << P4D_SHIFT;
> +	if (pgd_leaf(pgd))
> +		return pgd_leaf_size(pgd);
>
> -	pud = pud_offset(p4d, addr);
> -	if (!pud_present(*pud))
> +	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
> +	p4d = READ_ONCE(*p4dp);
> +	if (!p4d_present(p4d))
>  		return 0;
>
> -	if (pud_leaf(*pud)) {
> -#ifdef pud_page
> -		page = pud_page(*pud);
> -		if (PageHuge(page))
> -			return page_size(compound_head(page));
> -#endif
> -		return 1ULL << PUD_SHIFT;
> -	}
> +	if (p4d_leaf(p4d))
> +		return p4d_leaf_size(p4d);
>
> -	pmd = pmd_offset(pud, addr);
> -	if (!pmd_present(*pmd))
> +	pudp = pud_offset_lockless(p4dp, p4d, addr);
> +	pud = READ_ONCE(*pudp);
> +	if (!pud_present(pud))
>  		return 0;
>
> -	if (pmd_leaf(*pmd)) {
> -#ifdef pmd_page
> -		page = pmd_page(*pmd);
> -		if (PageHuge(page))
> -			return page_size(compound_head(page));
> -#endif
> -		return 1ULL << PMD_SHIFT;
> -	}
> +	if (pud_leaf(pud))
> +		return pud_leaf_size(pud);
>
> -	pte = pte_offset_map(pmd, addr);
> -	if (!pte_present(*pte)) {
> -		pte_unmap(pte);
> +	pmdp = pmd_offset_lockless(pudp, pud, addr);
> +	pmd = READ_ONCE(*pmdp);
> +	if (!pmd_present(pmd))
>  		return 0;
> -	}
>
> -	page = pte_page(*pte);
> -	if (PageHuge(page)) {
> -		u64 size = page_size(compound_head(page));
> -		pte_unmap(pte);
> -		return size;
> -	}
> -
> -	pte_unmap(pte);
> -	return PAGE_SIZE;
> -}
> +	if (pmd_leaf(pmd))
> +		return pmd_leaf_size(pmd);
>
> -#else
> +	ptep = pte_offset_map(&pmd, addr);
> +	pte = ptep_get_lockless(ptep);
> +	if (pte_present(pte))
> +		size = pte_leaf_size(pte);
> +	pte_unmap(ptep);
> +#endif /* CONFIG_HAVE_FAST_GUP */
>
> -static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> -{
> -	return 0;
> +	return size;
>  }
>
> -#endif
> -
>  static u64 perf_get_page_size(unsigned long addr)
>  {
>  	struct mm_struct *mm;
> @@ -7109,7 +7081,7 @@ static u64 perf_get_page_size(unsigned l
>  		mm = &init_mm;
>  	}
>
> -	size = arch_perf_get_page_size(mm, addr);
> +	size = perf_get_pgtable_size(mm, addr);
>
>  	local_irq_restore(flags);
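To make the changelog's "read each entry only once" point concrete, here is a minimal side-by-side sketch of the two access patterns, paraphrased from the removed and added walker code above (a fragment for illustration, not a standalone function):

	/*
	 * Broken pattern (paraphrased from the removed walker): the live
	 * entry is dereferenced twice, so a concurrent THP split/collapse
	 * or munmap() can change it between the two reads.
	 */
	if (!pmd_present(*pmd))		/* read #1 */
		return 0;
	if (pmd_leaf(*pmd))		/* read #2: may see a different entry */
		return 1ULL << PMD_SHIFT;

	/*
	 * Lockless pattern (as in the new walker): snapshot the entry once
	 * with READ_ONCE() and from then on inspect only the local copy.
	 */
	pmd = READ_ONCE(*pmdp);
	if (!pmd_present(pmd))
		return 0;
	if (pmd_leaf(pmd))
		return pmd_leaf_size(pmd);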
On Thu, Nov 26, 2020 at 12:34:58PM +0000, Matthew Wilcox wrote:
> On Thu, Nov 26, 2020 at 01:01:17PM +0100, Peter Zijlstra wrote:
> > The (new) page-table walker in arch_perf_get_page_size() is broken in
> > various ways. Specifically, while it is used in a lockless manner, it
> > doesn't depend on CONFIG_HAVE_FAST_GUP, nor uses the proper _lockless
> > offset methods, nor is it careful to read each entry only once.
> >
> > Also the hugetlb support is broken due to calling pte_page() without
> > first checking pte_special().
> >
> > Rewrite the whole thing to be a proper lockless page-table walker and
> > employ the new pXX_leaf_size() pgtable functions to determine the
> > pagetable size without looking at the page-frames.
> >
> > Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
> > Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Tested-by: Kan Liang <kan.liang@linux.intel.com>
> > ---
> >  arch/arm64/include/asm/pgtable.h    |    3 +
> >  arch/sparc/include/asm/pgtable_64.h |   13 ++++
> >  arch/sparc/mm/hugetlbpage.c         |   19 ++--
> >  include/linux/pgtable.h             |   16 +++++
> >  kernel/events/core.c                |  102 +++++++++++++-----------------------
> >  5 files changed, 82 insertions(+), 71 deletions(-)
>
> This diffstat doesn't match the patch in this email ...

Urgh, no idea how I did that... I must've edited the diff and not done a
quilt-refresh. Updated below.

---
Subject: perf/core: Fix arch_perf_get_page_size()
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Nov 2020 13:43:57 +0100

The (new) page-table walker in arch_perf_get_page_size() is broken in
various ways. Specifically, while it is used in a lockless manner, it
doesn't depend on CONFIG_HAVE_FAST_GUP, nor uses the proper _lockless
offset methods, nor is it careful to read each entry only once.

Also the hugetlb support is broken due to calling pte_page() without
first checking pte_special().

Rewrite the whole thing to be a proper lockless page-table walker and
employ the new pXX_leaf_size() pgtable functions to determine the
pagetable size without looking at the page-frames.

Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Kan Liang <kan.liang@linux.intel.com>
---
 kernel/events/core.c |  105 ++++++++++++++++++---------------------------
 1 file changed, 39 insertions(+), 66 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -52,6 +52,7 @@
 #include <linux/mount.h>
 #include <linux/min_heap.h>
 #include <linux/highmem.h>
+#include <linux/pgtable.h>
 
 #include "internal.h"
 
@@ -7001,90 +7002,62 @@ static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
-#ifdef CONFIG_MMU
-
 /*
- * Return the MMU page size of a given virtual address.
- *
- * This generic implementation handles page-table aligned huge pages, as well
- * as non-page-table aligned hugetlbfs compound pages.
- *
- * If an architecture supports and uses non-page-table aligned pages in their
- * kernel mapping it will need to provide it's own implementation of this
- * function.
+ * Return the pagetable size of a given virtual address.
  */
-__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
 {
-	struct page *page;
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	u64 size = 0;
 
-	pgd = pgd_offset(mm, addr);
-	if (pgd_none(*pgd))
-		return 0;
+#ifdef CONFIG_HAVE_FAST_GUP
+	pgd_t *pgdp, pgd;
+	p4d_t *p4dp, p4d;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
 
-	p4d = p4d_offset(pgd, addr);
-	if (!p4d_present(*p4d))
+	pgdp = pgd_offset(mm, addr);
+	pgd = READ_ONCE(*pgdp);
+	if (pgd_none(pgd))
 		return 0;
 
-	if (p4d_leaf(*p4d))
-		return 1ULL << P4D_SHIFT;
+	if (pgd_leaf(pgd))
+		return pgd_leaf_size(pgd);
 
-	pud = pud_offset(p4d, addr);
-	if (!pud_present(*pud))
+	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+	p4d = READ_ONCE(*p4dp);
+	if (!p4d_present(p4d))
 		return 0;
 
-	if (pud_leaf(*pud)) {
-#ifdef pud_page
-		page = pud_page(*pud);
-		if (PageHuge(page))
-			return page_size(compound_head(page));
-#endif
-		return 1ULL << PUD_SHIFT;
-	}
+	if (p4d_leaf(p4d))
+		return p4d_leaf_size(p4d);
 
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
+	pudp = pud_offset_lockless(p4dp, p4d, addr);
+	pud = READ_ONCE(*pudp);
+	if (!pud_present(pud))
 		return 0;
 
-	if (pmd_leaf(*pmd)) {
-#ifdef pmd_page
-		page = pmd_page(*pmd);
-		if (PageHuge(page))
-			return page_size(compound_head(page));
-#endif
-		return 1ULL << PMD_SHIFT;
-	}
+	if (pud_leaf(pud))
+		return pud_leaf_size(pud);
 
-	pte = pte_offset_map(pmd, addr);
-	if (!pte_present(*pte)) {
-		pte_unmap(pte);
+	pmdp = pmd_offset_lockless(pudp, pud, addr);
+	pmd = READ_ONCE(*pmdp);
+	if (!pmd_present(pmd))
 		return 0;
-	}
 
-	page = pte_page(*pte);
-	if (PageHuge(page)) {
-		u64 size = page_size(compound_head(page));
-		pte_unmap(pte);
-		return size;
-	}
-
-	pte_unmap(pte);
-	return PAGE_SIZE;
-}
+	if (pmd_leaf(pmd))
+		return pmd_leaf_size(pmd);
 
-#else
+	ptep = pte_offset_map(&pmd, addr);
+	pte = ptep_get_lockless(ptep);
+	if (pte_present(pte))
+		size = pte_leaf_size(pte);
+	pte_unmap(ptep);
+#endif /* CONFIG_HAVE_FAST_GUP */
 
-static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
-{
-	return 0;
+	return size;
 }
 
-#endif
-
 static u64 perf_get_page_size(unsigned long addr)
 {
 	struct mm_struct *mm;
@@ -7109,7 +7081,7 @@ static u64 perf_get_page_size(unsigned l
 		mm = &init_mm;
 	}
 
-	size = arch_perf_get_page_size(mm, addr);
+	size = perf_get_pgtable_size(mm, addr);
 
 	local_irq_restore(flags);
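Note the refreshed diffstat touches only kernel/events/core.c: the pgd_leaf_size() through pte_leaf_size() helpers the new walker calls are introduced by an earlier patch in this series (the arm64/sparc/pgtable.h changes visible in the first, mismatched diffstat). As a rough sketch of the idea, and assuming generic fallbacks along these lines (not quoted from the series), each level would default to its page-table-geometry size unless the architecture overrides it:

	/*
	 * Hypothetical generic fallbacks for the pXX_leaf_size() helpers,
	 * for illustration only. The helpers take the entry value so that
	 * architectures which encode the mapping size in the entry itself
	 * (e.g. sparc64 TTEs, or non-page-table-aligned hugetlb) can report
	 * the real leaf size without looking at the page-frame.
	 */
	#ifndef pgd_leaf_size
	#define pgd_leaf_size(x)	(1ULL << PGDIR_SHIFT)
	#endif
	#ifndef p4d_leaf_size
	#define p4d_leaf_size(x)	P4D_SIZE
	#endif
	#ifndef pud_leaf_size
	#define pud_leaf_size(x)	PUD_SIZE
	#endif
	#ifndef pmd_leaf_size
	#define pmd_leaf_size(x)	PMD_SIZE
	#endif
	#ifndef pte_leaf_size
	#define pte_leaf_size(x)	PAGE_SIZE
	#endif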
On Thu, Nov 26, 2020 at 01:42:07PM +0100, Peter Zijlstra wrote:
> +	pgdp = pgd_offset(mm, addr);
> +	pgd = READ_ONCE(*pgdp);

I forget how x86-32-PAE maps to Linux's PGD/P4D/PUD/PMD scheme, but
according to the Intel SDM, volume 3, section 4.4.2, PAE paging uses a
64-bit PDE, so whether a PDE is a PGD or a PMD, we're only reading it
with READ_ONCE rather than the lockless-retry method used by
ptep_get_lockless(). So it's potentially racy? Do we need a
pmdp_get_lockless() or pgdp_get_lockless()?

[...]

> +	pmdp = pmd_offset_lockless(pudp, pud, addr);
> +	pmd = READ_ONCE(*pmdp);
> +	if (!pmd_present(pmd))
>  		return 0;
>
> +	if (pmd_leaf(pmd))
> +		return pmd_leaf_size(pmd);
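For context, ptep_get_lockless() on configurations where a PTE is two machine words (x86-32 PAE, 32-bit ARM LPAE) retries a low-word/high-word read sequence until it observes a stable pair. A pmdp_get_lockless() along the lines Matthew is asking about might follow the same pattern; the sketch below is purely hypothetical (no such helper existed at the time, and pmd_low/pmd_high are assumed names for the two 32-bit halves):

	/*
	 * Hypothetical sketch only. On x86-32 PAE a pmd_t is a single
	 * 64-bit word that a 32-bit kernel cannot load with one atomic
	 * READ_ONCE(), so mirror the ptep_get_lockless() retry loop:
	 * re-read the low half until it is stable around the high-half
	 * read, so the two halves are seen as a coherent pair.
	 */
	static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
	{
		pmd_t pmd;

		do {
			pmd.pmd_low  = READ_ONCE(pmdp->pmd_low);   /* assumed field */
			smp_rmb();
			pmd.pmd_high = READ_ONCE(pmdp->pmd_high);  /* assumed field */
			smp_rmb();
		} while (unlikely(pmd.pmd_low != READ_ONCE(pmdp->pmd_low)));

		return pmd;
	}

As with the pte variant, a stable low word only implies a coherent pair together with the mm's update rules: an entry must not change from one present value to another without a TLB flush in between.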
On Thu, Nov 26, 2020 at 12:56:06PM +0000, Matthew Wilcox wrote:
> On Thu, Nov 26, 2020 at 01:42:07PM +0100, Peter Zijlstra wrote:
> > +	pgdp = pgd_offset(mm, addr);
> > +	pgd = READ_ONCE(*pgdp);
>
> I forget how x86-32-PAE maps to Linux's PGD/P4D/PUD/PMD scheme, but
> according to the Intel SDM, volume 3, section 4.4.2, PAE paging uses a
> 64-bit PDE, so whether a PDE is a PGD or a PMD, we're only reading it
> with READ_ONCE rather than the lockless-retry method used by
> ptep_get_lockless(). So it's potentially racy? Do we need a
> pmdp_get_lockless() or pgdp_get_lockless()?

Oh gawd... this isn't new here though, right? Current gup_fast also gets
that wrong, if it is indeed wrong.

I suppose it's a race far more likely today, with THP and all, than it
ever was back then.
On Thu, Nov 26, 2020 at 02:06:19PM +0100, Peter Zijlstra wrote:
> On Thu, Nov 26, 2020 at 12:56:06PM +0000, Matthew Wilcox wrote:
> > On Thu, Nov 26, 2020 at 01:42:07PM +0100, Peter Zijlstra wrote:
> > > +	pgdp = pgd_offset(mm, addr);
> > > +	pgd = READ_ONCE(*pgdp);
> >
> > I forget how x86-32-PAE maps to Linux's PGD/P4D/PUD/PMD scheme, but
> > according to the Intel SDM, volume 3, section 4.4.2, PAE paging uses a
> > 64-bit PDE, so whether a PDE is a PGD or a PMD, we're only reading it
> > with READ_ONCE rather than the lockless-retry method used by
> > ptep_get_lockless(). So it's potentially racy? Do we need a
> > pmdp_get_lockless() or pgdp_get_lockless()?
>
> Oh gawd... this isn't new here though, right? Current gup_fast also gets
> that wrong, if it is indeed wrong.
>
> I suppose it's a race far more likely today, with THP and all, than it
> ever was back then.

Right, it's not new. I wouldn't block this patchset for that fix.
Just want to get the problem on your radar ;-)  I just never reviewed
the gup fast codepath before, and this jumped out at me.