diff mbox series

[mm-unstable,v1,5/5] mm: multi-gen LRU: use mmu_notifier_test_clear_young()

Message ID 20230217041230.2417228-6-yuzhao@google.com (mailing list archive)
State Handled Elsewhere
Headers show
Series mm/kvm: lockless accessed bit harvest | expand

Commit Message

Yu Zhao Feb. 17, 2023, 4:12 a.m. UTC
An existing selftest can quickly demonstrate the effectiveness of this
patch. On a generic workstation equipped with 128 CPUs and 256GB DRAM:

  $ sudo max_guest_memory_test -c 64 -m 250 -s 250

  MGLRU      run2
  ---------------
  Before    ~600s
  After      ~50s
  Off       ~250s

  kswapd (MGLRU before)
    100.00%  balance_pgdat
      100.00%  shrink_node
        100.00%  shrink_one
          99.97%  try_to_shrink_lruvec
            99.06%  evict_folios
              97.41%  shrink_folio_list
                31.33%  folio_referenced
                  31.06%  rmap_walk_file
                    30.89%  folio_referenced_one
                      20.83%  __mmu_notifier_clear_flush_young
                        20.54%  kvm_mmu_notifier_clear_flush_young
  =>                      19.34%  _raw_write_lock

  kswapd (MGLRU after)
    100.00%  balance_pgdat
      100.00%  shrink_node
        100.00%  shrink_one
          99.97%  try_to_shrink_lruvec
            99.51%  evict_folios
              71.70%  shrink_folio_list
                7.08%  folio_referenced
                  6.78%  rmap_walk_file
                    6.72%  folio_referenced_one
                      5.60%  lru_gen_look_around
  =>                    1.53%  __mmu_notifier_test_clear_young

  kswapd (MGLRU off)
    100.00%  balance_pgdat
      100.00%  shrink_node
        99.92%  shrink_lruvec
          69.95%  shrink_folio_list
            19.35%  folio_referenced
              18.37%  rmap_walk_file
                17.88%  folio_referenced_one
                  13.20%  __mmu_notifier_clear_flush_young
                    11.64%  kvm_mmu_notifier_clear_flush_young
  =>                  9.93%  _raw_write_lock
          26.23%  shrink_active_list
            25.50%  folio_referenced
              25.35%  rmap_walk_file
                25.28%  folio_referenced_one
                  23.87%  __mmu_notifier_clear_flush_young
                    23.69%  kvm_mmu_notifier_clear_flush_young
  =>                  18.98%  _raw_write_lock

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 include/linux/mmzone.h |   6 +-
 mm/rmap.c              |   8 +--
 mm/vmscan.c            | 127 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 121 insertions(+), 20 deletions(-)

Comments

Sean Christopherson Feb. 23, 2023, 5:43 p.m. UTC | #1
On Thu, Feb 16, 2023, Yu Zhao wrote:
> An existing selftest can quickly demonstrate the effectiveness of this
> patch. On a generic workstation equipped with 128 CPUs and 256GB DRAM:

Not my area of maintenance, but a non-existent changelog (for all intents and
purposes) for a change of this size and complexity is not acceptable.

>   $ sudo max_guest_memory_test -c 64 -m 250 -s 250
> 
>   MGLRU      run2
>   ---------------
>   Before    ~600s
>   After      ~50s
>   Off       ~250s
> 
>   kswapd (MGLRU before)
>     100.00%  balance_pgdat
>       100.00%  shrink_node
>         100.00%  shrink_one
>           99.97%  try_to_shrink_lruvec
>             99.06%  evict_folios
>               97.41%  shrink_folio_list
>                 31.33%  folio_referenced
>                   31.06%  rmap_walk_file
>                     30.89%  folio_referenced_one
>                       20.83%  __mmu_notifier_clear_flush_young
>                         20.54%  kvm_mmu_notifier_clear_flush_young
>   =>                      19.34%  _raw_write_lock
> 
>   kswapd (MGLRU after)
>     100.00%  balance_pgdat
>       100.00%  shrink_node
>         100.00%  shrink_one
>           99.97%  try_to_shrink_lruvec
>             99.51%  evict_folios
>               71.70%  shrink_folio_list
>                 7.08%  folio_referenced
>                   6.78%  rmap_walk_file
>                     6.72%  folio_referenced_one
>                       5.60%  lru_gen_look_around
>   =>                    1.53%  __mmu_notifier_test_clear_young

Do you happen to know how much of the improvement is due to batching, and how
much is due to using a walkless walk?

> @@ -5699,6 +5797,9 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
>  	if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
>  		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
>  
> +	if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK))
> +		caps |= BIT(LRU_GEN_SPTE_WALK);

As alluded to in patch 1, unless batching the walks even if KVM does _not_ support
a lockless walk is somehow _worse_ than using the existing mmu_notifier_clear_flush_young(),
I think batching the calls should be conditional only on LRU_GEN_SPTE_WALK.  Or
if we want to avoid batching when there are no mmu_notifier listeners, probe
mmu_notifiers.  But don't call into KVM directly.
Yu Zhao Feb. 23, 2023, 6:08 p.m. UTC | #2
On Thu, Feb 23, 2023 at 10:43 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Feb 16, 2023, Yu Zhao wrote:
> > An existing selftest can quickly demonstrate the effectiveness of this
> > patch. On a generic workstation equipped with 128 CPUs and 256GB DRAM:
>
> Not my area of maintenance, but a non-existent changelog (for all intents and
> purposes) for a change of this size and complexity is not acceptable.

Will fix.

> >   $ sudo max_guest_memory_test -c 64 -m 250 -s 250
> >
> >   MGLRU      run2
> >   ---------------
> >   Before    ~600s
> >   After      ~50s
> >   Off       ~250s
> >
> >   kswapd (MGLRU before)
> >     100.00%  balance_pgdat
> >       100.00%  shrink_node
> >         100.00%  shrink_one
> >           99.97%  try_to_shrink_lruvec
> >             99.06%  evict_folios
> >               97.41%  shrink_folio_list
> >                 31.33%  folio_referenced
> >                   31.06%  rmap_walk_file
> >                     30.89%  folio_referenced_one
> >                       20.83%  __mmu_notifier_clear_flush_young
> >                         20.54%  kvm_mmu_notifier_clear_flush_young
> >   =>                      19.34%  _raw_write_lock
> >
> >   kswapd (MGLRU after)
> >     100.00%  balance_pgdat
> >       100.00%  shrink_node
> >         100.00%  shrink_one
> >           99.97%  try_to_shrink_lruvec
> >             99.51%  evict_folios
> >               71.70%  shrink_folio_list
> >                 7.08%  folio_referenced
> >                   6.78%  rmap_walk_file
> >                     6.72%  folio_referenced_one
> >                       5.60%  lru_gen_look_around
> >   =>                    1.53%  __mmu_notifier_test_clear_young
>
> Do you happen to know how much of the improvement is due to batching, and how
> much is due to using a walkless walk?

No. I have three benchmarks running at the moment:
1. Windows SQL server guest on x86 host,
2. Apache Spark guest on arm64 host, and
3. Memcached guest on ppc64 host.

If you are really interested in that, I can reprioritize -- I need to
stop 1) and use that machine to get the number for you.

> > @@ -5699,6 +5797,9 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
> >       if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
> >               caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
> >
> > +     if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK))
> > +             caps |= BIT(LRU_GEN_SPTE_WALK);
>
> As alluded to in patch 1, unless batching the walks even if KVM does _not_ support
> a lockless walk is somehow _worse_ than using the existing mmu_notifier_clear_flush_young(),
> I think batching the calls should be conditional only on LRU_GEN_SPTE_WALK.  Or
> if we want to avoid batching when there are no mmu_notifier listeners, probe
> mmu_notifiers.  But don't call into KVM directly.

I'm not sure I fully understand. Let's present the problem on the MM
side: assuming KVM supports lockless walks, batching can still be
worse (very unlikely), because GFNs can exhibit no memory locality at
all. So this option allows userspace to disable batching.

I fully understand why you don't want MM to call into KVM directly. No
acceptable ways to set up a clear interface between MM and KVM other
than the MMU notifier?
Sean Christopherson Feb. 23, 2023, 7:11 p.m. UTC | #3
On Thu, Feb 23, 2023, Yu Zhao wrote:
> On Thu, Feb 23, 2023 at 10:43 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Thu, Feb 16, 2023, Yu Zhao wrote:
> > >   kswapd (MGLRU before)
> > >     100.00%  balance_pgdat
> > >       100.00%  shrink_node
> > >         100.00%  shrink_one
> > >           99.97%  try_to_shrink_lruvec
> > >             99.06%  evict_folios
> > >               97.41%  shrink_folio_list
> > >                 31.33%  folio_referenced
> > >                   31.06%  rmap_walk_file
> > >                     30.89%  folio_referenced_one
> > >                       20.83%  __mmu_notifier_clear_flush_young
> > >                         20.54%  kvm_mmu_notifier_clear_flush_young
> > >   =>                      19.34%  _raw_write_lock
> > >
> > >   kswapd (MGLRU after)
> > >     100.00%  balance_pgdat
> > >       100.00%  shrink_node
> > >         100.00%  shrink_one
> > >           99.97%  try_to_shrink_lruvec
> > >             99.51%  evict_folios
> > >               71.70%  shrink_folio_list
> > >                 7.08%  folio_referenced
> > >                   6.78%  rmap_walk_file
> > >                     6.72%  folio_referenced_one
> > >                       5.60%  lru_gen_look_around
> > >   =>                    1.53%  __mmu_notifier_test_clear_young
> >
> > Do you happen to know how much of the improvement is due to batching, and how
> > much is due to using a walkless walk?
> 
> No. I have three benchmarks running at the moment:
> 1. Windows SQL server guest on x86 host,
> 2. Apache Spark guest on arm64 host, and
> 3. Memcached guest on ppc64 host.
> 
> If you are really interested in that, I can reprioritize -- I need to
> stop 1) and use that machine to get the number for you.

After looking at the "MGLRU before" stack again, it's definitely worth getting
those numbers.  The "before" isn't just taking mmu_lock, it's taking mmu_lock for
write _and_ flushing remote TLBs on _every_ PTE.  I suspect the batching is a
tiny percentage of the overall win (might be larger with RETPOLINE and friends),
and that the bulk of the improvement comes from avoiding the insanity of
kvm_mmu_notifier_clear_flush_young().

Speaking of which, what would it take to drop mmu_notifier_clear_flush_young()
entirely?  I.e. why can MGLRU tolerate stale information but !MGLRU cannot?  If
we simply deleted mmu_notifier_clear_flush_young() and used mmu_notifier_clear_young()
instead, would anyone notice, let alone care?

> > > @@ -5699,6 +5797,9 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
> > >       if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
> > >               caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
> > >
> > > +     if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK))
> > > +             caps |= BIT(LRU_GEN_SPTE_WALK);
> >
> > As alluded to in patch 1, unless batching the walks even if KVM does _not_ support
> > a lockless walk is somehow _worse_ than using the existing mmu_notifier_clear_flush_young(),
> > I think batching the calls should be conditional only on LRU_GEN_SPTE_WALK.  Or
> > if we want to avoid batching when there are no mmu_notifier listeners, probe
> > mmu_notifiers.  But don't call into KVM directly.
> 
> I'm not sure I fully understand. Let's present the problem on the MM
> side: assuming KVM supports lockless walks, batching can still be
> worse (very unlikely), because GFNs can exhibit no memory locality at
> all. So this option allows userspace to disable batching.

I'm asking the opposite.  Is there a scenario where batching+lock is worse than
!batching+lock?  If not, then don't make batching depend on lockless walks.

> I fully understand why you don't want MM to call into KVM directly. No
> acceptable ways to set up a clear interface between MM and KVM other
> than the MMU notifier?

There are several options I can think of, but before we go spend time designing
the best API, I'd rather figure out if we care in the first place.
Yu Zhao Feb. 23, 2023, 7:36 p.m. UTC | #4
On Thu, Feb 23, 2023 at 12:11 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Feb 23, 2023, Yu Zhao wrote:
> > On Thu, Feb 23, 2023 at 10:43 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Thu, Feb 16, 2023, Yu Zhao wrote:
> > > >   kswapd (MGLRU before)
> > > >     100.00%  balance_pgdat
> > > >       100.00%  shrink_node
> > > >         100.00%  shrink_one
> > > >           99.97%  try_to_shrink_lruvec
> > > >             99.06%  evict_folios
> > > >               97.41%  shrink_folio_list
> > > >                 31.33%  folio_referenced
> > > >                   31.06%  rmap_walk_file
> > > >                     30.89%  folio_referenced_one
> > > >                       20.83%  __mmu_notifier_clear_flush_young
> > > >                         20.54%  kvm_mmu_notifier_clear_flush_young
> > > >   =>                      19.34%  _raw_write_lock
> > > >
> > > >   kswapd (MGLRU after)
> > > >     100.00%  balance_pgdat
> > > >       100.00%  shrink_node
> > > >         100.00%  shrink_one
> > > >           99.97%  try_to_shrink_lruvec
> > > >             99.51%  evict_folios
> > > >               71.70%  shrink_folio_list
> > > >                 7.08%  folio_referenced
> > > >                   6.78%  rmap_walk_file
> > > >                     6.72%  folio_referenced_one
> > > >                       5.60%  lru_gen_look_around
> > > >   =>                    1.53%  __mmu_notifier_test_clear_young
> > >
> > > Do you happen to know how much of the improvement is due to batching, and how
> > > much is due to using a walkless walk?
> >
> > No. I have three benchmarks running at the moment:
> > 1. Windows SQL server guest on x86 host,
> > 2. Apache Spark guest on arm64 host, and
> > 3. Memcached guest on ppc64 host.
> >
> > If you are really interested in that, I can reprioritize -- I need to
> > stop 1) and use that machine to get the number for you.
>
> After looking at the "MGLRU before" stack again, it's definitely worth getting
> those numbers.  The "before" isn't just taking mmu_lock, it's taking mmu_lock for
> write _and_ flushing remote TLBs on _every_ PTE.

Correct.

> I suspect the batching is a
> tiny percentage of the overall win (might be larger with RETPOLINE and friends),

Same here.

> and that the bulk of the improvement comes from avoiding the insanity of
> kvm_mmu_notifier_clear_flush_young().
>
> Speaking of which, what would it take to drop mmu_notifier_clear_flush_young()
> entirely?

That's not my call :)

Adding Johannes.

> I.e. why can MGLRU tolerate stale information but !MGLRU cannot?

Good question. The native clear API doesn't flush:

  int ptep_clear_flush_young(struct vm_area_struct *vma,
                             unsigned long address, pte_t *ptep)
  {
          /*
           * On x86 CPUs, clearing the accessed bit without a TLB flush
           * doesn't cause data corruption. [ It could cause incorrect
           * page aging and the (mistaken) reclaim of hot pages, but the
           * chance of that should be relatively low. ]
           *
           * So as a performance optimization don't flush the TLB when
           * clearing the accessed bit, it will eventually be flushed by
           * a context switch or a VM operation anyway. [ In the rare
           * event of it not getting flushed for a long time the delay
           * shouldn't really matter because there's no real memory
           * pressure for swapout to react to. ]
           */
          return ptep_test_and_clear_young(vma, address, ptep);
  }

> If
> we simply deleted mmu_notifier_clear_flush_young() and used mmu_notifier_clear_young()
> instead, would anyone notice, let alone care?

I tend to agree.

> > > > @@ -5699,6 +5797,9 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
> > > >       if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
> > > >               caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
> > > >
> > > > +     if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK))
> > > > +             caps |= BIT(LRU_GEN_SPTE_WALK);
> > >
> > > As alluded to in patch 1, unless batching the walks even if KVM does _not_ support
> > > a lockless walk is somehow _worse_ than using the existing mmu_notifier_clear_flush_young(),
> > > I think batching the calls should be conditional only on LRU_GEN_SPTE_WALK.  Or
> > > if we want to avoid batching when there are no mmu_notifier listeners, probe
> > > mmu_notifiers.  But don't call into KVM directly.
> >
> > I'm not sure I fully understand. Let's present the problem on the MM
> > side: assuming KVM supports lockless walks, batching can still be
> > worse (very unlikely), because GFNs can exhibit no memory locality at
> > all. So this option allows userspace to disable batching.
>
> I'm asking the opposite.  Is there a scenario where batching+lock is worse than
> !batching+lock?  If not, then don't make batching depend on lockless walks.

Yes, absolutely. batching+lock means we take/release mmu_lock for
every single PTE in the entire VA space -- each small batch contains
64 PTEs but the entire batch is the whole KVM.

> > I fully understand why you don't want MM to call into KVM directly. No
> > acceptable ways to set up a clear interface between MM and KVM other
> > than the MMU notifier?
>
> There are several options I can think of, but before we go spend time designing
> the best API, I'd rather figure out if we care in the first place.

This is self serving -- MGLRU would be the only user in the near
future. But I never assume there will be no common ground, at least it
doesn't hurt to check.
Sean Christopherson Feb. 23, 2023, 7:58 p.m. UTC | #5
On Thu, Feb 23, 2023, Yu Zhao wrote:
> On Thu, Feb 23, 2023 at 12:11 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > > As alluded to in patch 1, unless batching the walks even if KVM does _not_ support
> > > > a lockless walk is somehow _worse_ than using the existing mmu_notifier_clear_flush_young(),
> > > > I think batching the calls should be conditional only on LRU_GEN_SPTE_WALK.  Or
> > > > if we want to avoid batching when there are no mmu_notifier listeners, probe
> > > > mmu_notifiers.  But don't call into KVM directly.
> > >
> > > I'm not sure I fully understand. Let's present the problem on the MM
> > > side: assuming KVM supports lockless walks, batching can still be
> > > worse (very unlikely), because GFNs can exhibit no memory locality at
> > > all. So this option allows userspace to disable batching.
> >
> > I'm asking the opposite.  Is there a scenario where batching+lock is worse than
> > !batching+lock?  If not, then don't make batching depend on lockless walks.
> 
> Yes, absolutely. batching+lock means we take/release mmu_lock for
> every single PTE in the entire VA space -- each small batch contains
> 64 PTEs but the entire batch is the whole KVM.

Who is "we"?  I don't see anything in the kernel that triggers walking the whole
VMA, e.g. lru_gen_look_around() limits the walk to a single PMD.  I feel like I'm
missing something...
Yu Zhao Feb. 23, 2023, 8:09 p.m. UTC | #6
On Thu, Feb 23, 2023 at 12:58 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Feb 23, 2023, Yu Zhao wrote:
> > On Thu, Feb 23, 2023 at 12:11 PM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > > > As alluded to in patch 1, unless batching the walks even if KVM does _not_ support
> > > > > a lockless walk is somehow _worse_ than using the existing mmu_notifier_clear_flush_young(),
> > > > > I think batching the calls should be conditional only on LRU_GEN_SPTE_WALK.  Or
> > > > > if we want to avoid batching when there are no mmu_notifier listeners, probe
> > > > > mmu_notifiers.  But don't call into KVM directly.
> > > >
> > > > I'm not sure I fully understand. Let's present the problem on the MM
> > > > side: assuming KVM supports lockless walks, batching can still be
> > > > worse (very unlikely), because GFNs can exhibit no memory locality at
> > > > all. So this option allows userspace to disable batching.
> > >
> > > I'm asking the opposite.  Is there a scenario where batching+lock is worse than
> > > !batching+lock?  If not, then don't make batching depend on lockless walks.
> >
> > Yes, absolutely. batching+lock means we take/release mmu_lock for
> > every single PTE in the entire VA space -- each small batch contains
> > 64 PTEs but the entire batch is the whole KVM.
>
> Who is "we"?

Oops -- shouldn't have used "we".

> I don't see anything in the kernel that triggers walking the whole
> VMA, e.g. lru_gen_look_around() limits the walk to a single PMD.  I feel like I'm
> missing something...

walk_mm() -> walk_pud_range() -> walk_pmd_range() -> walk_pte_range()
-> test_spte_young() -> mmu_notifier_test_clear_young().

MGLRU takes two passes: during the first pass, it sweeps entire VA
space on each MM (per MM/KVM); during the second pass, it uses the rmap on each
folio (per folio). The look around exploits the (spatial) locality in
the second pass, to get the best out of the expensive per folio rmap
walk.

(The first pass can't handle shared mappings; the second pass can.)
Sean Christopherson Feb. 23, 2023, 8:28 p.m. UTC | #7
On Thu, Feb 23, 2023, Yu Zhao wrote:
> On Thu, Feb 23, 2023 at 12:58 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > On Thu, Feb 23, 2023 at 12:11 PM Sean Christopherson <seanjc@google.com> wrote:
> > > >
> > > > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > > > > As alluded to in patch 1, unless batching the walks even if KVM does _not_ support
> > > > > > a lockless walk is somehow _worse_ than using the existing mmu_notifier_clear_flush_young(),
> > > > > > I think batching the calls should be conditional only on LRU_GEN_SPTE_WALK.  Or
> > > > > > if we want to avoid batching when there are no mmu_notifier listeners, probe
> > > > > > mmu_notifiers.  But don't call into KVM directly.
> > > > >
> > > > > I'm not sure I fully understand. Let's present the problem on the MM
> > > > > side: assuming KVM supports lockless walks, batching can still be
> > > > > worse (very unlikely), because GFNs can exhibit no memory locality at
> > > > > all. So this option allows userspace to disable batching.
> > > >
> > > > I'm asking the opposite.  Is there a scenario where batching+lock is worse than
> > > > !batching+lock?  If not, then don't make batching depend on lockless walks.
> > >
> > > Yes, absolutely. batching+lock means we take/release mmu_lock for
> > > every single PTE in the entire VA space -- each small batch contains
> > > 64 PTEs but the entire batch is the whole KVM.
> >
> > Who is "we"?
> 
> Oops -- shouldn't have used "we".
> 
> > I don't see anything in the kernel that triggers walking the whole
> > VMA, e.g. lru_gen_look_around() limits the walk to a single PMD.  I feel like I'm
> > missing something...
> 
> walk_mm() -> walk_pud_range() -> walk_pmd_range() -> walk_pte_range()
> -> test_spte_young() -> mmu_notifier_test_clear_young().
> 
> MGLRU takes two passes: during the first pass, it sweeps entire VA
> space on each MM (per MM/KVM); during the second pass, it uses the rmap on each
> folio (per folio).

Ah.  IIUC, userspace can use LRU_GEN_SPTE_WALK to control whether or not to walk
secondary MMUs, and the kernel further restricts LRU_GEN_SPTE_WALK to secondary
MMUs that implement a lockless walk.  And if the answer is "no", secondary MMUs
are simply not consulted.

If that's correct, then the proper way to handle this is by extending mmu_notifier_ops
to query (a) if there's at least one register listeners that implements
test_clear_young() and (b) if all registered listeners that implement test_clear_young()
support lockless walks.  That avoids direct dependencies on KVM, and avoids making
assumptions that may not always hold true, e.g. that KVM is the only mmu_notifier
user that supports the young APIs.

P.S. all of this info absolutely belongs in documentation and/or changelogs.
Yu Zhao Feb. 23, 2023, 8:48 p.m. UTC | #8
On Thu, Feb 23, 2023 at 1:29 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Feb 23, 2023, Yu Zhao wrote:
> > On Thu, Feb 23, 2023 at 12:58 PM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > > On Thu, Feb 23, 2023 at 12:11 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > >
> > > > > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > > > > > As alluded to in patch 1, unless batching the walks even if KVM does _not_ support
> > > > > > > a lockless walk is somehow _worse_ than using the existing mmu_notifier_clear_flush_young(),
> > > > > > > I think batching the calls should be conditional only on LRU_GEN_SPTE_WALK.  Or
> > > > > > > if we want to avoid batching when there are no mmu_notifier listeners, probe
> > > > > > > mmu_notifiers.  But don't call into KVM directly.
> > > > > >
> > > > > > I'm not sure I fully understand. Let's present the problem on the MM
> > > > > > side: assuming KVM supports lockless walks, batching can still be
> > > > > > worse (very unlikely), because GFNs can exhibit no memory locality at
> > > > > > all. So this option allows userspace to disable batching.
> > > > >
> > > > > I'm asking the opposite.  Is there a scenario where batching+lock is worse than
> > > > > !batching+lock?  If not, then don't make batching depend on lockless walks.
> > > >
> > > > Yes, absolutely. batching+lock means we take/release mmu_lock for
> > > > every single PTE in the entire VA space -- each small batch contains
> > > > 64 PTEs but the entire batch is the whole KVM.
> > >
> > > Who is "we"?
> >
> > Oops -- shouldn't have used "we".
> >
> > > I don't see anything in the kernel that triggers walking the whole
> > > VMA, e.g. lru_gen_look_around() limits the walk to a single PMD.  I feel like I'm
> > > missing something...
> >
> > walk_mm() -> walk_pud_range() -> walk_pmd_range() -> walk_pte_range()
> > -> test_spte_young() -> mmu_notifier_test_clear_young().
> >
> > MGLRU takes two passes: during the first pass, it sweeps entire VA
> > space on each MM (per MM/KVM); during the second pass, it uses the rmap on each
> > folio (per folio).
>
> Ah.  IIUC, userspace can use LRU_GEN_SPTE_WALK to control whether or not to walk
> secondary MMUs, and the kernel further restricts LRU_GEN_SPTE_WALK to secondary
> MMUs that implement a lockless walk.  And if the answer is "no", secondary MMUs
> are simply not consulted.

Sorry for the bad naming -- probably LRU_GEN_SPTE_BATCH_WALK would be
less confusing.

MGLRU always consults the secondary MMU for each page it's going to
reclaim (during the second pass), i.e., it checks the A-bit in the
SPTE mapping a page (by the rmap) it plans to reclaim so that it won't
take a hot page away from KVM.

If the lockless walk is supported, MGLRU doesn't need to work at page
granularity: (physical) pages on the LRU list may have nothing in
common (e.g., from different processes), checking their PTEs/SPTEs one
by one is expensive. Instead, it sweeps the entire KVM spaces in the
first pass and checks the *adjacent SPTEs* of a page it plans to
reclaim in the second pass. Both rely on the *assumption* there would
be some spatial locality to exploit. This assumption can be wrong, and
LRU_GEN_SPTE_WALK disables it.

> If that's correct, then the proper way to handle this is by extending mmu_notifier_ops
> to query (a) if there's at least one register listeners that implements
> test_clear_young() and (b) if all registered listeners that implement test_clear_young()
> support lockless walks.  That avoids direct dependencies on KVM, and avoids making
> assumptions that may not always hold true, e.g. that KVM is the only mmu_notifier
> user that supports the young APIs.
>
> P.S. all of this info absolutely belongs in documentation and/or changelogs.

Will do.
diff mbox series

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9fb1b03b83b2..ce34b7ea8e4c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -379,6 +379,7 @@  enum {
 	LRU_GEN_CORE,
 	LRU_GEN_MM_WALK,
 	LRU_GEN_NONLEAF_YOUNG,
+	LRU_GEN_SPTE_WALK,
 	NR_LRU_GEN_CAPS
 };
 
@@ -485,7 +486,7 @@  struct lru_gen_mm_walk {
 };
 
 void lru_gen_init_lruvec(struct lruvec *lruvec);
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 
 #ifdef CONFIG_MEMCG
 
@@ -573,8 +574,9 @@  static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
 }
 
-static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
+	return false;
 }
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/rmap.c b/mm/rmap.c
index 15ae24585fc4..eb0089f8f112 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -823,12 +823,10 @@  static bool folio_referenced_one(struct folio *folio,
 			return false; /* To break the loop */
 		}
 
-		if (pvmw.pte) {
-			if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
-				lru_gen_look_around(&pvmw);
+		if (lru_gen_enabled() && pvmw.pte) {
+			if (lru_gen_look_around(&pvmw))
 				referenced++;
-			}
-
+		} else if (pvmw.pte) {
 			if (ptep_clear_flush_young_notify(vma, address,
 						pvmw.pte))
 				referenced++;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c1c5e8b24b8..d6d69f0baabf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,8 @@ 
 #include <linux/khugepaged.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/mmu_notifier.h>
+#include <linux/kvm_host.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -3923,6 +3925,55 @@  static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
 	return folio;
 }
 
+static bool test_spte_young(struct mm_struct *mm, unsigned long addr, unsigned long end,
+			    unsigned long *bitmap, unsigned long *last)
+{
+	if (!kvm_arch_has_test_clear_young() || !get_cap(LRU_GEN_SPTE_WALK))
+		return false;
+
+	if (*last > addr)
+		goto done;
+
+	*last = end - addr > MIN_LRU_BATCH * PAGE_SIZE ?
+		addr + MIN_LRU_BATCH * PAGE_SIZE - 1 : end - 1;
+	bitmap_zero(bitmap, MIN_LRU_BATCH);
+
+	mmu_notifier_test_clear_young(mm, addr, *last + 1, false, bitmap);
+done:
+	return test_bit((*last - addr) / PAGE_SIZE, bitmap);
+}
+
+static void clear_spte_young(struct mm_struct *mm, unsigned long addr,
+			     unsigned long *bitmap, unsigned long *last)
+{
+	int i;
+	unsigned long start, end = *last + 1;
+
+	if (addr + PAGE_SIZE != end)
+		return;
+
+	i = find_last_bit(bitmap, MIN_LRU_BATCH);
+	if (i == MIN_LRU_BATCH)
+		return;
+
+	start = end - (i + 1) * PAGE_SIZE;
+
+	i = find_first_bit(bitmap, MIN_LRU_BATCH);
+
+	end -= i * PAGE_SIZE;
+
+	mmu_notifier_test_clear_young(mm, start, end, false, bitmap);
+}
+
+static void skip_spte_young(struct mm_struct *mm, unsigned long addr,
+			    unsigned long *bitmap, unsigned long *last)
+{
+	if (*last > addr)
+		__clear_bit((*last - addr) / PAGE_SIZE, bitmap);
+
+	clear_spte_young(mm, addr, bitmap, last);
+}
+
 static bool suitable_to_scan(int total, int young)
 {
 	int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
@@ -3938,6 +3989,8 @@  static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 	pte_t *pte;
 	spinlock_t *ptl;
 	unsigned long addr;
+	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+	unsigned long last = 0;
 	int total = 0;
 	int young = 0;
 	struct lru_gen_mm_walk *walk = args->private;
@@ -3956,6 +4009,7 @@  static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 	pte = pte_offset_map(pmd, start & PMD_MASK);
 restart:
 	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+		bool success;
 		unsigned long pfn;
 		struct folio *folio;
 
@@ -3963,20 +4017,27 @@  static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 		walk->mm_stats[MM_LEAF_TOTAL]++;
 
 		pfn = get_pte_pfn(pte[i], args->vma, addr);
-		if (pfn == -1)
+		if (pfn == -1) {
+			skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!pte_young(pte[i])) {
+		success = test_spte_young(args->vma->vm_mm, addr, end, bitmap, &last);
+		if (!success && !pte_young(pte[i])) {
+			skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
 			walk->mm_stats[MM_LEAF_OLD]++;
 			continue;
 		}
 
 		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
-		if (!folio)
+		if (!folio) {
+			skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		clear_spte_young(args->vma->vm_mm, addr, bitmap, &last);
+		if (pte_young(pte[i]))
+			ptep_test_and_clear_young(args->vma, addr, pte + i);
 
 		young++;
 		walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -4581,6 +4642,24 @@  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  *                          rmap/PT walk feedback
  ******************************************************************************/
 
+static bool should_look_around(struct vm_area_struct *vma, unsigned long addr,
+			       pte_t *pte, int *young)
+{
+	unsigned long old = true;
+
+	*young = mmu_notifier_test_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE, true, &old);
+	if (!old)
+		*young = true;
+
+	if (pte_young(*pte)) {
+		ptep_test_and_clear_young(vma, addr, pte);
+		*young = true;
+		return true;
+	}
+
+	return !old && get_cap(LRU_GEN_SPTE_WALK);
+}
+
 /*
  * This function exploits spatial locality when shrink_folio_list() walks the
  * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
@@ -4588,12 +4667,14 @@  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  * the PTE table to the Bloom filter. This forms a feedback loop between the
  * eviction and the aging.
  */
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
 	int i;
 	unsigned long start;
 	unsigned long end;
 	struct lru_gen_mm_walk *walk;
+	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+	unsigned long last = 0;
 	int young = 0;
 	pte_t *pte = pvmw->pte;
 	unsigned long addr = pvmw->address;
@@ -4607,8 +4688,11 @@  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	lockdep_assert_held(pvmw->ptl);
 	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
 
+	if (!should_look_around(pvmw->vma, addr, pte, &young))
+		return young;
+
 	if (spin_is_contended(pvmw->ptl))
-		return;
+		return young;
 
 	/* avoid taking the LRU lock under the PTL when possible */
 	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
@@ -4616,6 +4700,9 @@  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	start = max(addr & PMD_MASK, pvmw->vma->vm_start);
 	end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
 
+	if (end - start == PAGE_SIZE)
+		return young;
+
 	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
 		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
 			end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -4629,28 +4716,37 @@  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 
 	/* folio_update_gen() requires stable folio_memcg() */
 	if (!mem_cgroup_trylock_pages(memcg))
-		return;
+		return young;
 
 	arch_enter_lazy_mmu_mode();
 
 	pte -= (addr - start) / PAGE_SIZE;
 
 	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+		bool success;
 		unsigned long pfn;
 
 		pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
-		if (pfn == -1)
+		if (pfn == -1) {
+			skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!pte_young(pte[i]))
+		success = test_spte_young(pvmw->vma->vm_mm, addr, end, bitmap, &last);
+		if (!success && !pte_young(pte[i])) {
+			skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
 		folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
-		if (!folio)
+		if (!folio) {
+			skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		clear_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
+		if (pte_young(pte[i]))
+			ptep_test_and_clear_young(pvmw->vma, addr, pte + i);
 
 		young++;
 
@@ -4680,6 +4776,8 @@  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	/* feedback from rmap walkers to page table walkers */
 	if (suitable_to_scan(i, young))
 		update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+
+	return young;
 }
 
 /******************************************************************************
@@ -5699,6 +5797,9 @@  static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
 	if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
 		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
 
+	if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK))
+		caps |= BIT(LRU_GEN_SPTE_WALK);
+
 	return sysfs_emit(buf, "0x%04x\n", caps);
 }