diff mbox series

[1/1] s390/kvm: implement clearing part of IPL clear

Message ID 20180228195320.165230-1-borntraeger@de.ibm.com
State New
Headers show
Series [1/1] s390/kvm: implement clearing part of IPL clear | expand

Commit Message

Christian Borntraeger Feb. 28, 2018, 7:53 p.m. UTC
When a guests reboots with diagnose 308 subcode 3 it requests the memory
to be cleared. We did not do it so far. This does not only violate the
architecture, it also misses the chance to free up that memory on
reboot, which would help on host memory over commitment.  By using
ram_block_discard_range we can cover both cases.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 target/s390x/kvm.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

Comments

Thomas Huth March 1, 2018, 3:58 a.m. UTC | #1
On 28.02.2018 20:53, Christian Borntraeger wrote:
> When a guests reboots with diagnose 308 subcode 3 it requests the memory
> to be cleared. We did not do it so far. This does not only violate the
> architecture, it also misses the chance to free up that memory on
> reboot, which would help on host memory over commitment.  By using
> ram_block_discard_range we can cover both cases.

Sounds like a good idea. I wonder whether that release_all_ram()
function should maybe rather reside in exec.c, so that other machines
that want to clear all RAM at reset time can use it, too?

> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  target/s390x/kvm.c | 19 +++++++++++++++++++
>  1 file changed, 19 insertions(+)
> 
> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> index 8f3a422288..2e145ad5c3 100644
> --- a/target/s390x/kvm.c
> +++ b/target/s390x/kvm.c
> @@ -34,6 +34,8 @@
>  #include "qapi/error.h"
>  #include "qemu/error-report.h"
>  #include "qemu/timer.h"
> +#include "qemu/rcu_queue.h"
> +#include "sysemu/cpus.h"
>  #include "sysemu/sysemu.h"
>  #include "sysemu/hw_accel.h"
>  #include "hw/boards.h"
> @@ -41,6 +43,7 @@
>  #include "sysemu/device_tree.h"
>  #include "exec/gdbstub.h"
>  #include "exec/address-spaces.h"
> +#include "exec/ram_addr.h"
>  #include "trace.h"
>  #include "qapi-event.h"
>  #include "hw/s390x/s390-pci-inst.h"
> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
>      return ret;
>  }
>  
> +static void release_all_rams(void)

s/rams/ram/ maybe?

> +{
> +    struct RAMBlock *rb;
> +
> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
> +        ram_block_discard_range(rb, 0, rb->used_length);

From a coding style point of view, I think there should be curly braces
around ram_block_discard_range() ?

> +}
> +
>  int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>  {
>      S390CPU *cpu = S390_CPU(cs);
> @@ -1853,6 +1864,14 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>              ret = handle_intercept(cpu);
>              break;
>          case KVM_EXIT_S390_RESET:
> +            if (run->s390_reset_flags & KVM_S390_RESET_CLEAR) {
> +                /*
> +                 * We will stop other CPUs anyway, avoid spurious crashes and
> +                 * get all CPUs out. The reset will take care of the resume.
> +                 */
> +                pause_all_vcpus();
> +                release_all_rams();
> +            }
>              s390_reipl_request();
>              break;
>          case KVM_EXIT_S390_TSCH:
> 

Apart from the cosmetic nits, patch looks good to me.

 Thomas
Christian Borntraeger March 1, 2018, 7:37 a.m. UTC | #2
On 03/01/2018 04:58 AM, Thomas Huth wrote:
> On 28.02.2018 20:53, Christian Borntraeger wrote:
>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
>> to be cleared. We did not do it so far. This does not only violate the
>> architecture, it also misses the chance to free up that memory on
>> reboot, which would help on host memory over commitment.  By using
>> ram_block_discard_range we can cover both cases.
> 
> Sounds like a good idea. I wonder whether that release_all_ram()
> function should maybe rather reside in exec.c, so that other machines
> that want to clear all RAM at reset time can use it, too?

You already added Paolo, David - good.
I am open to that. As an alternative we can certainly move this function
from s390x/kvm.c to exec.c at a later point in time if a 2nd user comes along.

> 
>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
>> ---
>>  target/s390x/kvm.c | 19 +++++++++++++++++++
>>  1 file changed, 19 insertions(+)
>>
>> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
>> index 8f3a422288..2e145ad5c3 100644
>> --- a/target/s390x/kvm.c
>> +++ b/target/s390x/kvm.c
>> @@ -34,6 +34,8 @@
>>  #include "qapi/error.h"
>>  #include "qemu/error-report.h"
>>  #include "qemu/timer.h"
>> +#include "qemu/rcu_queue.h"
>> +#include "sysemu/cpus.h"
>>  #include "sysemu/sysemu.h"
>>  #include "sysemu/hw_accel.h"
>>  #include "hw/boards.h"
>> @@ -41,6 +43,7 @@
>>  #include "sysemu/device_tree.h"
>>  #include "exec/gdbstub.h"
>>  #include "exec/address-spaces.h"
>> +#include "exec/ram_addr.h"
>>  #include "trace.h"
>>  #include "qapi-event.h"
>>  #include "hw/s390x/s390-pci-inst.h"
>> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
>>      return ret;
>>  }
>>  
>> +static void release_all_rams(void)
> 
> s/rams/ram/ maybe?

yes.
> 
>> +{
>> +    struct RAMBlock *rb;
>> +
>> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
>> +        ram_block_discard_range(rb, 0, rb->used_length);
> 
> From a coding style point of view, I think there should be curly braces
> around ram_block_discard_range() ?

yes.
> 
>> +}
>> +
>>  int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>>  {
>>      S390CPU *cpu = S390_CPU(cs);
>> @@ -1853,6 +1864,14 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>>              ret = handle_intercept(cpu);
>>              break;
>>          case KVM_EXIT_S390_RESET:
>> +            if (run->s390_reset_flags & KVM_S390_RESET_CLEAR) {
>> +                /*
>> +                 * We will stop other CPUs anyway, avoid spurious crashes and
>> +                 * get all CPUs out. The reset will take care of the resume.
>> +                 */
>> +                pause_all_vcpus();
>> +                release_all_rams();
>> +            }
>>              s390_reipl_request();
>>              break;
>>          case KVM_EXIT_S390_TSCH:
>>
> 
> Apart from the cosmetic nits, patch looks good to me.

Thanks. Will wait with the resend till Paolo,David have some comments.
Paolo Bonzini March 1, 2018, 8:44 a.m. UTC | #3
On 01/03/2018 04:58, Thomas Huth wrote:
>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
>> to be cleared. We did not do it so far. This does not only violate the
>> architecture, it also misses the chance to free up that memory on
>> reboot, which would help on host memory over commitment.  By using
>> ram_block_discard_range we can cover both cases.
> 
> Sounds like a good idea. I wonder whether that release_all_ram()
> function should maybe rather reside in exec.c, so that other machines
> that want to clear all RAM at reset time can use it, too?

Either way is fine for me.  With s/rams/ram/ the s390-only patch is ok.

Paolo
David Hildenbrand March 1, 2018, 9:21 a.m. UTC | #4
On 28.02.2018 20:53, Christian Borntraeger wrote:
> When a guests reboots with diagnose 308 subcode 3 it requests the memory
> to be cleared. We did not do it so far. This does not only violate the
> architecture, it also misses the chance to free up that memory on
> reboot, which would help on host memory over commitment.  By using
> ram_block_discard_range we can cover both cases.
> 
> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  target/s390x/kvm.c | 19 +++++++++++++++++++
>  1 file changed, 19 insertions(+)
> 
> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> index 8f3a422288..2e145ad5c3 100644
> --- a/target/s390x/kvm.c
> +++ b/target/s390x/kvm.c
> @@ -34,6 +34,8 @@
>  #include "qapi/error.h"
>  #include "qemu/error-report.h"
>  #include "qemu/timer.h"
> +#include "qemu/rcu_queue.h"
> +#include "sysemu/cpus.h"
>  #include "sysemu/sysemu.h"
>  #include "sysemu/hw_accel.h"
>  #include "hw/boards.h"
> @@ -41,6 +43,7 @@
>  #include "sysemu/device_tree.h"
>  #include "exec/gdbstub.h"
>  #include "exec/address-spaces.h"
> +#include "exec/ram_addr.h"
>  #include "trace.h"
>  #include "qapi-event.h"
>  #include "hw/s390x/s390-pci-inst.h"
> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
>      return ret;
>  }
>  
> +static void release_all_rams(void)
> +{
> +    struct RAMBlock *rb;
> +
> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
> +        ram_block_discard_range(rb, 0, rb->used_length);
> +}
> +
>  int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>  {
>      S390CPU *cpu = S390_CPU(cs);
> @@ -1853,6 +1864,14 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>              ret = handle_intercept(cpu);
>              break;
>          case KVM_EXIT_S390_RESET:
> +            if (run->s390_reset_flags & KVM_S390_RESET_CLEAR) {
> +                /*
> +                 * We will stop other CPUs anyway, avoid spurious crashes and
> +                 * get all CPUs out. The reset will take care of the resume.
> +                 */
> +                pause_all_vcpus();
> +                release_all_rams();
> +            }

Hm, I have a patch lying around that tries to match pause/resume calls,
and only starts all CPUs again, once the last resume is called. (I
assume that there is the chance for a race between 2 callers of
pause/resume - you think it cannot happen, but as pause_all_vcpus() will
temporarily give up the qemu mutex, bad things can happen).

Can't we communicate that information to our ipl device and do the
clearing during the reset?

>              s390_reipl_request();
>              break;
>          case KVM_EXIT_S390_TSCH:
>
Dr. David Alan Gilbert March 1, 2018, 9:24 a.m. UTC | #5
* Thomas Huth (thuth@redhat.com) wrote:
> On 28.02.2018 20:53, Christian Borntraeger wrote:
> > When a guests reboots with diagnose 308 subcode 3 it requests the memory
> > to be cleared. We did not do it so far. This does not only violate the
> > architecture, it also misses the chance to free up that memory on
> > reboot, which would help on host memory over commitment.  By using
> > ram_block_discard_range we can cover both cases.
> 
> Sounds like a good idea. I wonder whether that release_all_ram()
> function should maybe rather reside in exec.c, so that other machines
> that want to clear all RAM at reset time can use it, too?
> 
> > Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> > ---
> >  target/s390x/kvm.c | 19 +++++++++++++++++++
> >  1 file changed, 19 insertions(+)
> > 
> > diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> > index 8f3a422288..2e145ad5c3 100644
> > --- a/target/s390x/kvm.c
> > +++ b/target/s390x/kvm.c
> > @@ -34,6 +34,8 @@
> >  #include "qapi/error.h"
> >  #include "qemu/error-report.h"
> >  #include "qemu/timer.h"
> > +#include "qemu/rcu_queue.h"
> > +#include "sysemu/cpus.h"
> >  #include "sysemu/sysemu.h"
> >  #include "sysemu/hw_accel.h"
> >  #include "hw/boards.h"
> > @@ -41,6 +43,7 @@
> >  #include "sysemu/device_tree.h"
> >  #include "exec/gdbstub.h"
> >  #include "exec/address-spaces.h"
> > +#include "exec/ram_addr.h"
> >  #include "trace.h"
> >  #include "qapi-event.h"
> >  #include "hw/s390x/s390-pci-inst.h"
> > @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
> >      return ret;
> >  }
> >  
> > +static void release_all_rams(void)
> 
> s/rams/ram/ maybe?
> 
> > +{
> > +    struct RAMBlock *rb;
> > +
> > +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
> > +        ram_block_discard_range(rb, 0, rb->used_length);
> 
> From a coding style point of view, I think there should be curly braces
> around ram_block_discard_range() ?

I think this might break if it happens during a postcopy migrate.
The destination CPU is running, so it can do a reboot at just the wrong
time; and then the pages (that are protected by userfaultfd) would get
deallocated and trigger userfaultfd requests if accessed.

Dave

> > +}
> > +
> >  int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
> >  {
> >      S390CPU *cpu = S390_CPU(cs);
> > @@ -1853,6 +1864,14 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
> >              ret = handle_intercept(cpu);
> >              break;
> >          case KVM_EXIT_S390_RESET:
> > +            if (run->s390_reset_flags & KVM_S390_RESET_CLEAR) {
> > +                /*
> > +                 * We will stop other CPUs anyway, avoid spurious crashes and
> > +                 * get all CPUs out. The reset will take care of the resume.
> > +                 */
> > +                pause_all_vcpus();
> > +                release_all_rams();
> > +            }
> >              s390_reipl_request();
> >              break;
> >          case KVM_EXIT_S390_TSCH:
> > 
> 
> Apart from the cosmetic nits, patch looks good to me.
> 
>  Thomas
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Christian Borntraeger March 1, 2018, 11 a.m. UTC | #6
On 03/01/2018 10:24 AM, Dr. David Alan Gilbert wrote:
> * Thomas Huth (thuth@redhat.com) wrote:
>> On 28.02.2018 20:53, Christian Borntraeger wrote:
>>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
>>> to be cleared. We did not do it so far. This does not only violate the
>>> architecture, it also misses the chance to free up that memory on
>>> reboot, which would help on host memory over commitment.  By using
>>> ram_block_discard_range we can cover both cases.
>>
>> Sounds like a good idea. I wonder whether that release_all_ram()
>> function should maybe rather reside in exec.c, so that other machines
>> that want to clear all RAM at reset time can use it, too?
>>
>>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
>>> ---
>>>  target/s390x/kvm.c | 19 +++++++++++++++++++
>>>  1 file changed, 19 insertions(+)
>>>
>>> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
>>> index 8f3a422288..2e145ad5c3 100644
>>> --- a/target/s390x/kvm.c
>>> +++ b/target/s390x/kvm.c
>>> @@ -34,6 +34,8 @@
>>>  #include "qapi/error.h"
>>>  #include "qemu/error-report.h"
>>>  #include "qemu/timer.h"
>>> +#include "qemu/rcu_queue.h"
>>> +#include "sysemu/cpus.h"
>>>  #include "sysemu/sysemu.h"
>>>  #include "sysemu/hw_accel.h"
>>>  #include "hw/boards.h"
>>> @@ -41,6 +43,7 @@
>>>  #include "sysemu/device_tree.h"
>>>  #include "exec/gdbstub.h"
>>>  #include "exec/address-spaces.h"
>>> +#include "exec/ram_addr.h"
>>>  #include "trace.h"
>>>  #include "qapi-event.h"
>>>  #include "hw/s390x/s390-pci-inst.h"
>>> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
>>>      return ret;
>>>  }
>>>  
>>> +static void release_all_rams(void)
>>
>> s/rams/ram/ maybe?
>>
>>> +{
>>> +    struct RAMBlock *rb;
>>> +
>>> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
>>> +        ram_block_discard_range(rb, 0, rb->used_length);
>>
>> From a coding style point of view, I think there should be curly braces
>> around ram_block_discard_range() ?
> 
> I think this might break if it happens during a postcopy migrate.
> The destination CPU is running, so it can do a reboot at just the wrong
> time; and then the pages (that are protected by userfaultfd) would get
> deallocated and trigger userfaultfd requests if accessed.

Yes, userfaultd/postcopy is really fragile and relies on things that are not
necessarily true (e.g. virito-balloon can also invalidate pages).

The right thing here would be to actually terminate the postcopy migrate but
return it as "successful" (since we are going to clear that RAM anyway). Do 
you see a good way to achieve that?


> 
> Dave
> 
>>> +}
>>> +
>>>  int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>>>  {
>>>      S390CPU *cpu = S390_CPU(cs);
>>> @@ -1853,6 +1864,14 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>>>              ret = handle_intercept(cpu);
>>>              break;
>>>          case KVM_EXIT_S390_RESET:
>>> +            if (run->s390_reset_flags & KVM_S390_RESET_CLEAR) {
>>> +                /*
>>> +                 * We will stop other CPUs anyway, avoid spurious crashes and
>>> +                 * get all CPUs out. The reset will take care of the resume.
>>> +                 */
>>> +                pause_all_vcpus();
>>> +                release_all_rams();
>>> +            }
>>>              s390_reipl_request();
>>>              break;
>>>          case KVM_EXIT_S390_TSCH:
>>>
>>
>> Apart from the cosmetic nits, patch looks good to me.
>>
>>  Thomas
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Dr. David Alan Gilbert March 1, 2018, 11:45 a.m. UTC | #7
* Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> 
> 
> On 03/01/2018 10:24 AM, Dr. David Alan Gilbert wrote:
> > * Thomas Huth (thuth@redhat.com) wrote:
> >> On 28.02.2018 20:53, Christian Borntraeger wrote:
> >>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
> >>> to be cleared. We did not do it so far. This does not only violate the
> >>> architecture, it also misses the chance to free up that memory on
> >>> reboot, which would help on host memory over commitment.  By using
> >>> ram_block_discard_range we can cover both cases.
> >>
> >> Sounds like a good idea. I wonder whether that release_all_ram()
> >> function should maybe rather reside in exec.c, so that other machines
> >> that want to clear all RAM at reset time can use it, too?
> >>
> >>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> >>> ---
> >>>  target/s390x/kvm.c | 19 +++++++++++++++++++
> >>>  1 file changed, 19 insertions(+)
> >>>
> >>> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> >>> index 8f3a422288..2e145ad5c3 100644
> >>> --- a/target/s390x/kvm.c
> >>> +++ b/target/s390x/kvm.c
> >>> @@ -34,6 +34,8 @@
> >>>  #include "qapi/error.h"
> >>>  #include "qemu/error-report.h"
> >>>  #include "qemu/timer.h"
> >>> +#include "qemu/rcu_queue.h"
> >>> +#include "sysemu/cpus.h"
> >>>  #include "sysemu/sysemu.h"
> >>>  #include "sysemu/hw_accel.h"
> >>>  #include "hw/boards.h"
> >>> @@ -41,6 +43,7 @@
> >>>  #include "sysemu/device_tree.h"
> >>>  #include "exec/gdbstub.h"
> >>>  #include "exec/address-spaces.h"
> >>> +#include "exec/ram_addr.h"
> >>>  #include "trace.h"
> >>>  #include "qapi-event.h"
> >>>  #include "hw/s390x/s390-pci-inst.h"
> >>> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
> >>>      return ret;
> >>>  }
> >>>  
> >>> +static void release_all_rams(void)
> >>
> >> s/rams/ram/ maybe?
> >>
> >>> +{
> >>> +    struct RAMBlock *rb;
> >>> +
> >>> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
> >>> +        ram_block_discard_range(rb, 0, rb->used_length);
> >>
> >> From a coding style point of view, I think there should be curly braces
> >> around ram_block_discard_range() ?
> > 
> > I think this might break if it happens during a postcopy migrate.
> > The destination CPU is running, so it can do a reboot at just the wrong
> > time; and then the pages (that are protected by userfaultfd) would get
> > deallocated and trigger userfaultfd requests if accessed.
> 
> Yes, userfaultd/postcopy is really fragile and relies on things that are not
> necessarily true (e.g. virito-balloon can also invalidate pages).

That's why we use qemu_balloon_inhibit around postcopy to stop
ballooning; I'm not aware of anything else that does the same.

> The right thing here would be to actually terminate the postcopy migrate but
> return it as "successful" (since we are going to clear that RAM anyway). Do 
> you see a good way to achieve that?

There's no current mechanism to do it; I think it would have to involve
some interaction with the source as well though to tell it that you
didn't need that area of RAM anyway.

However, there are more problems:
  a) Even forgetting the userfault problem, this is racy since during
postcopy you're still receiving blocks from the source at the same time;
so some of the area that you've discarded might get overwritten by data
from the source.

  b) Your release_all_rams seems to do all RAM Blocks - won't that nuke
any ROMs as well? Or maybe even flash?

  c) In a normal precopy migration, I think you may also get old data;
Paolo said that an MADV_DONTNEED won't cause the dirty flags to be set,
so if the migrate has already sent the data for a page, and then this
happens, before the CPUs are stopped during the migration, when you
restart on the destination you'll have the old data.

Dave

> 
> > 
> > Dave
> > 
> >>> +}
> >>> +
> >>>  int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
> >>>  {
> >>>      S390CPU *cpu = S390_CPU(cs);
> >>> @@ -1853,6 +1864,14 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
> >>>              ret = handle_intercept(cpu);
> >>>              break;
> >>>          case KVM_EXIT_S390_RESET:
> >>> +            if (run->s390_reset_flags & KVM_S390_RESET_CLEAR) {
> >>> +                /*
> >>> +                 * We will stop other CPUs anyway, avoid spurious crashes and
> >>> +                 * get all CPUs out. The reset will take care of the resume.
> >>> +                 */
> >>> +                pause_all_vcpus();
> >>> +                release_all_rams();
> >>> +            }
> >>>              s390_reipl_request();
> >>>              break;
> >>>          case KVM_EXIT_S390_TSCH:
> >>>
> >>
> >> Apart from the cosmetic nits, patch looks good to me.
> >>
> >>  Thomas
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Christian Borntraeger March 1, 2018, 12:08 p.m. UTC | #8
On 03/01/2018 12:45 PM, Dr. David Alan Gilbert wrote:
> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
>>
>>
>> On 03/01/2018 10:24 AM, Dr. David Alan Gilbert wrote:
>>> * Thomas Huth (thuth@redhat.com) wrote:
>>>> On 28.02.2018 20:53, Christian Borntraeger wrote:
>>>>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
>>>>> to be cleared. We did not do it so far. This does not only violate the
>>>>> architecture, it also misses the chance to free up that memory on
>>>>> reboot, which would help on host memory over commitment.  By using
>>>>> ram_block_discard_range we can cover both cases.
>>>>
>>>> Sounds like a good idea. I wonder whether that release_all_ram()
>>>> function should maybe rather reside in exec.c, so that other machines
>>>> that want to clear all RAM at reset time can use it, too?
>>>>
>>>>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
>>>>> ---
>>>>>  target/s390x/kvm.c | 19 +++++++++++++++++++
>>>>>  1 file changed, 19 insertions(+)
>>>>>
>>>>> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
>>>>> index 8f3a422288..2e145ad5c3 100644
>>>>> --- a/target/s390x/kvm.c
>>>>> +++ b/target/s390x/kvm.c
>>>>> @@ -34,6 +34,8 @@
>>>>>  #include "qapi/error.h"
>>>>>  #include "qemu/error-report.h"
>>>>>  #include "qemu/timer.h"
>>>>> +#include "qemu/rcu_queue.h"
>>>>> +#include "sysemu/cpus.h"
>>>>>  #include "sysemu/sysemu.h"
>>>>>  #include "sysemu/hw_accel.h"
>>>>>  #include "hw/boards.h"
>>>>> @@ -41,6 +43,7 @@
>>>>>  #include "sysemu/device_tree.h"
>>>>>  #include "exec/gdbstub.h"
>>>>>  #include "exec/address-spaces.h"
>>>>> +#include "exec/ram_addr.h"
>>>>>  #include "trace.h"
>>>>>  #include "qapi-event.h"
>>>>>  #include "hw/s390x/s390-pci-inst.h"
>>>>> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
>>>>>      return ret;
>>>>>  }
>>>>>  
>>>>> +static void release_all_rams(void)
>>>>
>>>> s/rams/ram/ maybe?
>>>>
>>>>> +{
>>>>> +    struct RAMBlock *rb;
>>>>> +
>>>>> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
>>>>> +        ram_block_discard_range(rb, 0, rb->used_length);
>>>>
>>>> From a coding style point of view, I think there should be curly braces
>>>> around ram_block_discard_range() ?
>>>
>>> I think this might break if it happens during a postcopy migrate.
>>> The destination CPU is running, so it can do a reboot at just the wrong
>>> time; and then the pages (that are protected by userfaultfd) would get
>>> deallocated and trigger userfaultfd requests if accessed.
>>
>> Yes, userfaultd/postcopy is really fragile and relies on things that are not
>> necessarily true (e.g. virito-balloon can also invalidate pages).
> 
> That's why we use qemu_balloon_inhibit around postcopy to stop
> ballooning; I'm not aware of anything else that does the same.

we also have at least the pte_unused thing in mm/rmap.c that clearly
predates userfaultfd. We might need to look into this as well....

> 
>> The right thing here would be to actually terminate the postcopy migrate but
>> return it as "successful" (since we are going to clear that RAM anyway). Do 
>> you see a good way to achieve that?
> 
> There's no current mechanism to do it; I think it would have to involve
> some interaction with the source as well though to tell it that you
> didn't need that area of RAM anyway.
> 
> However, there are more problems:
>   a) Even forgetting the userfault problem, this is racy since during
> postcopy you're still receiving blocks from the source at the same time;
> so some of the area that you've discarded might get overwritten by data
> from the source.

So how do you handle the case when the target system writes to memory
that is still in flight? Can we build on that mechanism?
> 
>   b) Your release_all_rams seems to do all RAM Blocks - won't that nuke
> any ROMs as well? Or maybe even flash?

ROMs loaded with load_elf (like our s390-ccw.img) are reloaded on every reset.
See rom_reset in /hw/core/loader.c

Is this different with the x86 bios?


> 
>   c) In a normal precopy migration, I think you may also get old data;
> Paolo said that an MADV_DONTNEED won't cause the dirty flags to be set,
> so if the migrate has already sent the data for a page, and then this
> happens, before the CPUs are stopped during the migration, when you
> restart on the destination you'll have the old data.

Yes, looks like we might get non-cleared data. Could we maybe combine fixing
and optimizing: we can stop tranmitting the memory and do a clean
startup on the target side. In other words could we actually use the
reset clear trigger to speed up migration?



> 
> Dave
> 
>>
>>>
>>> Dave
>>>
>>>>> +}
>>>>> +
>>>>>  int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>>>>>  {
>>>>>      S390CPU *cpu = S390_CPU(cs);
>>>>> @@ -1853,6 +1864,14 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>>>>>              ret = handle_intercept(cpu);
>>>>>              break;
>>>>>          case KVM_EXIT_S390_RESET:
>>>>> +            if (run->s390_reset_flags & KVM_S390_RESET_CLEAR) {
>>>>> +                /*
>>>>> +                 * We will stop other CPUs anyway, avoid spurious crashes and
>>>>> +                 * get all CPUs out. The reset will take care of the resume.
>>>>> +                 */
>>>>> +                pause_all_vcpus();
>>>>> +                release_all_rams();
>>>>> +            }
>>>>>              s390_reipl_request();
>>>>>              break;
>>>>>          case KVM_EXIT_S390_TSCH:
>>>>>
>>>>
>>>> Apart from the cosmetic nits, patch looks good to me.
>>>>
>>>>  Thomas
>>> --
>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>>
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Dr. David Alan Gilbert March 1, 2018, 12:28 p.m. UTC | #9
* Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> 
> 
> On 03/01/2018 12:45 PM, Dr. David Alan Gilbert wrote:
> > * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> >>
> >>
> >> On 03/01/2018 10:24 AM, Dr. David Alan Gilbert wrote:
> >>> * Thomas Huth (thuth@redhat.com) wrote:
> >>>> On 28.02.2018 20:53, Christian Borntraeger wrote:
> >>>>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
> >>>>> to be cleared. We did not do it so far. This does not only violate the
> >>>>> architecture, it also misses the chance to free up that memory on
> >>>>> reboot, which would help on host memory over commitment.  By using
> >>>>> ram_block_discard_range we can cover both cases.
> >>>>
> >>>> Sounds like a good idea. I wonder whether that release_all_ram()
> >>>> function should maybe rather reside in exec.c, so that other machines
> >>>> that want to clear all RAM at reset time can use it, too?
> >>>>
> >>>>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> >>>>> ---
> >>>>>  target/s390x/kvm.c | 19 +++++++++++++++++++
> >>>>>  1 file changed, 19 insertions(+)
> >>>>>
> >>>>> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> >>>>> index 8f3a422288..2e145ad5c3 100644
> >>>>> --- a/target/s390x/kvm.c
> >>>>> +++ b/target/s390x/kvm.c
> >>>>> @@ -34,6 +34,8 @@
> >>>>>  #include "qapi/error.h"
> >>>>>  #include "qemu/error-report.h"
> >>>>>  #include "qemu/timer.h"
> >>>>> +#include "qemu/rcu_queue.h"
> >>>>> +#include "sysemu/cpus.h"
> >>>>>  #include "sysemu/sysemu.h"
> >>>>>  #include "sysemu/hw_accel.h"
> >>>>>  #include "hw/boards.h"
> >>>>> @@ -41,6 +43,7 @@
> >>>>>  #include "sysemu/device_tree.h"
> >>>>>  #include "exec/gdbstub.h"
> >>>>>  #include "exec/address-spaces.h"
> >>>>> +#include "exec/ram_addr.h"
> >>>>>  #include "trace.h"
> >>>>>  #include "qapi-event.h"
> >>>>>  #include "hw/s390x/s390-pci-inst.h"
> >>>>> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
> >>>>>      return ret;
> >>>>>  }
> >>>>>  
> >>>>> +static void release_all_rams(void)
> >>>>
> >>>> s/rams/ram/ maybe?
> >>>>
> >>>>> +{
> >>>>> +    struct RAMBlock *rb;
> >>>>> +
> >>>>> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
> >>>>> +        ram_block_discard_range(rb, 0, rb->used_length);
> >>>>
> >>>> From a coding style point of view, I think there should be curly braces
> >>>> around ram_block_discard_range() ?
> >>>
> >>> I think this might break if it happens during a postcopy migrate.
> >>> The destination CPU is running, so it can do a reboot at just the wrong
> >>> time; and then the pages (that are protected by userfaultfd) would get
> >>> deallocated and trigger userfaultfd requests if accessed.
> >>
> >> Yes, userfaultd/postcopy is really fragile and relies on things that are not
> >> necessarily true (e.g. virito-balloon can also invalidate pages).
> > 
> > That's why we use qemu_balloon_inhibit around postcopy to stop
> > ballooning; I'm not aware of anything else that does the same.
> 
> we also have at least the pte_unused thing in mm/rmap.c that clearly
> predates userfaultfd. We might need to look into this as well....

I've not come across that; what does that do?

> > 
> >> The right thing here would be to actually terminate the postcopy migrate but
> >> return it as "successful" (since we are going to clear that RAM anyway). Do 
> >> you see a good way to achieve that?
> > 
> > There's no current mechanism to do it; I think it would have to involve
> > some interaction with the source as well though to tell it that you
> > didn't need that area of RAM anyway.
> > 
> > However, there are more problems:
> >   a) Even forgetting the userfault problem, this is racy since during
> > postcopy you're still receiving blocks from the source at the same time;
> > so some of the area that you've discarded might get overwritten by data
> > from the source.
> 
> So how do you handle the case when the target system writes to memory
> that is still in flight? Can we build on that mechanism?

Once we've entered postcopy, a page is basically in one of two states:
   a) Not yet received - i.e. marked absent with MADV_DONTNEED;  if the
guest tries to write to it then it'll block with userfault and ask the
source for the page; so the write wont happen until the page arrives.
   b) Received - we've already got the page from the source; the source
never resends a page (once in postcopy) so now the destination can just
write to the page.

Once in postcopy, a page is received at most once (i.e. if it's not
been received during precopy).

I can imagine two ways of curing it:
   a) Simple but slow;  just read all the pages before doing the
discard,  this forces it to wait for the pages to be received.
   b) More complex but fast;  Add a message on the return path to the
source telling it that you're going to discard a range; the source then
marks it's notes as cleared for those pages and then sends some form of
ack, and at that point you drop it.

A 3rd; incomplete way; would be just to drop the userfaultfd on the
destination for the RAMBlocks that are being cleared;  but this does
leave the source state in a bit of a mess.


> >   b) Your release_all_rams seems to do all RAM Blocks - won't that nuke
> > any ROMs as well? Or maybe even flash?
> 
> ROMs loaded with load_elf (like our s390-ccw.img) are reloaded on every reset.
> See rom_reset in /hw/core/loader.c

Ah, so this is happening after your reset code you've added?

> Is this different with the x86 bios?

Not sure; I know x86 keeps some mirrored copies of ROMs across
reboots but I don't fully understand the mechanisms we use.
But the other case I was thinking of was stuff like pflash on x86 which
are the flash images holding variable data.
(Also watch out for the way ram_block_discard_range deals with file
backed memory; discarding is actually quite hard in some cases).

> >   c) In a normal precopy migration, I think you may also get old data;
> > Paolo said that an MADV_DONTNEED won't cause the dirty flags to be set,
> > so if the migrate has already sent the data for a page, and then this
> > happens, before the CPUs are stopped during the migration, when you
> > restart on the destination you'll have the old data.
> 
> Yes, looks like we might get non-cleared data. Could we maybe combine fixing
> and optimizing: we can stop tranmitting the memory and do a clean
> startup on the target side. In other words could we actually use the
> reset clear trigger to speed up migration?

They're separate problems because they happen on opposite sides; on
the source you've got a chance of doing that type of hack, but it would
be a bit invasive.

Dave

> 
> 
> 
> > 
> > Dave
> > 
> >>
> >>>
> >>> Dave
> >>>
> >>>>> +}
> >>>>> +
> >>>>>  int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
> >>>>>  {
> >>>>>      S390CPU *cpu = S390_CPU(cs);
> >>>>> @@ -1853,6 +1864,14 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
> >>>>>              ret = handle_intercept(cpu);
> >>>>>              break;
> >>>>>          case KVM_EXIT_S390_RESET:
> >>>>> +            if (run->s390_reset_flags & KVM_S390_RESET_CLEAR) {
> >>>>> +                /*
> >>>>> +                 * We will stop other CPUs anyway, avoid spurious crashes and
> >>>>> +                 * get all CPUs out. The reset will take care of the resume.
> >>>>> +                 */
> >>>>> +                pause_all_vcpus();
> >>>>> +                release_all_rams();
> >>>>> +            }
> >>>>>              s390_reipl_request();
> >>>>>              break;
> >>>>>          case KVM_EXIT_S390_TSCH:
> >>>>>
> >>>>
> >>>> Apart from the cosmetic nits, patch looks good to me.
> >>>>
> >>>>  Thomas
> >>> --
> >>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> >>>
> >>
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Christian Borntraeger March 1, 2018, 12:35 p.m. UTC | #10
On 03/01/2018 01:28 PM, Dr. David Alan Gilbert wrote:
> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
>>
>>
>> On 03/01/2018 12:45 PM, Dr. David Alan Gilbert wrote:
>>> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
>>>>
>>>>
>>>> On 03/01/2018 10:24 AM, Dr. David Alan Gilbert wrote:
>>>>> * Thomas Huth (thuth@redhat.com) wrote:
>>>>>> On 28.02.2018 20:53, Christian Borntraeger wrote:
>>>>>>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
>>>>>>> to be cleared. We did not do it so far. This does not only violate the
>>>>>>> architecture, it also misses the chance to free up that memory on
>>>>>>> reboot, which would help on host memory over commitment.  By using
>>>>>>> ram_block_discard_range we can cover both cases.
>>>>>>
>>>>>> Sounds like a good idea. I wonder whether that release_all_ram()
>>>>>> function should maybe rather reside in exec.c, so that other machines
>>>>>> that want to clear all RAM at reset time can use it, too?
>>>>>>
>>>>>>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
>>>>>>> ---
>>>>>>>  target/s390x/kvm.c | 19 +++++++++++++++++++
>>>>>>>  1 file changed, 19 insertions(+)
>>>>>>>
>>>>>>> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
>>>>>>> index 8f3a422288..2e145ad5c3 100644
>>>>>>> --- a/target/s390x/kvm.c
>>>>>>> +++ b/target/s390x/kvm.c
>>>>>>> @@ -34,6 +34,8 @@
>>>>>>>  #include "qapi/error.h"
>>>>>>>  #include "qemu/error-report.h"
>>>>>>>  #include "qemu/timer.h"
>>>>>>> +#include "qemu/rcu_queue.h"
>>>>>>> +#include "sysemu/cpus.h"
>>>>>>>  #include "sysemu/sysemu.h"
>>>>>>>  #include "sysemu/hw_accel.h"
>>>>>>>  #include "hw/boards.h"
>>>>>>> @@ -41,6 +43,7 @@
>>>>>>>  #include "sysemu/device_tree.h"
>>>>>>>  #include "exec/gdbstub.h"
>>>>>>>  #include "exec/address-spaces.h"
>>>>>>> +#include "exec/ram_addr.h"
>>>>>>>  #include "trace.h"
>>>>>>>  #include "qapi-event.h"
>>>>>>>  #include "hw/s390x/s390-pci-inst.h"
>>>>>>> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
>>>>>>>      return ret;
>>>>>>>  }
>>>>>>>  
>>>>>>> +static void release_all_rams(void)
>>>>>>
>>>>>> s/rams/ram/ maybe?
>>>>>>
>>>>>>> +{
>>>>>>> +    struct RAMBlock *rb;
>>>>>>> +
>>>>>>> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
>>>>>>> +        ram_block_discard_range(rb, 0, rb->used_length);
>>>>>>
>>>>>> From a coding style point of view, I think there should be curly braces
>>>>>> around ram_block_discard_range() ?
>>>>>
>>>>> I think this might break if it happens during a postcopy migrate.
>>>>> The destination CPU is running, so it can do a reboot at just the wrong
>>>>> time; and then the pages (that are protected by userfaultfd) would get
>>>>> deallocated and trigger userfaultfd requests if accessed.
>>>>
>>>> Yes, userfaultd/postcopy is really fragile and relies on things that are not
>>>> necessarily true (e.g. virito-balloon can also invalidate pages).
>>>
>>> That's why we use qemu_balloon_inhibit around postcopy to stop
>>> ballooning; I'm not aware of anything else that does the same.
>>
>> we also have at least the pte_unused thing in mm/rmap.c that clearly
>> predates userfaultfd. We might need to look into this as well....
> 
> I've not come across that; what does that do?

It can drop a page on page out if the page is no longer of value. It is used by
the CMMA (guest page hinting) code of s390x.

see kernel mm/rmap.c


static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
[...]
                } else if (pte_unused(pteval)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
                         * will take care of the rest.
                         */
			dec_mm_counter(mm, mm_counter(page));
                        /* We have to invalidate as we cleared the pte */
                        mmu_notifier_invalidate_range(mm, address,
                                                      address + PAGE_SIZE);
                } else if (IS_ENABLED(CONFIG_MIGRATION) &&
                                (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
[...]


> 
>>>
>>>> The right thing here would be to actually terminate the postcopy migrate but
>>>> return it as "successful" (since we are going to clear that RAM anyway). Do 
>>>> you see a good way to achieve that?
>>>
>>> There's no current mechanism to do it; I think it would have to involve
>>> some interaction with the source as well though to tell it that you
>>> didn't need that area of RAM anyway.
>>>
>>> However, there are more problems:
>>>   a) Even forgetting the userfault problem, this is racy since during
>>> postcopy you're still receiving blocks from the source at the same time;
>>> so some of the area that you've discarded might get overwritten by data
>>> from the source.
>>
>> So how do you handle the case when the target system writes to memory
>> that is still in flight? Can we build on that mechanism?
> 
> Once we've entered postcopy, a page is basically in one of two states:
>    a) Not yet received - i.e. marked absent with MADV_DONTNEED;  if the
> guest tries to write to it then it'll block with userfault and ask the
> source for the page; so the write wont happen until the page arrives.
>    b) Received - we've already got the page from the source; the source
> never resends a page (once in postcopy) so now the destination can just
> write to the page.
> 
> Once in postcopy, a page is received at most once (i.e. if it's not
> been received during precopy).
> 
> I can imagine two ways of curing it:
>    a) Simple but slow;  just read all the pages before doing the
> discard,  this forces it to wait for the pages to be received.
>    b) More complex but fast;  Add a message on the return path to the
> source telling it that you're going to discard a range; the source then
> marks it's notes as cleared for those pages and then sends some form of
> ack, and at that point you drop it.

this looks like the most promising approach, but some work.

> 
> A 3rd; incomplete way; would be just to drop the userfaultfd on the
> destination for the RAMBlocks that are being cleared;  but this does
> leave the source state in a bit of a mess.
> 
> 
>>>   b) Your release_all_rams seems to do all RAM Blocks - won't that nuke
>>> any ROMs as well? Or maybe even flash?
>>
>> ROMs loaded with load_elf (like our s390-ccw.img) are reloaded on every reset.
>> See rom_reset in /hw/core/loader.c
> 
> Ah, so this is happening after your reset code you've added?

Yes, I am stopping all CPU, clear the memory. And then I call system_reset.
Christian Borntraeger March 1, 2018, 12:39 p.m. UTC | #11
On 03/01/2018 01:35 PM, Christian Borntraeger wrote:
> 
> 
> On 03/01/2018 01:28 PM, Dr. David Alan Gilbert wrote:
>> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
>>>
>>>
>>> On 03/01/2018 12:45 PM, Dr. David Alan Gilbert wrote:
>>>> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
>>>>>
>>>>>
>>>>> On 03/01/2018 10:24 AM, Dr. David Alan Gilbert wrote:
>>>>>> * Thomas Huth (thuth@redhat.com) wrote:
>>>>>>> On 28.02.2018 20:53, Christian Borntraeger wrote:
>>>>>>>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
>>>>>>>> to be cleared. We did not do it so far. This does not only violate the
>>>>>>>> architecture, it also misses the chance to free up that memory on
>>>>>>>> reboot, which would help on host memory over commitment.  By using
>>>>>>>> ram_block_discard_range we can cover both cases.
>>>>>>>
>>>>>>> Sounds like a good idea. I wonder whether that release_all_ram()
>>>>>>> function should maybe rather reside in exec.c, so that other machines
>>>>>>> that want to clear all RAM at reset time can use it, too?
>>>>>>>
>>>>>>>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
>>>>>>>> ---
>>>>>>>>  target/s390x/kvm.c | 19 +++++++++++++++++++
>>>>>>>>  1 file changed, 19 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
>>>>>>>> index 8f3a422288..2e145ad5c3 100644
>>>>>>>> --- a/target/s390x/kvm.c
>>>>>>>> +++ b/target/s390x/kvm.c
>>>>>>>> @@ -34,6 +34,8 @@
>>>>>>>>  #include "qapi/error.h"
>>>>>>>>  #include "qemu/error-report.h"
>>>>>>>>  #include "qemu/timer.h"
>>>>>>>> +#include "qemu/rcu_queue.h"
>>>>>>>> +#include "sysemu/cpus.h"
>>>>>>>>  #include "sysemu/sysemu.h"
>>>>>>>>  #include "sysemu/hw_accel.h"
>>>>>>>>  #include "hw/boards.h"
>>>>>>>> @@ -41,6 +43,7 @@
>>>>>>>>  #include "sysemu/device_tree.h"
>>>>>>>>  #include "exec/gdbstub.h"
>>>>>>>>  #include "exec/address-spaces.h"
>>>>>>>> +#include "exec/ram_addr.h"
>>>>>>>>  #include "trace.h"
>>>>>>>>  #include "qapi-event.h"
>>>>>>>>  #include "hw/s390x/s390-pci-inst.h"
>>>>>>>> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
>>>>>>>>      return ret;
>>>>>>>>  }
>>>>>>>>  
>>>>>>>> +static void release_all_rams(void)
>>>>>>>
>>>>>>> s/rams/ram/ maybe?
>>>>>>>
>>>>>>>> +{
>>>>>>>> +    struct RAMBlock *rb;
>>>>>>>> +
>>>>>>>> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
>>>>>>>> +        ram_block_discard_range(rb, 0, rb->used_length);
>>>>>>>
>>>>>>> From a coding style point of view, I think there should be curly braces
>>>>>>> around ram_block_discard_range() ?
>>>>>>
>>>>>> I think this might break if it happens during a postcopy migrate.
>>>>>> The destination CPU is running, so it can do a reboot at just the wrong
>>>>>> time; and then the pages (that are protected by userfaultfd) would get
>>>>>> deallocated and trigger userfaultfd requests if accessed.
>>>>>
>>>>> Yes, userfaultd/postcopy is really fragile and relies on things that are not
>>>>> necessarily true (e.g. virito-balloon can also invalidate pages).
>>>>
>>>> That's why we use qemu_balloon_inhibit around postcopy to stop
>>>> ballooning; I'm not aware of anything else that does the same.
>>>
>>> we also have at least the pte_unused thing in mm/rmap.c that clearly
>>> predates userfaultfd. We might need to look into this as well....
>>
>> I've not come across that; what does that do?
> 
> It can drop a page on page out if the page is no longer of value. It is used by
> the CMMA (guest page hinting) code of s390x.
> 
> see kernel mm/rmap.c
> 
> 
> static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
>                      unsigned long address, void *arg)
> {
> [...]
>                 } else if (pte_unused(pteval)) {
>                         /*
>                          * The guest indicated that the page content is of no
>                          * interest anymore. Simply discard the pte, vmscan
>                          * will take care of the rest.
>                          */
> 			dec_mm_counter(mm, mm_counter(page));
>                         /* We have to invalidate as we cleared the pte */
>                         mmu_notifier_invalidate_range(mm, address,
>                                                       address + PAGE_SIZE);
>                 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
>                                 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
> [...]
> 
> 

Maybe something like this in the kernel

diff --git a/mm/rmap.c b/mm/rmap.c
index 47db27f8049e..9bdf4d448987 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1483,7 +1483,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }
 
-               } else if (pte_unused(pteval)) {
+               } else if (pte_unused(pteval) && !vma->vm_userfaultfd_ctx.ctx) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan


could help?
Dr. David Alan Gilbert March 1, 2018, 12:49 p.m. UTC | #12
* Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> 
> 
> On 03/01/2018 01:28 PM, Dr. David Alan Gilbert wrote:
> > * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> >>
> >>
> >> On 03/01/2018 12:45 PM, Dr. David Alan Gilbert wrote:
> >>> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> >>>>
> >>>>
> >>>> On 03/01/2018 10:24 AM, Dr. David Alan Gilbert wrote:
> >>>>> * Thomas Huth (thuth@redhat.com) wrote:
> >>>>>> On 28.02.2018 20:53, Christian Borntraeger wrote:
> >>>>>>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
> >>>>>>> to be cleared. We did not do it so far. This does not only violate the
> >>>>>>> architecture, it also misses the chance to free up that memory on
> >>>>>>> reboot, which would help on host memory over commitment.  By using
> >>>>>>> ram_block_discard_range we can cover both cases.
> >>>>>>
> >>>>>> Sounds like a good idea. I wonder whether that release_all_ram()
> >>>>>> function should maybe rather reside in exec.c, so that other machines
> >>>>>> that want to clear all RAM at reset time can use it, too?
> >>>>>>
> >>>>>>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> >>>>>>> ---
> >>>>>>>  target/s390x/kvm.c | 19 +++++++++++++++++++
> >>>>>>>  1 file changed, 19 insertions(+)
> >>>>>>>
> >>>>>>> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> >>>>>>> index 8f3a422288..2e145ad5c3 100644
> >>>>>>> --- a/target/s390x/kvm.c
> >>>>>>> +++ b/target/s390x/kvm.c
> >>>>>>> @@ -34,6 +34,8 @@
> >>>>>>>  #include "qapi/error.h"
> >>>>>>>  #include "qemu/error-report.h"
> >>>>>>>  #include "qemu/timer.h"
> >>>>>>> +#include "qemu/rcu_queue.h"
> >>>>>>> +#include "sysemu/cpus.h"
> >>>>>>>  #include "sysemu/sysemu.h"
> >>>>>>>  #include "sysemu/hw_accel.h"
> >>>>>>>  #include "hw/boards.h"
> >>>>>>> @@ -41,6 +43,7 @@
> >>>>>>>  #include "sysemu/device_tree.h"
> >>>>>>>  #include "exec/gdbstub.h"
> >>>>>>>  #include "exec/address-spaces.h"
> >>>>>>> +#include "exec/ram_addr.h"
> >>>>>>>  #include "trace.h"
> >>>>>>>  #include "qapi-event.h"
> >>>>>>>  #include "hw/s390x/s390-pci-inst.h"
> >>>>>>> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
> >>>>>>>      return ret;
> >>>>>>>  }
> >>>>>>>  
> >>>>>>> +static void release_all_rams(void)
> >>>>>>
> >>>>>> s/rams/ram/ maybe?
> >>>>>>
> >>>>>>> +{
> >>>>>>> +    struct RAMBlock *rb;
> >>>>>>> +
> >>>>>>> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
> >>>>>>> +        ram_block_discard_range(rb, 0, rb->used_length);
> >>>>>>
> >>>>>> From a coding style point of view, I think there should be curly braces
> >>>>>> around ram_block_discard_range() ?
> >>>>>
> >>>>> I think this might break if it happens during a postcopy migrate.
> >>>>> The destination CPU is running, so it can do a reboot at just the wrong
> >>>>> time; and then the pages (that are protected by userfaultfd) would get
> >>>>> deallocated and trigger userfaultfd requests if accessed.
> >>>>
> >>>> Yes, userfaultd/postcopy is really fragile and relies on things that are not
> >>>> necessarily true (e.g. virito-balloon can also invalidate pages).
> >>>
> >>> That's why we use qemu_balloon_inhibit around postcopy to stop
> >>> ballooning; I'm not aware of anything else that does the same.
> >>
> >> we also have at least the pte_unused thing in mm/rmap.c that clearly
> >> predates userfaultfd. We might need to look into this as well....
> > 
> > I've not come across that; what does that do?
> 
> It can drop a page on page out if the page is no longer of value. It is used by
> the CMMA (guest page hinting) code of s390x.
> 
> see kernel mm/rmap.c
> 
> 
> static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
>                      unsigned long address, void *arg)
> {
> [...]
>                 } else if (pte_unused(pteval)) {
>                         /*
>                          * The guest indicated that the page content is of no
>                          * interest anymore. Simply discard the pte, vmscan
>                          * will take care of the rest.
>                          */
> 			dec_mm_counter(mm, mm_counter(page));
>                         /* We have to invalidate as we cleared the pte */
>                         mmu_notifier_invalidate_range(mm, address,
>                                                       address + PAGE_SIZE);
>                 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
>                                 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
> [...]

OK, probably best to check with Andrea what the best way to get that
happy with userfault is.

> > 
> >>>
> >>>> The right thing here would be to actually terminate the postcopy migrate but
> >>>> return it as "successful" (since we are going to clear that RAM anyway). Do 
> >>>> you see a good way to achieve that?
> >>>
> >>> There's no current mechanism to do it; I think it would have to involve
> >>> some interaction with the source as well though to tell it that you
> >>> didn't need that area of RAM anyway.
> >>>
> >>> However, there are more problems:
> >>>   a) Even forgetting the userfault problem, this is racy since during
> >>> postcopy you're still receiving blocks from the source at the same time;
> >>> so some of the area that you've discarded might get overwritten by data
> >>> from the source.
> >>
> >> So how do you handle the case when the target system writes to memory
> >> that is still in flight? Can we build on that mechanism?
> > 
> > Once we've entered postcopy, a page is basically in one of two states:
> >    a) Not yet received - i.e. marked absent with MADV_DONTNEED;  if the
> > guest tries to write to it then it'll block with userfault and ask the
> > source for the page; so the write wont happen until the page arrives.
> >    b) Received - we've already got the page from the source; the source
> > never resends a page (once in postcopy) so now the destination can just
> > write to the page.
> > 
> > Once in postcopy, a page is received at most once (i.e. if it's not
> > been received during precopy).
> > 
> > I can imagine two ways of curing it:
> >    a) Simple but slow;  just read all the pages before doing the
> > discard,  this forces it to wait for the pages to be received.
> >    b) More complex but fast;  Add a message on the return path to the
> > source telling it that you're going to discard a range; the source then
> > marks it's notes as cleared for those pages and then sends some form of
> > ack, and at that point you drop it.
> 
> this looks like the most promising approach, but some work.

Yes, you can add a new MIG_RP_MSG_ type for the destination to tell the
source; that's pretty easy.  Remember that there will still be pages in
flight after you've sent this message so you'll have to wait for those
to clear out.

> > 
> > A 3rd; incomplete way; would be just to drop the userfaultfd on the
> > destination for the RAMBlocks that are being cleared;  but this does
> > leave the source state in a bit of a mess.
> > 
> > 
> >>>   b) Your release_all_rams seems to do all RAM Blocks - won't that nuke
> >>> any ROMs as well? Or maybe even flash?
> >>
> >> ROMs loaded with load_elf (like our s390-ccw.img) are reloaded on every reset.
> >> See rom_reset in /hw/core/loader.c
> > 
> > Ah, so this is happening after your reset code you've added?
> 
> Yes, I am stopping all CPU, clear the memory. And then I call system_reset.

OK.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert March 1, 2018, 12:58 p.m. UTC | #13
* Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> 
> 
> On 03/01/2018 01:35 PM, Christian Borntraeger wrote:
> > 
> > 
> > On 03/01/2018 01:28 PM, Dr. David Alan Gilbert wrote:
> >> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> >>>
> >>>
> >>> On 03/01/2018 12:45 PM, Dr. David Alan Gilbert wrote:
> >>>> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> >>>>>
> >>>>>
> >>>>> On 03/01/2018 10:24 AM, Dr. David Alan Gilbert wrote:
> >>>>>> * Thomas Huth (thuth@redhat.com) wrote:
> >>>>>>> On 28.02.2018 20:53, Christian Borntraeger wrote:
> >>>>>>>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
> >>>>>>>> to be cleared. We did not do it so far. This does not only violate the
> >>>>>>>> architecture, it also misses the chance to free up that memory on
> >>>>>>>> reboot, which would help on host memory over commitment.  By using
> >>>>>>>> ram_block_discard_range we can cover both cases.
> >>>>>>>
> >>>>>>> Sounds like a good idea. I wonder whether that release_all_ram()
> >>>>>>> function should maybe rather reside in exec.c, so that other machines
> >>>>>>> that want to clear all RAM at reset time can use it, too?
> >>>>>>>
> >>>>>>>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> >>>>>>>> ---
> >>>>>>>>  target/s390x/kvm.c | 19 +++++++++++++++++++
> >>>>>>>>  1 file changed, 19 insertions(+)
> >>>>>>>>
> >>>>>>>> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> >>>>>>>> index 8f3a422288..2e145ad5c3 100644
> >>>>>>>> --- a/target/s390x/kvm.c
> >>>>>>>> +++ b/target/s390x/kvm.c
> >>>>>>>> @@ -34,6 +34,8 @@
> >>>>>>>>  #include "qapi/error.h"
> >>>>>>>>  #include "qemu/error-report.h"
> >>>>>>>>  #include "qemu/timer.h"
> >>>>>>>> +#include "qemu/rcu_queue.h"
> >>>>>>>> +#include "sysemu/cpus.h"
> >>>>>>>>  #include "sysemu/sysemu.h"
> >>>>>>>>  #include "sysemu/hw_accel.h"
> >>>>>>>>  #include "hw/boards.h"
> >>>>>>>> @@ -41,6 +43,7 @@
> >>>>>>>>  #include "sysemu/device_tree.h"
> >>>>>>>>  #include "exec/gdbstub.h"
> >>>>>>>>  #include "exec/address-spaces.h"
> >>>>>>>> +#include "exec/ram_addr.h"
> >>>>>>>>  #include "trace.h"
> >>>>>>>>  #include "qapi-event.h"
> >>>>>>>>  #include "hw/s390x/s390-pci-inst.h"
> >>>>>>>> @@ -1841,6 +1844,14 @@ static int kvm_arch_handle_debug_exit(S390CPU *cpu)
> >>>>>>>>      return ret;
> >>>>>>>>  }
> >>>>>>>>  
> >>>>>>>> +static void release_all_rams(void)
> >>>>>>>
> >>>>>>> s/rams/ram/ maybe?
> >>>>>>>
> >>>>>>>> +{
> >>>>>>>> +    struct RAMBlock *rb;
> >>>>>>>> +
> >>>>>>>> +    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
> >>>>>>>> +        ram_block_discard_range(rb, 0, rb->used_length);
> >>>>>>>
> >>>>>>> From a coding style point of view, I think there should be curly braces
> >>>>>>> around ram_block_discard_range() ?
> >>>>>>
> >>>>>> I think this might break if it happens during a postcopy migrate.
> >>>>>> The destination CPU is running, so it can do a reboot at just the wrong
> >>>>>> time; and then the pages (that are protected by userfaultfd) would get
> >>>>>> deallocated and trigger userfaultfd requests if accessed.
> >>>>>
> >>>>> Yes, userfaultd/postcopy is really fragile and relies on things that are not
> >>>>> necessarily true (e.g. virito-balloon can also invalidate pages).
> >>>>
> >>>> That's why we use qemu_balloon_inhibit around postcopy to stop
> >>>> ballooning; I'm not aware of anything else that does the same.
> >>>
> >>> we also have at least the pte_unused thing in mm/rmap.c that clearly
> >>> predates userfaultfd. We might need to look into this as well....
> >>
> >> I've not come across that; what does that do?
> > 
> > It can drop a page on page out if the page is no longer of value. It is used by
> > the CMMA (guest page hinting) code of s390x.
> > 
> > see kernel mm/rmap.c
> > 
> > 
> > static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
> >                      unsigned long address, void *arg)
> > {
> > [...]
> >                 } else if (pte_unused(pteval)) {
> >                         /*
> >                          * The guest indicated that the page content is of no
> >                          * interest anymore. Simply discard the pte, vmscan
> >                          * will take care of the rest.
> >                          */
> > 			dec_mm_counter(mm, mm_counter(page));
> >                         /* We have to invalidate as we cleared the pte */
> >                         mmu_notifier_invalidate_range(mm, address,
> >                                                       address + PAGE_SIZE);
> >                 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
> >                                 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
> > [...]
> > 
> > 
> 
> Maybe something like this in the kernel
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 47db27f8049e..9bdf4d448987 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1483,7 +1483,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
>                                 set_pte_at(mm, address, pvmw.pte, pteval);
>                         }
>  
> -               } else if (pte_unused(pteval)) {
> +               } else if (pte_unused(pteval) && !vma->vm_userfaultfd_ctx.ctx) {
>                         /*
>                          * The guest indicated that the page content is of no
>                          * interest anymore. Simply discard the pte, vmscan
> 
> 
> could help?

I guess so, but please check with aarcange; I don't know the mm code.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Cornelia Huck March 5, 2018, 12:54 p.m. UTC | #14
On Wed, 28 Feb 2018 19:53:20 +0000
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> When a guests reboots with diagnose 308 subcode 3 it requests the memory
> to be cleared. We did not do it so far. This does not only violate the
> architecture, it also misses the chance to free up that memory on
> reboot, which would help on host memory over commitment.  By using
> ram_block_discard_range we can cover both cases.
> 
> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  target/s390x/kvm.c | 19 +++++++++++++++++++
>  1 file changed, 19 insertions(+)

So, this needs more thought (and additional kernel changes)? (Trying to
pick up patches that make sense for 2.12.)
Christian Borntraeger March 5, 2018, 1:04 p.m. UTC | #15
On 03/05/2018 01:54 PM, Cornelia Huck wrote:
> On Wed, 28 Feb 2018 19:53:20 +0000
> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> 
>> When a guests reboots with diagnose 308 subcode 3 it requests the memory
>> to be cleared. We did not do it so far. This does not only violate the
>> architecture, it also misses the chance to free up that memory on
>> reboot, which would help on host memory over commitment.  By using
>> ram_block_discard_range we can cover both cases.
>>
>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
>> ---
>>  target/s390x/kvm.c | 19 +++++++++++++++++++
>>  1 file changed, 19 insertions(+)
> 
> So, this needs more thought (and additional kernel changes)? (Trying to
> pick up patches that make sense for 2.12.)

Yes, lets defer this.
diff mbox series

Patch

diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
index 8f3a422288..2e145ad5c3 100644
--- a/target/s390x/kvm.c
+++ b/target/s390x/kvm.c
@@ -34,6 +34,8 @@ 
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qemu/timer.h"
+#include "qemu/rcu_queue.h"
+#include "sysemu/cpus.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/hw_accel.h"
 #include "hw/boards.h"
@@ -41,6 +43,7 @@ 
 #include "sysemu/device_tree.h"
 #include "exec/gdbstub.h"
 #include "exec/address-spaces.h"
+#include "exec/ram_addr.h"
 #include "trace.h"
 #include "qapi-event.h"
 #include "hw/s390x/s390-pci-inst.h"
@@ -1841,6 +1844,14 @@  static int kvm_arch_handle_debug_exit(S390CPU *cpu)
     return ret;
 }
 
+static void release_all_rams(void)
+{
+    struct RAMBlock *rb;
+
+    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next)
+        ram_block_discard_range(rb, 0, rb->used_length);
+}
+
 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
 {
     S390CPU *cpu = S390_CPU(cs);
@@ -1853,6 +1864,14 @@  int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
             ret = handle_intercept(cpu);
             break;
         case KVM_EXIT_S390_RESET:
+            if (run->s390_reset_flags & KVM_S390_RESET_CLEAR) {
+                /*
+                 * We will stop other CPUs anyway, avoid spurious crashes and
+                 * get all CPUs out. The reset will take care of the resume.
+                 */
+                pause_all_vcpus();
+                release_all_rams();
+            }
             s390_reipl_request();
             break;
         case KVM_EXIT_S390_TSCH: