Patchwork [10/10] Maintaining number of dirty pages

login
register
mail settings
Submitter Juan Quintela
Date Nov. 23, 2010, 11:03 p.m.
Message ID <411b3f89b6b79b429686bd1b1fa188ba55b6c6f7.1290552026.git.quintela@redhat.com>
Download mbox | patch
Permalink /patch/72777/
State New
Headers show

Comments

Juan Quintela - Nov. 23, 2010, 11:03 p.m.
From: Juan Quintela <quintela@trasno.org>

Calculating the number of dirty pages takes a lot on hosts with lots
of memory.  Just maintain how many pages are dirty.  Only sync bitmaps
if number is small enough.

Signed-off-by: Juan Quintela <quintela@trasno.org>
Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 arch_init.c |   15 +--------------
 cpu-all.h   |    7 +++++++
 exec.c      |    1 +
 3 files changed, 9 insertions(+), 14 deletions(-)
Anthony Liguori - Nov. 30, 2010, 2:13 a.m.
On 11/23/2010 05:03 PM, Juan Quintela wrote:
> From: Juan Quintela<quintela@trasno.org>
>
> Calculate the number of dirty pages takes a lot on hosts with lots
> of memory.  Just maintain how many pages are dirty.  Only sync bitmaps
> if number is small enough.
>    

There needs to be numbers that justify this as part of the commit message.

This only works for KVM because it happens to use set_dirty() whenever 
it dirties memory.  I don't think will work with TCG.

I also think that the fundamental problem is the kernel dirty bitmap.  
Perhaps it should return a multiple level table instead of a simple 
linear bitmap.

Regards,

Anthony Liguori

> Signed-off-by: Juan Quintela<quintela@trasno.org>
> Signed-off-by: Juan Quintela<quintela@redhat.com>
> ---
>   arch_init.c |   15 +--------------
>   cpu-all.h   |    7 +++++++
>   exec.c      |    1 +
>   3 files changed, 9 insertions(+), 14 deletions(-)
>
> diff --git a/arch_init.c b/arch_init.c
> index b463798..c2bc144 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -176,20 +176,7 @@ static uint64_t bytes_transferred;
>
>   static uint64_t ram_save_remaining(void)
>   {
> -    RAMBlock *block;
> -    uint64_t count = 0;
> -
> -    QLIST_FOREACH(block,&ram_list.blocks, next) {
> -        ram_addr_t addr;
> -        for (addr = block->offset; addr<  block->offset + block->length;
> -             addr += TARGET_PAGE_SIZE) {
> -            if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) {
> -                count++;
> -            }
> -        }
> -    }
> -
> -    return count;
> +    return ram_list.dirty_pages;
>   }
>
>   uint64_t ram_bytes_remaining(void)
> diff --git a/cpu-all.h b/cpu-all.h
> index 30ae17d..5dc27c6 100644
> --- a/cpu-all.h
> +++ b/cpu-all.h
> @@ -874,6 +874,7 @@ typedef struct RAMBlock {
>
>   typedef struct RAMList {
>       uint8_t *phys_dirty;
> +    uint64_t dirty_pages;
>       QLIST_HEAD(ram, RAMBlock) blocks;
>   } RAMList;
>   extern RAMList ram_list;
> @@ -922,6 +923,9 @@ static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
>
>   static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
>   {
> +    if (!cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG))
> +        ram_list.dirty_pages++;
> +
>       ram_list.phys_dirty[addr>>  TARGET_PAGE_BITS] = 0xff;
>   }
>
> @@ -942,6 +946,9 @@ static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start,
>       mask = ~dirty_flags;
>       p = ram_list.phys_dirty + (start>>  TARGET_PAGE_BITS);
>       for (i = 0; i<  len; i++) {
> +        if (cpu_physical_memory_get_dirty(start + i * TARGET_PAGE_SIZE,
> +                                          MIGRATION_DIRTY_FLAG&  dirty_flags))
> +            ram_list.dirty_pages--;
>           p[i]&= mask;
>       }
>   }
> diff --git a/exec.c b/exec.c
> index f5b2386..ca2506e 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -2866,6 +2866,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
>                                          last_ram_offset()>>  TARGET_PAGE_BITS);
>       memset(ram_list.phys_dirty + (new_block->offset>>  TARGET_PAGE_BITS),
>              0xff, size>>  TARGET_PAGE_BITS);
> +    ram_list.dirty_pages += size>>  TARGET_PAGE_BITS;
>
>       if (kvm_enabled())
>           kvm_setup_guest_memory(new_block->host, size);
>
Juan Quintela - Nov. 30, 2010, 2:46 p.m.
Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 11/23/2010 05:03 PM, Juan Quintela wrote:
>> From: Juan Quintela<quintela@trasno.org>
>>
>> Calculate the number of dirty pages takes a lot on hosts with lots
>> of memory.  Just maintain how many pages are dirty.  Only sync bitmaps
>> if number is small enough.
>>    
>
> There needs to be numbers that justify this as part of the commit message.

They are on patch 0/6.

Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over
it in each ram_save_live() call is too onerous.

> This only works for KVM because it happens to use set_dirty() whenever
> it dirties memory.  I don't think will work with TCG.

The TCG is already broken with respect to migration.  Nothing else
sets that bitmap nowadays.

> I also think that the fundamental problem is the kernel dirty bitmap.
> Perhaps it should return a multiple level table instead of a simple
> linear bitmap.

KVM interface is suboptimal, no doubt here.  But the bigger problem is
qemu implementation.  Having a byte array where it only uses 2 bits is
wasteful at least.  And having to transform the array forth<->back with
kvm is worse still.

Longer term my plan is:
- make kvm return the number of dirty pages somehow (i.e. no need to
  calculate them)
- split bitmaps, migration bitmap only needs to be handled during
  migration, and CODE bitmap only for TCG.
- interface with kvm would better be something like array of pages
  or array (pages, count).  But I need to measure if there is any
  differences/improvements with one/another.

We were discussing yesterday what made ram_live_block() staying on top
of the profiles, it was this.  This makes ram_save_live() disappear
from profiles.

Later, Juan.

> Regards,
>
> Anthony Liguori
>
>> Signed-off-by: Juan Quintela<quintela@trasno.org>
>> Signed-off-by: Juan Quintela<quintela@redhat.com>
>> ---
>>   arch_init.c |   15 +--------------
>>   cpu-all.h   |    7 +++++++
>>   exec.c      |    1 +
>>   3 files changed, 9 insertions(+), 14 deletions(-)
>>
>> diff --git a/arch_init.c b/arch_init.c
>> index b463798..c2bc144 100644
>> --- a/arch_init.c
>> +++ b/arch_init.c
>> @@ -176,20 +176,7 @@ static uint64_t bytes_transferred;
>>
>>   static uint64_t ram_save_remaining(void)
>>   {
>> -    RAMBlock *block;
>> -    uint64_t count = 0;
>> -
>> -    QLIST_FOREACH(block,&ram_list.blocks, next) {
>> -        ram_addr_t addr;
>> -        for (addr = block->offset; addr<  block->offset + block->length;
>> -             addr += TARGET_PAGE_SIZE) {
>> -            if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) {
>> -                count++;
>> -            }
>> -        }
>> -    }
>> -
>> -    return count;
>> +    return ram_list.dirty_pages;
>>   }
>>
>>   uint64_t ram_bytes_remaining(void)
>> diff --git a/cpu-all.h b/cpu-all.h
>> index 30ae17d..5dc27c6 100644
>> --- a/cpu-all.h
>> +++ b/cpu-all.h
>> @@ -874,6 +874,7 @@ typedef struct RAMBlock {
>>
>>   typedef struct RAMList {
>>       uint8_t *phys_dirty;
>> +    uint64_t dirty_pages;
>>       QLIST_HEAD(ram, RAMBlock) blocks;
>>   } RAMList;
>>   extern RAMList ram_list;
>> @@ -922,6 +923,9 @@ static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
>>
>>   static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
>>   {
>> +    if (!cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG))
>> +        ram_list.dirty_pages++;
>> +
>>       ram_list.phys_dirty[addr>>  TARGET_PAGE_BITS] = 0xff;
>>   }
>>
>> @@ -942,6 +946,9 @@ static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start,
>>       mask = ~dirty_flags;
>>       p = ram_list.phys_dirty + (start>>  TARGET_PAGE_BITS);
>>       for (i = 0; i<  len; i++) {
>> +        if (cpu_physical_memory_get_dirty(start + i * TARGET_PAGE_SIZE,
>> +                                          MIGRATION_DIRTY_FLAG&  dirty_flags))
>> +            ram_list.dirty_pages--;
>>           p[i]&= mask;
>>       }
>>   }
>> diff --git a/exec.c b/exec.c
>> index f5b2386..ca2506e 100644
>> --- a/exec.c
>> +++ b/exec.c
>> @@ -2866,6 +2866,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
>>                                          last_ram_offset()>>  TARGET_PAGE_BITS);
>>       memset(ram_list.phys_dirty + (new_block->offset>>  TARGET_PAGE_BITS),
>>              0xff, size>>  TARGET_PAGE_BITS);
>> +    ram_list.dirty_pages += size>>  TARGET_PAGE_BITS;
>>
>>       if (kvm_enabled())
>>           kvm_setup_guest_memory(new_block->host, size);
>>
Avi Kivity - Dec. 1, 2010, 2:46 p.m.
On 11/30/2010 04:46 PM, Juan Quintela wrote:
> Anthony Liguori<anthony@codemonkey.ws>  wrote:
> >  On 11/23/2010 05:03 PM, Juan Quintela wrote:
> >>  From: Juan Quintela<quintela@trasno.org>
> >>
> >>  Calculate the number of dirty pages takes a lot on hosts with lots
> >>  of memory.  Just maintain how many pages are dirty.  Only sync bitmaps
> >>  if number is small enough.
> >>
> >
> >  There needs to be numbers that justify this as part of the commit message.
>
> They are on patch 0/6.
>
> Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over
> it in each ram_save_live() call is too onerous.

It's not so huge.  It's scaled down by a factor of 8 * 4096 = 32K.  So 
it's a 2MB bitmap.  If kept as a bitmap and accessed in longs, it can be 
read in less than a millisecond.

An optimization can be to look at the previous ram_save_live (which had 
to walk the bitmap).  If old_nr_dirty > 4*target_nr_dirty, assume we 
need one more pass and don't scan the bitmap.

Another optimization is to stop the count when we reach the target; 
instead of ram_save_remaining() have a ram_save_may_stop() which counts 
the number of dirty bits until it reaches target_nr_dirty or exhausts 
the bitmap.
Juan Quintela - Dec. 1, 2010, 3:51 p.m.
Avi Kivity <avi@redhat.com> wrote:
> On 11/30/2010 04:46 PM, Juan Quintela wrote:
>> Anthony Liguori<anthony@codemonkey.ws>  wrote:
>> >  On 11/23/2010 05:03 PM, Juan Quintela wrote:
>> >>  From: Juan Quintela<quintela@trasno.org>
>> >>
>> >>  Calculate the number of dirty pages takes a lot on hosts with lots
>> >>  of memory.  Just maintain how many pages are dirty.  Only sync bitmaps
>> >>  if number is small enough.
>> >>
>> >
>> >  There needs to be numbers that justify this as part of the commit message.
>>
>> They are on patch 0/6.
>>
>> Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over
>> it in each ram_save_live() call is too onerous.
>
> It's not so huge.  It's scaled down by a factor of 8 * 4096 = 32K.  So
> it's a 2MB bitmap.  If kept as a bitmap and accessed in longs, it can
> be read in less than a millisecond.

Going to understand why we need the other bitmaps for kvm.

Basically we have:
- CODE bit: kvm don't care
- VGA bit: not really sure how much we care.  My understanding is that
  the bitmap for the VGA don't need to be as big as all memory.
- MIGRATION bit: we care O:-)

Now, we are setting the MIGRATION in three ways:
- kvm (we sync with that)
- vhost net: it uses a different mechanism, not shared with kvm
- *write[blw].  My understanding is that we care about these ones.  Do we
  really care?

We are also syncing too many bitmaps, we can do a bit less syncing.


> An optimization can be to look at the previous ram_save_live (which
> had to walk the bitmap).  If old_nr_dirty > 4*target_nr_dirty, assume
> we need one more pass and don't scan the bitmap.

We had a similar optimization on rhel5.  We only synched the bitmap each
time that we passed over address 0.  And each time that we called
ram_save_remaining().

Trivial optimization for ram_save_remaining is to:
- maintain the number of pages that are dirty (we only modify the bitmap
 in two places, trivial to inc/dec the total number of dirty pages).
- on ram_save_live only sync bitmap if we are about to exit from the
  loop.  If pages dirty are still bigger that the one we need to go to
  non-save life, it makes no sense to sync.

> Another optimization is to stop the count when we reach the target;
> instead of ram_save_remaining() have a ram_save_may_stop() which
> counts the number of dirty bits until it reaches target_nr_dirty or
> exhausts the bitmap.

The problem here is that the guest wants to continue running, and continues
dirtying pages.  So, no obvious place where to stop counting.  Or am I
missing something?

Later, Juan.
Anthony Liguori - Dec. 1, 2010, 3:55 p.m.
On 12/01/2010 09:51 AM, Juan Quintela wrote:
> Avi Kivity<avi@redhat.com>  wrote:
>    
>> On 11/30/2010 04:46 PM, Juan Quintela wrote:
>>      
>>> Anthony Liguori<anthony@codemonkey.ws>   wrote:
>>>        
>>>>   On 11/23/2010 05:03 PM, Juan Quintela wrote:
>>>>          
>>>>>   From: Juan Quintela<quintela@trasno.org>
>>>>>
>>>>>   Calculate the number of dirty pages takes a lot on hosts with lots
>>>>>   of memory.  Just maintain how many pages are dirty.  Only sync bitmaps
>>>>>   if number is small enough.
>>>>>
>>>>>            
>>>>   There needs to be numbers that justify this as part of the commit message.
>>>>          
>>> They are on patch 0/6.
>>>
>>> Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over
>>> it in each ram_save_live() call is too onerous.
>>>        
>> It's not so huge.  It's scaled down by a factor of 8 * 4096 = 32K.  So
>> it's a 2MB bitmap.  If kept as a bitmap and accessed in longs, it can
>> be read in less than a millisecond.

BTW, by this logic, even a 1-byte dirty bitmap is only 16mb which can be 
read in less than 16ms so where is the reported 24 minute stall coming from?

Regards,

Anthony Liguori
Juan Quintela - Dec. 1, 2010, 4:25 p.m.
Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 12/01/2010 09:51 AM, Juan Quintela wrote:
>> Avi Kivity<avi@redhat.com>  wrote:
>>    
>>> On 11/30/2010 04:46 PM, Juan Quintela wrote:
>>>      
>>>> Anthony Liguori<anthony@codemonkey.ws>   wrote:
>>>>        
>>>>>   On 11/23/2010 05:03 PM, Juan Quintela wrote:
>>>>>          
>>>>>>   From: Juan Quintela<quintela@trasno.org>
>>>>>>
>>>>>>   Calculate the number of dirty pages takes a lot on hosts with lots
>>>>>>   of memory.  Just maintain how many pages are dirty.  Only sync bitmaps
>>>>>>   if number is small enough.
>>>>>>
>>>>>>            
>>>>>   There needs to be numbers that justify this as part of the commit message.
>>>>>          
>>>> They are on patch 0/6.
>>>>
>>>> Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over
>>>> it in each ram_save_live() call is too onerous.
>>>>        
>>> It's not so huge.  It's scaled down by a factor of 8 * 4096 = 32K.  So
>>> it's a 2MB bitmap.  If kept as a bitmap and accessed in longs, it can
>>> be read in less than a millisecond.
>
> BTW, by this logic, even a 1-byte dirty bitmap is only 16mb which can
> be read in less than 16ms so where is the reported 24 minute stall
> coming from?

a) we read the bitmap more than once
b) the 1ms is based on "we read" it with longs and optimized, just now
   we have to read it by byte and inside the byte.

Later, Juan.

>
> Regards,
>
> Anthony Liguori
Anthony Liguori - Dec. 1, 2010, 4:33 p.m.
On 12/01/2010 10:25 AM, Juan Quintela wrote:
> Anthony Liguori<anthony@codemonkey.ws>  wrote:
>    
>> On 12/01/2010 09:51 AM, Juan Quintela wrote:
>>      
>>> Avi Kivity<avi@redhat.com>   wrote:
>>>
>>>        
>>>> On 11/30/2010 04:46 PM, Juan Quintela wrote:
>>>>
>>>>          
>>>>> Anthony Liguori<anthony@codemonkey.ws>    wrote:
>>>>>
>>>>>            
>>>>>>    On 11/23/2010 05:03 PM, Juan Quintela wrote:
>>>>>>
>>>>>>              
>>>>>>>    From: Juan Quintela<quintela@trasno.org>
>>>>>>>
>>>>>>>    Calculate the number of dirty pages takes a lot on hosts with lots
>>>>>>>    of memory.  Just maintain how many pages are dirty.  Only sync bitmaps
>>>>>>>    if number is small enough.
>>>>>>>
>>>>>>>
>>>>>>>                
>>>>>>    There needs to be numbers that justify this as part of the commit message.
>>>>>>
>>>>>>              
>>>>> They are on patch 0/6.
>>>>>
>>>>> Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over
>>>>> it in each ram_save_live() call is too onerous.
>>>>>
>>>>>            
>>>> It's not so huge.  It's scaled down by a factor of 8 * 4096 = 32K.  So
>>>> it's a 2MB bitmap.  If kept as a bitmap and accessed in longs, it can
>>>> be read in less than a millisecond.
>>>>          
>> BTW, by this logic, even a 1-byte dirty bitmap is only 16mb which can
>> be read in less than 16ms so where is the reported 24 minute stall
>> coming from?
>>      
> a) we read the bitmap more than once
>    

Not in a single iteration which is what the "stall" would consist of.

> b) the 1ms is based on "we read" it with longs and optimized, just now
>     we have to read it by byte and inside the byte.
>    

Byte accesses versus long accesses doesn't turn 16ms into 24 minutes.

Regards,

Anthony Liguori

> Later, Juan.
>
>    
>> Regards,
>>
>> Anthony Liguori
>>
Avi Kivity - Dec. 1, 2010, 4:43 p.m.
On 12/01/2010 06:33 PM, Anthony Liguori wrote:
>>> BTW, by this logic, even a 1-byte dirty bitmap is only 16mb which can
>>> be read in less than 16ms so where is the reported 24 minute stall
>>> coming from?
>> a) we read the bitmap more than once
>
> Not in a single iteration which is what the "stall" would consist of.
>
>> b) the 1ms is based on "we read" it with longs and optimized, just now
>>     we have to read it by byte and inside the byte.
>
> Byte accesses verse long access doesn't turn 16ms into 24 minutes.

We need actual measurements instead of speculations.  Walking the dirty 
bitmap _did_ _not_ introduce any stalls.  It was the qemu mutex, or 
walking kvm data structures in the kernel, or something.  No amount of 
dirty bitmap optimization will fix it.
Anthony Liguori - Dec. 1, 2010, 4:49 p.m.
On 12/01/2010 10:43 AM, Avi Kivity wrote:
> On 12/01/2010 06:33 PM, Anthony Liguori wrote:
>>>> BTW, by this logic, even a 1-byte dirty bitmap is only 16mb which can
>>>> be read in less than 16ms so where is the reported 24 minute stall
>>>> coming from?
>>> a) we read the bitmap more than once
>>
>> Not in a single iteration which is what the "stall" would consist of.
>>
>>> b) the 1ms is based on "we read" it with longs and optimized, just now
>>>     we have to read it by byte and inside the byte.
>>
>> Byte accesses verse long access doesn't turn 16ms into 24 minutes.
>
> We need actual measurements instead of speculations.

Yes, I agree 100%.  I think the place to start is what I suggested in a 
previous note in this thread, we need to measure actual stall time in 
the guest.

Regards,

Anthony Liguori
Avi Kivity - Dec. 1, 2010, 4:52 p.m.
On 12/01/2010 06:49 PM, Anthony Liguori wrote:
>> We need actual measurements instead of speculations.
>
>
> Yes, I agree 100%.  I think the place to start is what I suggested in 
> a previous note in this thread, we need to measure actual stall time 
> in the guest.

I'd actually start at the host.  How much time does 
ioctl(KVM_GET_DIRTY_LOG) take?  What's the percentage of time qemu_mutex 
is held?
Anthony Liguori - Dec. 1, 2010, 4:56 p.m.
On 12/01/2010 10:52 AM, Avi Kivity wrote:
> On 12/01/2010 06:49 PM, Anthony Liguori wrote:
>>> We need actual measurements instead of speculations.
>>
>>
>> Yes, I agree 100%.  I think the place to start is what I suggested in 
>> a previous note in this thread, we need to measure actual stall time 
>> in the guest.
>
> I'd actually start at the host.  How much time does 
> ioctl(KVM_GET_DIRTY_LOG) take?  What's the percentage of time 
> qemu_mutex is held?

The question is, what really are the symptoms of the problem.  It's not 
necessarily a bad thing if KVM_GET_DIRTY_LOG takes a long while 
qemu_mutex is held.

Is the problem that the monitor responds slowly?  Is the problem that 
the guest isn't consistently getting execution time?  Is the proper 
simply that the guest isn't getting enough total execution time?

I think we need to identify exactly what the problem is before we look 
for sources.

Regards,

Anthony Liguori
Avi Kivity - Dec. 1, 2010, 5:01 p.m.
On 12/01/2010 06:56 PM, Anthony Liguori wrote:
> On 12/01/2010 10:52 AM, Avi Kivity wrote:
>> On 12/01/2010 06:49 PM, Anthony Liguori wrote:
>>>> We need actual measurements instead of speculations.
>>>
>>>
>>> Yes, I agree 100%.  I think the place to start is what I suggested 
>>> in a previous note in this thread, we need to measure actual stall 
>>> time in the guest.
>>
>> I'd actually start at the host.  How much time does 
>> ioctl(KVM_GET_DIRTY_LOG) take?  What's the percentage of time 
>> qemu_mutex is held?
>
> The question is, what really are the symptoms of the problem.  It's 
> not necessarily a bad thing if KVM_GET_DIRTY_LOG takes a long while 
> qemu_mutex is held.

Whether or not qemu_mutex is held, long KVM_GET_DIRTY_LOG runtimes are 
bad, since they are a lower bound on your downtime.  And 
KVM_GET_DIRTY_LOG does a lot of work, and invokes 
synchronize_srcu_expedited(), which can be very slow.

>
> Is the problem that the monitor responds slowly?  Is the problem that 
> the guest isn't consistently getting execution time?  Is the proper 
> simply that the guest isn't getting enough total execution time?

All three can happen if qemu_mutex is held too long.  The third can 
happen for other reasons (like KVM_GET_DIRTY_LOG holding the mmu 
spinlock for too long, can fix with O(1) write protection).

>
> I think we need to identify exactly what the problem is before we look 
> for sources.
>
Anthony Liguori - Dec. 1, 2010, 5:05 p.m.
On 12/01/2010 11:01 AM, Avi Kivity wrote:
> On 12/01/2010 06:56 PM, Anthony Liguori wrote:
>> On 12/01/2010 10:52 AM, Avi Kivity wrote:
>>> On 12/01/2010 06:49 PM, Anthony Liguori wrote:
>>>>> We need actual measurements instead of speculations.
>>>>
>>>>
>>>> Yes, I agree 100%.  I think the place to start is what I suggested 
>>>> in a previous note in this thread, we need to measure actual stall 
>>>> time in the guest.
>>>
>>> I'd actually start at the host.  How much time does 
>>> ioctl(KVM_GET_DIRTY_LOG) take?  What's the percentage of time 
>>> qemu_mutex is held?
>>
>> The question is, what really are the symptoms of the problem.  It's 
>> not necessarily a bad thing if KVM_GET_DIRTY_LOG takes a long while 
>> qemu_mutex is held.
>
> Whether or not qemu_mutex is held, long KVM_GET_DIRTY_LONG runtimes 
> are bad, since they are a lower bound on your downtime.  And 
> KVM_GET_DIRTY_LOG does a lot of work, and invokes 
> synchronize_srcu_expedited(), which can be very slow.

That's fine, and you're right, it's a useful thing to do, but this 
series originated because of a problem and we ought to make sure we 
capture what the actual problem is.  That's not to say we shouldn't 
improve things that could stand to be improved.

>>
>> Is the problem that the monitor responds slowly?  Is the problem that 
>> the guest isn't consistently getting execution time?  Is the proper 
>> simply that the guest isn't getting enough total execution time?
>
> All three can happen if qemu_mutex is held too long.

Right, but I'm starting to think that the root of the problem is not 
that it's being held too long but that it's being held too often.

Regards,

Anthony Liguori

>   The third can happen for other reasons (like KVM_GET_DIRTY_LOG 
> holding the mmu spinlock for too long, can fix with O(1) write 
> protection).
>
>>
>> I think we need to identify exactly what the problem is before we 
>> look for sources.
>>
>
>
Juan Quintela - Dec. 1, 2010, 6:51 p.m.
Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 12/01/2010 11:01 AM, Avi Kivity wrote:
>> On 12/01/2010 06:56 PM, Anthony Liguori wrote:
>>> On 12/01/2010 10:52 AM, Avi Kivity wrote:
>>>> On 12/01/2010 06:49 PM, Anthony Liguori wrote:
>>>>>> We need actual measurements instead of speculations.
>>>>>
>>>>>
>>>>> Yes, I agree 100%.  I think the place to start is what I
>>>>> suggested in a previous note in this thread, we need to measure
>>>>> actual stall time in the guest.
>>>>
>>>> I'd actually start at the host.  How much time does
>>>> ioctl(KVM_GET_DIRTY_LOG) take?  What's the percentage of time
>>>> qemu_mutex is held?
>>>
>>> The question is, what really are the symptoms of the problem.  It's
>>> not necessarily a bad thing if KVM_GET_DIRTY_LOG takes a long while
>>> qemu_mutex is held.
>>
>> Whether or not qemu_mutex is held, long KVM_GET_DIRTY_LONG runtimes
>> are bad, since they are a lower bound on your downtime.  And
>> KVM_GET_DIRTY_LOG does a lot of work, and invokes
>> synchronize_srcu_expedited(), which can be very slow.
>
> That's fine, and you're right, it's a useful thing to do, but this
> series originated because of a problem and we ought to make sure we
> capture what the actual problem is.  That's not to say we shouldn't
> improve things that could stand to be improved.
>
>>>
>>> Is the problem that the monitor responds slowly?  Is the problem
>>> that the guest isn't consistently getting execution time?  Is the
>>> proper simply that the guest isn't getting enough total execution
>>> time?
>>
>> All three can happen if qemu_mutex is held too long.
>
> Right, but I'm starting to think that the root of the problem is not
> that it's being held too long but that it's being held too often.

Ok, I tested yesterday dropping qemu_mutex on ram_save_block (crude
thing, just qemu_mutex_unlock_iothread(); loop ;
qemu_mutex_lock_iothread();

As requested by Anthony, I tested on the guest to see how big stalls
were.  Code is:

        while (1) {
                if (gettimeofday(&t0, NULL) != 0)
                        perror("gettimeofday 1");
                if (usleep(100) != 0)
                        perror("usleep");
                if (gettimeofday(&t1, NULL) != 0)
                        perror("gettimeofday 2");
                t1.tv_usec -= t0.tv_usec;
                if (t1.tv_usec < 0) {
                        t1.tv_usec += 1000000;
                        t1.tv_sec--;
                }
                t1.tv_sec -= t0.tv_sec;

                if (t1.tv_sec || t1.tv_usec > 5000)
                        printf("delay of %ld\n", t1.tv_sec * 1000000 + t1.tv_usec);
        }

I tried in a guest that is completely idle with 8vcpus.  on idle, only
some stalls in the 5-8ms happens (as expected).

(this is after my series).

As soon as I start migration, we got several stalls in the 15-200ms
range.  Notice that stalls are not bigger because I limit the time that
qemu_mutex is held on the iothread to 50ms each time.

doing the crude qemu_mutex drop on ram_save_live, means that these
ministalls got way smaller, in the 10-15ms range (some rare at 20ms).

And then we have an stall of around 120ms during the non-live part of
the migration.  I can't find where this stall comes from (i.e. saving
all the rest of pages and normal sections take much less time).  But on
the other hand, I have no instrumentation yet to measure how long it
takes to move to the other host and restart there.

So, we are still not there, but now we have only a single 120ms stall on
the guest, versus the 1-4 seconds ones that we used to have.

I don't have access to this machines until next week, so I am spending
this week implementing the ideas given on this thread.

Later, Juan.

Patch

diff --git a/arch_init.c b/arch_init.c
index b463798..c2bc144 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -176,20 +176,7 @@  static uint64_t bytes_transferred;

 static uint64_t ram_save_remaining(void)
 {
-    RAMBlock *block;
-    uint64_t count = 0;
-
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
-        ram_addr_t addr;
-        for (addr = block->offset; addr < block->offset + block->length;
-             addr += TARGET_PAGE_SIZE) {
-            if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) {
-                count++;
-            }
-        }
-    }
-
-    return count;
+    return ram_list.dirty_pages;
 }

 uint64_t ram_bytes_remaining(void)
diff --git a/cpu-all.h b/cpu-all.h
index 30ae17d..5dc27c6 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -874,6 +874,7 @@  typedef struct RAMBlock {

 typedef struct RAMList {
     uint8_t *phys_dirty;
+    uint64_t dirty_pages;
     QLIST_HEAD(ram, RAMBlock) blocks;
 } RAMList;
 extern RAMList ram_list;
@@ -922,6 +923,9 @@  static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,

 static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
 {
+    if (!cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG))
+        ram_list.dirty_pages++;
+
     ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] = 0xff;
 }

@@ -942,6 +946,9 @@  static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start,
     mask = ~dirty_flags;
     p = ram_list.phys_dirty + (start >> TARGET_PAGE_BITS);
     for (i = 0; i < len; i++) {
+        if (cpu_physical_memory_get_dirty(start + i * TARGET_PAGE_SIZE,
+                                          MIGRATION_DIRTY_FLAG & dirty_flags))
+            ram_list.dirty_pages--;
         p[i] &= mask;
     }
 }
diff --git a/exec.c b/exec.c
index f5b2386..ca2506e 100644
--- a/exec.c
+++ b/exec.c
@@ -2866,6 +2866,7 @@  ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
                                        last_ram_offset() >> TARGET_PAGE_BITS);
     memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS),
            0xff, size >> TARGET_PAGE_BITS);
+    ram_list.dirty_pages += size >> TARGET_PAGE_BITS;

     if (kvm_enabled())
         kvm_setup_guest_memory(new_block->host, size);