Message ID | 411b3f89b6b79b429686bd1b1fa188ba55b6c6f7.1290552026.git.quintela@redhat.com |
---|---|
State | New |
Headers | show |
On 11/23/2010 05:03 PM, Juan Quintela wrote: > From: Juan Quintela<quintela@trasno.org> > > Calculate the number of dirty pages takes a lot on hosts with lots > of memory. Just maintain how many pages are dirty. Only sync bitmaps > if number is small enough. > There needs to be numbers that justify this as part of the commit message. This only works for KVM because it happens to use set_dirty() whenever it dirties memory. I don't think will work with TCG. I also think that the fundamental problem is the kernel dirty bitmap. Perhaps it should return a multiple level table instead of a simple linear bitmap. Regards, Anthony Liguori > Signed-off-by: Juan Quintela<quintela@trasno.org> > Signed-off-by: Juan Quintela<quintela@redhat.com> > --- > arch_init.c | 15 +-------------- > cpu-all.h | 7 +++++++ > exec.c | 1 + > 3 files changed, 9 insertions(+), 14 deletions(-) > > diff --git a/arch_init.c b/arch_init.c > index b463798..c2bc144 100644 > --- a/arch_init.c > +++ b/arch_init.c > @@ -176,20 +176,7 @@ static uint64_t bytes_transferred; > > static uint64_t ram_save_remaining(void) > { > - RAMBlock *block; > - uint64_t count = 0; > - > - QLIST_FOREACH(block,&ram_list.blocks, next) { > - ram_addr_t addr; > - for (addr = block->offset; addr< block->offset + block->length; > - addr += TARGET_PAGE_SIZE) { > - if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) { > - count++; > - } > - } > - } > - > - return count; > + return ram_list.dirty_pages; > } > > uint64_t ram_bytes_remaining(void) > diff --git a/cpu-all.h b/cpu-all.h > index 30ae17d..5dc27c6 100644 > --- a/cpu-all.h > +++ b/cpu-all.h > @@ -874,6 +874,7 @@ typedef struct RAMBlock { > > typedef struct RAMList { > uint8_t *phys_dirty; > + uint64_t dirty_pages; > QLIST_HEAD(ram, RAMBlock) blocks; > } RAMList; > extern RAMList ram_list; > @@ -922,6 +923,9 @@ static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, > > static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) > { > + if 
(!cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) > + ram_list.dirty_pages++; > + > ram_list.phys_dirty[addr>> TARGET_PAGE_BITS] = 0xff; > } > > @@ -942,6 +946,9 @@ static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start, > mask = ~dirty_flags; > p = ram_list.phys_dirty + (start>> TARGET_PAGE_BITS); > for (i = 0; i< len; i++) { > + if (cpu_physical_memory_get_dirty(start + i * TARGET_PAGE_SIZE, > + MIGRATION_DIRTY_FLAG& dirty_flags)) > + ram_list.dirty_pages--; > p[i]&= mask; > } > } > diff --git a/exec.c b/exec.c > index f5b2386..ca2506e 100644 > --- a/exec.c > +++ b/exec.c > @@ -2866,6 +2866,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, > last_ram_offset()>> TARGET_PAGE_BITS); > memset(ram_list.phys_dirty + (new_block->offset>> TARGET_PAGE_BITS), > 0xff, size>> TARGET_PAGE_BITS); > + ram_list.dirty_pages += size>> TARGET_PAGE_BITS; > > if (kvm_enabled()) > kvm_setup_guest_memory(new_block->host, size); >
Anthony Liguori <anthony@codemonkey.ws> wrote: > On 11/23/2010 05:03 PM, Juan Quintela wrote: >> From: Juan Quintela<quintela@trasno.org> >> >> Calculate the number of dirty pages takes a lot on hosts with lots >> of memory. Just maintain how many pages are dirty. Only sync bitmaps >> if number is small enough. >> > > There needs to be numbers that justify this as part of the commit message. They are on patch 0/6. Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over it in each ram_save_live() call is too onerous. > This only works for KVM because it happens to use set_dirty() whenever > it dirties memory. I don't think will work with TCG. The TCG is already broken with respect to migration. Nothing else sets that bitmap nowadays. > I also think that the fundamental problem is the kernel dirty bitmap. > Perhaps it should return a multiple level table instead of a simple > linear bitmap. The KVM interface is suboptimal, no doubt here. But the bigger problem is the qemu implementation. Having a byte array where it only uses 2 bits is wasteful at least. And having to transform the array forth<->back with kvm is worse still. Longer term my plan is: - make kvm return the number of dirty pages somehow (i.e. no need to calculate them) - split bitmaps, the migration bitmap only needs to be handled during migration, and the CODE bitmap only for TCG. - the interface with kvm would better be something like an array of pages or an array of (pages, count). But I need to measure if there are any differences/improvements with one or the other. We were discussing yesterday what made ram_live_block() stay on top of the profiles, it was this. This makes ram_save_live() disappear from profiles. Later, Juan. 
> Regards, > > Anthony Liguori > >> Signed-off-by: Juan Quintela<quintela@trasno.org> >> Signed-off-by: Juan Quintela<quintela@redhat.com> >> --- >> arch_init.c | 15 +-------------- >> cpu-all.h | 7 +++++++ >> exec.c | 1 + >> 3 files changed, 9 insertions(+), 14 deletions(-) >> >> diff --git a/arch_init.c b/arch_init.c >> index b463798..c2bc144 100644 >> --- a/arch_init.c >> +++ b/arch_init.c >> @@ -176,20 +176,7 @@ static uint64_t bytes_transferred; >> >> static uint64_t ram_save_remaining(void) >> { >> - RAMBlock *block; >> - uint64_t count = 0; >> - >> - QLIST_FOREACH(block,&ram_list.blocks, next) { >> - ram_addr_t addr; >> - for (addr = block->offset; addr< block->offset + block->length; >> - addr += TARGET_PAGE_SIZE) { >> - if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) { >> - count++; >> - } >> - } >> - } >> - >> - return count; >> + return ram_list.dirty_pages; >> } >> >> uint64_t ram_bytes_remaining(void) >> diff --git a/cpu-all.h b/cpu-all.h >> index 30ae17d..5dc27c6 100644 >> --- a/cpu-all.h >> +++ b/cpu-all.h >> @@ -874,6 +874,7 @@ typedef struct RAMBlock { >> >> typedef struct RAMList { >> uint8_t *phys_dirty; >> + uint64_t dirty_pages; >> QLIST_HEAD(ram, RAMBlock) blocks; >> } RAMList; >> extern RAMList ram_list; >> @@ -922,6 +923,9 @@ static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, >> >> static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) >> { >> + if (!cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) >> + ram_list.dirty_pages++; >> + >> ram_list.phys_dirty[addr>> TARGET_PAGE_BITS] = 0xff; >> } >> >> @@ -942,6 +946,9 @@ static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start, >> mask = ~dirty_flags; >> p = ram_list.phys_dirty + (start>> TARGET_PAGE_BITS); >> for (i = 0; i< len; i++) { >> + if (cpu_physical_memory_get_dirty(start + i * TARGET_PAGE_SIZE, >> + MIGRATION_DIRTY_FLAG& dirty_flags)) >> + ram_list.dirty_pages--; >> p[i]&= mask; >> } >> } >> diff --git a/exec.c b/exec.c 
>> index f5b2386..ca2506e 100644 >> --- a/exec.c >> +++ b/exec.c >> @@ -2866,6 +2866,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, >> last_ram_offset()>> TARGET_PAGE_BITS); >> memset(ram_list.phys_dirty + (new_block->offset>> TARGET_PAGE_BITS), >> 0xff, size>> TARGET_PAGE_BITS); >> + ram_list.dirty_pages += size>> TARGET_PAGE_BITS; >> >> if (kvm_enabled()) >> kvm_setup_guest_memory(new_block->host, size); >>
On 11/30/2010 04:46 PM, Juan Quintela wrote: > Anthony Liguori<anthony@codemonkey.ws> wrote: > > On 11/23/2010 05:03 PM, Juan Quintela wrote: > >> From: Juan Quintela<quintela@trasno.org> > >> > >> Calculate the number of dirty pages takes a lot on hosts with lots > >> of memory. Just maintain how many pages are dirty. Only sync bitmaps > >> if number is small enough. > >> > > > > There needs to be numbers that justify this as part of the commit message. > > They are on patch 0/6. > > Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over > it in each ram_save_live() call is too onerous. It's not so huge. It's scaled down by a factor of 8 * 4096 = 32K. So it's a 2MB bitmap. If kept as a bitmap and accessed in longs, it can be read in less than a millisecond. An optimization can be to look at the previous ram_save_live (which had to walk the bitmap). If old_nr_dirty > 4*target_nr_dirty, assume we need one more pass and don't scan the bitmap. Another optimization is to stop the count when we reach the target; instead of ram_save_remaining() have a ram_save_may_stop() which counts the number of dirty bits until it reaches target_nr_dirty or exhausts the bitmap.
Avi Kivity <avi@redhat.com> wrote: > On 11/30/2010 04:46 PM, Juan Quintela wrote: >> Anthony Liguori<anthony@codemonkey.ws> wrote: >> > On 11/23/2010 05:03 PM, Juan Quintela wrote: >> >> From: Juan Quintela<quintela@trasno.org> >> >> >> >> Calculate the number of dirty pages takes a lot on hosts with lots >> >> of memory. Just maintain how many pages are dirty. Only sync bitmaps >> >> if number is small enough. >> >> >> > >> > There needs to be numbers that justify this as part of the commit message. >> >> They are on patch 0/6. >> >> Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over >> it in each ram_save_live() call is too onerous. > > It's not so huge. It's scaled down by a factor of 8 * 4096 = 32K. So > it's a 2MB bitmap. If kept as a bitmap and accessed in longs, it can > be read in less than a millisecond. Going to understand why we need the other bitmaps for kvm. Basically we have: - CODE bit: kvm doesn't care - VGA bit: not really sure how much we care. My understanding is that the bitmap for the VGA doesn't need to be as big as all memory. - MIGRATION bit: we care O:-) Now, we are setting the MIGRATION bit in three ways: - kvm (we sync with that) - vhost net: it uses a different mechanism, not shared with kvm - *write[blw]. My understanding is that we care about these ones. Do we really care? We are also syncing too many bitmaps, we can do a bit less syncing. > An optimization can be to look at the previous ram_save_live (which > had to walk the bitmap). If old_nr_dirty > 4*target_nr_dirty, assume > we need one more pass and don't scan the bitmap. We had a similar optimization on rhel5. We only synced the bitmap each time that we passed over address 0. And each time that we called ram_save_remaining(). A trivial optimization for ram_save_remaining is to: - maintain the number of pages that are dirty (we only modify the bitmap in two places, trivial to inc/dec the total number of dirty pages). 
- on ram_save_live only sync the bitmap if we are about to exit from the loop. If the number of dirty pages is still bigger than the one we need to go to the non-live phase, it makes no sense to sync. > Another optimization is to stop the count when we reach the target; > instead of ram_save_remaining() have a ram_save_may_stop() which > counts the number of dirty bits until it reaches target_nr_dirty or > exhausts the bitmap. The problem here is that the guest wants to continue running, and continues dirtying pages. So, there is no obvious place where to stop counting. Or am I missing something? Later, Juan.
On 12/01/2010 09:51 AM, Juan Quintela wrote: > Avi Kivity<avi@redhat.com> wrote: > >> On 11/30/2010 04:46 PM, Juan Quintela wrote: >> >>> Anthony Liguori<anthony@codemonkey.ws> wrote: >>> >>>> On 11/23/2010 05:03 PM, Juan Quintela wrote: >>>> >>>>> From: Juan Quintela<quintela@trasno.org> >>>>> >>>>> Calculate the number of dirty pages takes a lot on hosts with lots >>>>> of memory. Just maintain how many pages are dirty. Only sync bitmaps >>>>> if number is small enough. >>>>> >>>>> >>>> There needs to be numbers that justify this as part of the commit message. >>>> >>> They are on patch 0/6. >>> >>> Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over >>> it in each ram_save_live() call is too onerous. >>> >> It's not so huge. It's scaled down by a factor of 8 * 4096 = 32K. So >> it's a 2MB bitmap. If kept as a bitmap and accessed in longs, it can >> be read in less than a millisecond. BTW, by this logic, even a 1-byte dirty bitmap is only 16mb which can be read in less than 16ms so where is the reported 24 minute stall coming from? Regards, Anthony Liguori
Anthony Liguori <anthony@codemonkey.ws> wrote: > On 12/01/2010 09:51 AM, Juan Quintela wrote: >> Avi Kivity<avi@redhat.com> wrote: >> >>> On 11/30/2010 04:46 PM, Juan Quintela wrote: >>> >>>> Anthony Liguori<anthony@codemonkey.ws> wrote: >>>> >>>>> On 11/23/2010 05:03 PM, Juan Quintela wrote: >>>>> >>>>>> From: Juan Quintela<quintela@trasno.org> >>>>>> >>>>>> Calculate the number of dirty pages takes a lot on hosts with lots >>>>>> of memory. Just maintain how many pages are dirty. Only sync bitmaps >>>>>> if number is small enough. >>>>>> >>>>>> >>>>> There needs to be numbers that justify this as part of the commit message. >>>>> >>>> They are on patch 0/6. >>>> >>>> Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over >>>> it in each ram_save_live() call is too onerous. >>>> >>> It's not so huge. It's scaled down by a factor of 8 * 4096 = 32K. So >>> it's a 2MB bitmap. If kept as a bitmap and accessed in longs, it can >>> be read in less than a millisecond. > > BTW, by this logic, even a 1-byte dirty bitmap is only 16mb which can > be read in less than 16ms so where is the reported 24 minute stall > coming from? a) we read the bitmap more than once b) the 1ms is based on "we read" it with longs and optimized, just now we have to read it by byte and inside the byte. Later, Juan. > > Regards, > > Anthony Liguori
On 12/01/2010 10:25 AM, Juan Quintela wrote: > Anthony Liguori<anthony@codemonkey.ws> wrote: > >> On 12/01/2010 09:51 AM, Juan Quintela wrote: >> >>> Avi Kivity<avi@redhat.com> wrote: >>> >>> >>>> On 11/30/2010 04:46 PM, Juan Quintela wrote: >>>> >>>> >>>>> Anthony Liguori<anthony@codemonkey.ws> wrote: >>>>> >>>>> >>>>>> On 11/23/2010 05:03 PM, Juan Quintela wrote: >>>>>> >>>>>> >>>>>>> From: Juan Quintela<quintela@trasno.org> >>>>>>> >>>>>>> Calculate the number of dirty pages takes a lot on hosts with lots >>>>>>> of memory. Just maintain how many pages are dirty. Only sync bitmaps >>>>>>> if number is small enough. >>>>>>> >>>>>>> >>>>>>> >>>>>> There needs to be numbers that justify this as part of the commit message. >>>>>> >>>>>> >>>>> They are on patch 0/6. >>>>> >>>>> Additionally, with 64GB of RAM, this bitmap is HUGE, having to walk over >>>>> it in each ram_save_live() call is too onerous. >>>>> >>>>> >>>> It's not so huge. It's scaled down by a factor of 8 * 4096 = 32K. So >>>> it's a 2MB bitmap. If kept as a bitmap and accessed in longs, it can >>>> be read in less than a millisecond. >>>> >> BTW, by this logic, even a 1-byte dirty bitmap is only 16mb which can >> be read in less than 16ms so where is the reported 24 minute stall >> coming from? >> > a) we read the bitmap more than once > Not in a single iteration which is what the "stall" would consist of. > b) the 1ms is based on "we read" it with longs and optimized, just now > we have to read it by byte and inside the byte. > Byte accesses verse long access doesn't turn 16ms into 24 minutes. Regards, Anthony Liguori > Later, Juan. > > >> Regards, >> >> Anthony Liguori >>
On 12/01/2010 06:33 PM, Anthony Liguori wrote: >>> BTW, by this logic, even a 1-byte dirty bitmap is only 16mb which can >>> be read in less than 16ms so where is the reported 24 minute stall >>> coming from? >> a) we read the bitmap more than once > > Not in a single iteration which is what the "stall" would consist of. > >> b) the 1ms is based on "we read" it with longs and optimized, just now >> we have to read it by byte and inside the byte. > > Byte accesses verse long access doesn't turn 16ms into 24 minutes. We need actual measurements instead of speculations. Walking the dirty bitmap _did_ _not_ introduce any stalls. It was the qemu mutex, or walking kvm data structures in the kernel, or something. No amount of dirty bitmap optimization will fix it.
On 12/01/2010 10:43 AM, Avi Kivity wrote: > On 12/01/2010 06:33 PM, Anthony Liguori wrote: >>>> BTW, by this logic, even a 1-byte dirty bitmap is only 16mb which can >>>> be read in less than 16ms so where is the reported 24 minute stall >>>> coming from? >>> a) we read the bitmap more than once >> >> Not in a single iteration which is what the "stall" would consist of. >> >>> b) the 1ms is based on "we read" it with longs and optimized, just now >>> we have to read it by byte and inside the byte. >> >> Byte accesses verse long access doesn't turn 16ms into 24 minutes. > > We need actual measurements instead of speculations. Yes, I agree 100%. I think the place to start is what I suggested in a previous note in this thread, we need to measure actual stall time in the guest. Regards, Anthony Liguori
On 12/01/2010 06:49 PM, Anthony Liguori wrote: >> We need actual measurements instead of speculations. > > > Yes, I agree 100%. I think the place to start is what I suggested in > a previous note in this thread, we need to measure actual stall time > in the guest. I'd actually start at the host. How much time does ioctl(KVM_GET_DIRTY_LOG) take? What's the percentage of time qemu_mutex is held?
On 12/01/2010 10:52 AM, Avi Kivity wrote: > On 12/01/2010 06:49 PM, Anthony Liguori wrote: >>> We need actual measurements instead of speculations. >> >> >> Yes, I agree 100%. I think the place to start is what I suggested in >> a previous note in this thread, we need to measure actual stall time >> in the guest. > > I'd actually start at the host. How much time does > ioctl(KVM_GET_DIRTY_LOG) take? What's the percentage of time > qemu_mutex is held? The question is, what really are the symptoms of the problem. It's not necessarily a bad thing if KVM_GET_DIRTY_LOG takes a long while qemu_mutex is held. Is the problem that the monitor responds slowly? Is the problem that the guest isn't consistently getting execution time? Is the proper simply that the guest isn't getting enough total execution time? I think we need to identify exactly what the problem is before we look for sources. Regards, Anthony Liguori
On 12/01/2010 06:56 PM, Anthony Liguori wrote: > On 12/01/2010 10:52 AM, Avi Kivity wrote: >> On 12/01/2010 06:49 PM, Anthony Liguori wrote: >>>> We need actual measurements instead of speculations. >>> >>> >>> Yes, I agree 100%. I think the place to start is what I suggested >>> in a previous note in this thread, we need to measure actual stall >>> time in the guest. >> >> I'd actually start at the host. How much time does >> ioctl(KVM_GET_DIRTY_LOG) take? What's the percentage of time >> qemu_mutex is held? > > The question is, what really are the symptoms of the problem. It's > not necessarily a bad thing if KVM_GET_DIRTY_LOG takes a long while > qemu_mutex is held. Whether or not qemu_mutex is held, long KVM_GET_DIRTY_LONG runtimes are bad, since they are a lower bound on your downtime. And KVM_GET_DIRTY_LOG does a lot of work, and invokes synchronize_srcu_expedited(), which can be very slow. > > Is the problem that the monitor responds slowly? Is the problem that > the guest isn't consistently getting execution time? Is the proper > simply that the guest isn't getting enough total execution time? All three can happen if qemu_mutex is held too long. The third can happen for other reasons (like KVM_GET_DIRTY_LOG holding the mmu spinlock for too long, can fix with O(1) write protection). > > I think we need to identify exactly what the problem is before we look > for sources. >
On 12/01/2010 11:01 AM, Avi Kivity wrote: > On 12/01/2010 06:56 PM, Anthony Liguori wrote: >> On 12/01/2010 10:52 AM, Avi Kivity wrote: >>> On 12/01/2010 06:49 PM, Anthony Liguori wrote: >>>>> We need actual measurements instead of speculations. >>>> >>>> >>>> Yes, I agree 100%. I think the place to start is what I suggested >>>> in a previous note in this thread, we need to measure actual stall >>>> time in the guest. >>> >>> I'd actually start at the host. How much time does >>> ioctl(KVM_GET_DIRTY_LOG) take? What's the percentage of time >>> qemu_mutex is held? >> >> The question is, what really are the symptoms of the problem. It's >> not necessarily a bad thing if KVM_GET_DIRTY_LOG takes a long while >> qemu_mutex is held. > > Whether or not qemu_mutex is held, long KVM_GET_DIRTY_LONG runtimes > are bad, since they are a lower bound on your downtime. And > KVM_GET_DIRTY_LOG does a lot of work, and invokes > synchronize_srcu_expedited(), which can be very slow. That's fine, and you're right, it's a useful thing to do, but this series originated because of a problem and we ought to make sure we capture what the actual problem is. That's not to say we shouldn't improve things that could stand to be improved. >> >> Is the problem that the monitor responds slowly? Is the problem that >> the guest isn't consistently getting execution time? Is the proper >> simply that the guest isn't getting enough total execution time? > > All three can happen if qemu_mutex is held too long. Right, but I'm starting to think that the root of the problem is not that it's being held too long but that it's being held too often. Regards, Anthony Liguori > The third can happen for other reasons (like KVM_GET_DIRTY_LOG > holding the mmu spinlock for too long, can fix with O(1) write > protection). > >> >> I think we need to identify exactly what the problem is before we >> look for sources. >> > >
Anthony Liguori <anthony@codemonkey.ws> wrote: > On 12/01/2010 11:01 AM, Avi Kivity wrote: >> On 12/01/2010 06:56 PM, Anthony Liguori wrote: >>> On 12/01/2010 10:52 AM, Avi Kivity wrote: >>>> On 12/01/2010 06:49 PM, Anthony Liguori wrote: >>>>>> We need actual measurements instead of speculations. >>>>> >>>>> >>>>> Yes, I agree 100%. I think the place to start is what I >>>>> suggested in a previous note in this thread, we need to measure >>>>> actual stall time in the guest. >>>> >>>> I'd actually start at the host. How much time does >>>> ioctl(KVM_GET_DIRTY_LOG) take? What's the percentage of time >>>> qemu_mutex is held? >>> >>> The question is, what really are the symptoms of the problem. It's >>> not necessarily a bad thing if KVM_GET_DIRTY_LOG takes a long while >>> qemu_mutex is held. >> >> Whether or not qemu_mutex is held, long KVM_GET_DIRTY_LONG runtimes >> are bad, since they are a lower bound on your downtime. And >> KVM_GET_DIRTY_LOG does a lot of work, and invokes >> synchronize_srcu_expedited(), which can be very slow. > > That's fine, and you're right, it's a useful thing to do, but this > series originated because of a problem and we ought to make sure we > capture what the actual problem is. That's not to say we shouldn't > improve things that could stand to be improved. > >>> >>> Is the problem that the monitor responds slowly? Is the problem >>> that the guest isn't consistently getting execution time? Is the >>> proper simply that the guest isn't getting enough total execution >>> time? >> >> All three can happen if qemu_mutex is held too long. > > Right, but I'm starting to think that the root of the problem is not > that it's being held too long but that it's being held too often. Ok, I tested yesterday dropping qemu_mutex on ram_save_block (crude thing, just qemu_mutex_unlock_iothread(); loop ; qemu_mutex_lock_iothread(); As requested by Anthony, I tested on the guest to see how big stalls were. 
Code is: while (1) { if (gettimeofday(&t0, NULL) != 0) perror("gettimeofday 1"); if (usleep(100) != 0) perror("usleep"); if (gettimeofday(&t1, NULL) != 0) perror("gettimeofday 2"); t1.tv_usec -= t0.tv_usec; if (t1.tv_usec < 0) { t1.tv_usec += 1000000; t1.tv_sec--; } t1.tv_sec -= t0.tv_sec; if (t1.tv_sec || t1.tv_usec > 5000) printf("delay of %ld\n", t1.tv_sec * 1000000 + t1.tv_usec); } I tried in a guest that is completely idle with 8 vcpus. When idle, only some stalls in the 5-8ms range happen (as expected). (this is after my series). As soon as I start migration, we got several stalls in the 15-200ms range. Notice that the stalls are not bigger because I limit the time that qemu_mutex is held on the iothread to 50ms each time. Doing the crude qemu_mutex drop on ram_save_live means that these mini-stalls got way smaller, into the 10-15ms range (some rare ones at 20ms). And then we have a stall of around 120ms during the non-live part of the migration. I can't find where this stall comes from (i.e. saving all the rest of the pages and the normal sections takes much less time). But on the other hand, I have no instrumentation yet to measure how long it takes to move to the other host and restart there. So, we are still not there, but now we have only a single 120ms stall on the guest, versus the 1-4 second ones that we used to have. I don't have access to these machines until next week, so I am spending this week implementing the ideas given on this thread. Later, Juan.
diff --git a/arch_init.c b/arch_init.c index b463798..c2bc144 100644 --- a/arch_init.c +++ b/arch_init.c @@ -176,20 +176,7 @@ static uint64_t bytes_transferred; static uint64_t ram_save_remaining(void) { - RAMBlock *block; - uint64_t count = 0; - - QLIST_FOREACH(block, &ram_list.blocks, next) { - ram_addr_t addr; - for (addr = block->offset; addr < block->offset + block->length; - addr += TARGET_PAGE_SIZE) { - if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) { - count++; - } - } - } - - return count; + return ram_list.dirty_pages; } uint64_t ram_bytes_remaining(void) diff --git a/cpu-all.h b/cpu-all.h index 30ae17d..5dc27c6 100644 --- a/cpu-all.h +++ b/cpu-all.h @@ -874,6 +874,7 @@ typedef struct RAMBlock { typedef struct RAMList { uint8_t *phys_dirty; + uint64_t dirty_pages; QLIST_HEAD(ram, RAMBlock) blocks; } RAMList; extern RAMList ram_list; @@ -922,6 +923,9 @@ static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) { + if (!cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) + ram_list.dirty_pages++; + ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] = 0xff; } @@ -942,6 +946,9 @@ static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start, mask = ~dirty_flags; p = ram_list.phys_dirty + (start >> TARGET_PAGE_BITS); for (i = 0; i < len; i++) { + if (cpu_physical_memory_get_dirty(start + i * TARGET_PAGE_SIZE, + MIGRATION_DIRTY_FLAG & dirty_flags)) + ram_list.dirty_pages--; p[i] &= mask; } } diff --git a/exec.c b/exec.c index f5b2386..ca2506e 100644 --- a/exec.c +++ b/exec.c @@ -2866,6 +2866,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, last_ram_offset() >> TARGET_PAGE_BITS); memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS), 0xff, size >> TARGET_PAGE_BITS); + ram_list.dirty_pages += size >> TARGET_PAGE_BITS; if (kvm_enabled()) kvm_setup_guest_memory(new_block->host, size);