Message ID | 1452169208-840-14-git-send-email-zhang.zhanghailiang@huawei.com |
---|---|
State | New |
Headers | show |
* zhanghailiang (zhang.zhanghailiang@huawei.com) wrote: > If we modify VM's RAM (pages) during setup stage after enable write-protect > notification in snapshot thread, the modification action will get stuck because > we only remove the page's write-protect in savevm process, it blocked by itself. > > To fix this bug, we will remove page's write-protect in fault thread during > the setup stage. Besides, we should not try to get global lock after setup stage, > or there maybe an deadlock error. Hmm this complicates things a bit more doesn't it. What's the order of: a) setup b) savings devices c) Being able to transmit the pages? Are these pages that are being modified during setup, being modified as part of the device state save? Dave > > Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> > --- > include/migration/migration.h | 4 ++-- > migration/migration.c | 2 +- > migration/postcopy-ram.c | 17 ++++++++++++++++- > migration/ram.c | 37 +++++++++++++++++++++++++++++++------ > 4 files changed, 50 insertions(+), 10 deletions(-) > > diff --git a/include/migration/migration.h b/include/migration/migration.h > index ef4c071..435de31 100644 > --- a/include/migration/migration.h > +++ b/include/migration/migration.h > @@ -127,7 +127,7 @@ struct MigrationSrcPageRequest { > RAMBlock *rb; > hwaddr offset; > hwaddr len; > - > + uint8_t *pages_copy_addr; > QSIMPLEQ_ENTRY(MigrationSrcPageRequest) next_req; > }; > > @@ -333,7 +333,7 @@ void global_state_store_running(void); > > void flush_page_queue(MigrationState *ms); > int ram_save_queue_pages(MigrationState *ms, const char *rbname, > - ram_addr_t start, ram_addr_t len); > + ram_addr_t start, ram_addr_t len, bool copy_pages); > > PostcopyState postcopy_state_get(void); > /* Set the state and return the old state */ > diff --git a/migration/migration.c b/migration/migration.c > index 3765c3b..bf4c7a1 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -1248,7 +1248,7 @@ static void 
migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, > return; > } > > - if (ram_save_queue_pages(ms, rbname, start, len)) { > + if (ram_save_queue_pages(ms, rbname, start, len, false)) { > mark_source_rp_bad(ms); > } > } > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c > index 61392d3..2cf477d 100644 > --- a/migration/postcopy-ram.c > +++ b/migration/postcopy-ram.c > @@ -543,13 +543,28 @@ static void *postcopy_ram_fault_thread(void *opaque) > MigrationState *ms = container_of(us, MigrationState, > userfault_state); > ret = ram_save_queue_pages(ms, qemu_ram_get_idstr(rb), rb_offset, > - hostpagesize); > + hostpagesize, true); > > if (ret < 0) { > error_report("%s: Save: %"PRIx64 " failed!", > __func__, (uint64_t)msg.arg.pagefault.address); > break; > } > + > + /* Note: In the setup process, snapshot_thread may modify VM's > + * write-protected pages, we should not block it there, or there > + * will be an deadlock error. > + */ > + if (migration_in_setup(ms)) { > + uint64_t host = msg.arg.pagefault.address; > + > + host &= ~(hostpagesize - 1); > + ret = ram_set_pages_wp(host, getpagesize(), true, > + us->userfault_fd); > + if (ret < 0) { > + error_report("Remove page's write-protect failed"); > + } > + } > } > } > trace_postcopy_ram_fault_thread_exit(); > diff --git a/migration/ram.c b/migration/ram.c > index 8656719..747f9aa 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -233,6 +233,7 @@ struct PageSearchStatus { > ram_addr_t offset; > /* Set once we wrap around */ > bool complete_round; > + uint8_t *pages_copy; > }; > typedef struct PageSearchStatus PageSearchStatus; > > @@ -742,7 +743,12 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, > RAMBlock *block = pss->block; > ram_addr_t offset = pss->offset; > > - p = block->host + offset; > + /* If we have a copy of this page, use the backup page first */ > + if (pss->pages_copy) { > + p = pss->pages_copy; > + } else { > + p = block->host + offset; > + } > > 
/* In doubt sent page as normal */ > bytes_xmit = 0; > @@ -926,7 +932,12 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, > RAMBlock *block = pss->block; > ram_addr_t offset = pss->offset; > > - p = block->host + offset; > + /* If we have a copy of this page, use the backup first */ > + if (pss->pages_copy) { > + p = pss->pages_copy; > + } else { > + p = block->host + offset; > + } > > bytes_xmit = 0; > ret = ram_control_save_page(f, block->offset, > @@ -1043,7 +1054,7 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, > * Returns: block (or NULL if none available) > */ > static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, > - ram_addr_t *ram_addr_abs) > + ram_addr_t *ram_addr_abs, uint8 **pages_copy_addr) > { > RAMBlock *block = NULL; > > @@ -1055,7 +1066,7 @@ static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, > *offset = entry->offset; > *ram_addr_abs = (entry->offset + entry->rb->offset) & > TARGET_PAGE_MASK; > - > + *pages_copy_addr = entry->pages_copy_addr; > if (entry->len > TARGET_PAGE_SIZE) { > entry->len -= TARGET_PAGE_SIZE; > entry->offset += TARGET_PAGE_SIZE; > @@ -1086,9 +1097,10 @@ static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, > RAMBlock *block; > ram_addr_t offset; > bool dirty; > + uint8 *pages_backup_addr = NULL; > > do { > - block = unqueue_page(ms, &offset, ram_addr_abs); > + block = unqueue_page(ms, &offset, ram_addr_abs, &pages_backup_addr); > /* > * We're sending this page, and since it's postcopy nothing else > * will dirty it, and we must make sure it doesn't get sent again > @@ -1130,6 +1142,7 @@ static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, > */ > pss->block = block; > pss->offset = offset; > + pss->pages_copy = pages_backup_addr; > } > > return !!block; > @@ -1166,7 +1179,7 @@ void flush_page_queue(MigrationState *ms) > * Return: 0 on success > */ > int ram_save_queue_pages(MigrationState *ms, const char 
*rbname, > - ram_addr_t start, ram_addr_t len) > + ram_addr_t start, ram_addr_t len, bool copy_pages) > { > RAMBlock *ramblock; > > @@ -1206,6 +1219,17 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname, > new_entry->rb = ramblock; > new_entry->offset = start; > new_entry->len = len; > + if (copy_pages) { > + /* Fix me: Better to realize a memory pool */ > + new_entry->pages_copy_addr = g_try_malloc0(len); > + > + if (!new_entry->pages_copy_addr) { > + error_report("%s: Failed to alloc memory", __func__); > + return -1; > + } > + > + memcpy(new_entry->pages_copy_addr, ramblock_ptr(ramblock, start), len); > + } > > memory_region_ref(ramblock->mr); > qemu_mutex_lock(&ms->src_page_req_mutex); > @@ -1342,6 +1366,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, > pss.block = last_seen_block; > pss.offset = last_offset; > pss.complete_round = false; > + pss.pages_copy = NULL; > > if (!pss.block) { > pss.block = QLIST_FIRST_RCU(&ram_list.blocks); > -- > 1.8.3.1 > > -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On 2016/7/14 1:52, Dr. David Alan Gilbert wrote: > * zhanghailiang (zhang.zhanghailiang@huawei.com) wrote: >> If we modify VM's RAM (pages) during setup stage after enable write-protect >> notification in snapshot thread, the modification action will get stuck because >> we only remove the page's write-protect in savevm process, it blocked by itself. >> >> To fix this bug, we will remove page's write-protect in fault thread during >> the setup stage. Besides, we should not try to get global lock after setup stage, >> or there maybe an deadlock error. > > Hmm this complicates things a bit more doesn't it. > What's the order of: > a) setup > b) savings devices > c) Being able to transmit the pages? > > Are these pages that are being modified during setup, being modified > as part of the device state save? > Yes, I'm not sure if the problem still exist or not after exchanging the sequence of 'save devices' and 'enable ram notify'. I'll look into it. Hailaing > Dave > >> >> Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> >> --- >> include/migration/migration.h | 4 ++-- >> migration/migration.c | 2 +- >> migration/postcopy-ram.c | 17 ++++++++++++++++- >> migration/ram.c | 37 +++++++++++++++++++++++++++++++------ >> 4 files changed, 50 insertions(+), 10 deletions(-) >> >> diff --git a/include/migration/migration.h b/include/migration/migration.h >> index ef4c071..435de31 100644 >> --- a/include/migration/migration.h >> +++ b/include/migration/migration.h >> @@ -127,7 +127,7 @@ struct MigrationSrcPageRequest { >> RAMBlock *rb; >> hwaddr offset; >> hwaddr len; >> - >> + uint8_t *pages_copy_addr; >> QSIMPLEQ_ENTRY(MigrationSrcPageRequest) next_req; >> }; >> >> @@ -333,7 +333,7 @@ void global_state_store_running(void); >> >> void flush_page_queue(MigrationState *ms); >> int ram_save_queue_pages(MigrationState *ms, const char *rbname, >> - ram_addr_t start, ram_addr_t len); >> + ram_addr_t start, ram_addr_t len, bool copy_pages); >> >> PostcopyState 
postcopy_state_get(void); >> /* Set the state and return the old state */ >> diff --git a/migration/migration.c b/migration/migration.c >> index 3765c3b..bf4c7a1 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -1248,7 +1248,7 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, >> return; >> } >> >> - if (ram_save_queue_pages(ms, rbname, start, len)) { >> + if (ram_save_queue_pages(ms, rbname, start, len, false)) { >> mark_source_rp_bad(ms); >> } >> } >> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c >> index 61392d3..2cf477d 100644 >> --- a/migration/postcopy-ram.c >> +++ b/migration/postcopy-ram.c >> @@ -543,13 +543,28 @@ static void *postcopy_ram_fault_thread(void *opaque) >> MigrationState *ms = container_of(us, MigrationState, >> userfault_state); >> ret = ram_save_queue_pages(ms, qemu_ram_get_idstr(rb), rb_offset, >> - hostpagesize); >> + hostpagesize, true); >> >> if (ret < 0) { >> error_report("%s: Save: %"PRIx64 " failed!", >> __func__, (uint64_t)msg.arg.pagefault.address); >> break; >> } >> + >> + /* Note: In the setup process, snapshot_thread may modify VM's >> + * write-protected pages, we should not block it there, or there >> + * will be an deadlock error. 
>> + */ >> + if (migration_in_setup(ms)) { >> + uint64_t host = msg.arg.pagefault.address; >> + >> + host &= ~(hostpagesize - 1); >> + ret = ram_set_pages_wp(host, getpagesize(), true, >> + us->userfault_fd); >> + if (ret < 0) { >> + error_report("Remove page's write-protect failed"); >> + } >> + } >> } >> } >> trace_postcopy_ram_fault_thread_exit(); >> diff --git a/migration/ram.c b/migration/ram.c >> index 8656719..747f9aa 100644 >> --- a/migration/ram.c >> +++ b/migration/ram.c >> @@ -233,6 +233,7 @@ struct PageSearchStatus { >> ram_addr_t offset; >> /* Set once we wrap around */ >> bool complete_round; >> + uint8_t *pages_copy; >> }; >> typedef struct PageSearchStatus PageSearchStatus; >> >> @@ -742,7 +743,12 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, >> RAMBlock *block = pss->block; >> ram_addr_t offset = pss->offset; >> >> - p = block->host + offset; >> + /* If we have a copy of this page, use the backup page first */ >> + if (pss->pages_copy) { >> + p = pss->pages_copy; >> + } else { >> + p = block->host + offset; >> + } >> >> /* In doubt sent page as normal */ >> bytes_xmit = 0; >> @@ -926,7 +932,12 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, >> RAMBlock *block = pss->block; >> ram_addr_t offset = pss->offset; >> >> - p = block->host + offset; >> + /* If we have a copy of this page, use the backup first */ >> + if (pss->pages_copy) { >> + p = pss->pages_copy; >> + } else { >> + p = block->host + offset; >> + } >> >> bytes_xmit = 0; >> ret = ram_control_save_page(f, block->offset, >> @@ -1043,7 +1054,7 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, >> * Returns: block (or NULL if none available) >> */ >> static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, >> - ram_addr_t *ram_addr_abs) >> + ram_addr_t *ram_addr_abs, uint8 **pages_copy_addr) >> { >> RAMBlock *block = NULL; >> >> @@ -1055,7 +1066,7 @@ static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, 
>> *offset = entry->offset; >> *ram_addr_abs = (entry->offset + entry->rb->offset) & >> TARGET_PAGE_MASK; >> - >> + *pages_copy_addr = entry->pages_copy_addr; >> if (entry->len > TARGET_PAGE_SIZE) { >> entry->len -= TARGET_PAGE_SIZE; >> entry->offset += TARGET_PAGE_SIZE; >> @@ -1086,9 +1097,10 @@ static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, >> RAMBlock *block; >> ram_addr_t offset; >> bool dirty; >> + uint8 *pages_backup_addr = NULL; >> >> do { >> - block = unqueue_page(ms, &offset, ram_addr_abs); >> + block = unqueue_page(ms, &offset, ram_addr_abs, &pages_backup_addr); >> /* >> * We're sending this page, and since it's postcopy nothing else >> * will dirty it, and we must make sure it doesn't get sent again >> @@ -1130,6 +1142,7 @@ static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, >> */ >> pss->block = block; >> pss->offset = offset; >> + pss->pages_copy = pages_backup_addr; >> } >> >> return !!block; >> @@ -1166,7 +1179,7 @@ void flush_page_queue(MigrationState *ms) >> * Return: 0 on success >> */ >> int ram_save_queue_pages(MigrationState *ms, const char *rbname, >> - ram_addr_t start, ram_addr_t len) >> + ram_addr_t start, ram_addr_t len, bool copy_pages) >> { >> RAMBlock *ramblock; >> >> @@ -1206,6 +1219,17 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname, >> new_entry->rb = ramblock; >> new_entry->offset = start; >> new_entry->len = len; >> + if (copy_pages) { >> + /* Fix me: Better to realize a memory pool */ >> + new_entry->pages_copy_addr = g_try_malloc0(len); >> + >> + if (!new_entry->pages_copy_addr) { >> + error_report("%s: Failed to alloc memory", __func__); >> + return -1; >> + } >> + >> + memcpy(new_entry->pages_copy_addr, ramblock_ptr(ramblock, start), len); >> + } >> >> memory_region_ref(ramblock->mr); >> qemu_mutex_lock(&ms->src_page_req_mutex); >> @@ -1342,6 +1366,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, >> pss.block = last_seen_block; >> pss.offset = 
last_offset; >> pss.complete_round = false; >> + pss.pages_copy = NULL; >> >> if (!pss.block) { >> pss.block = QLIST_FIRST_RCU(&ram_list.blocks); >> -- >> 1.8.3.1 >> >> > -- > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK > > . >
diff --git a/include/migration/migration.h b/include/migration/migration.h index ef4c071..435de31 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -127,7 +127,7 @@ struct MigrationSrcPageRequest { RAMBlock *rb; hwaddr offset; hwaddr len; - + uint8_t *pages_copy_addr; QSIMPLEQ_ENTRY(MigrationSrcPageRequest) next_req; }; @@ -333,7 +333,7 @@ void global_state_store_running(void); void flush_page_queue(MigrationState *ms); int ram_save_queue_pages(MigrationState *ms, const char *rbname, - ram_addr_t start, ram_addr_t len); + ram_addr_t start, ram_addr_t len, bool copy_pages); PostcopyState postcopy_state_get(void); /* Set the state and return the old state */ diff --git a/migration/migration.c b/migration/migration.c index 3765c3b..bf4c7a1 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1248,7 +1248,7 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, return; } - if (ram_save_queue_pages(ms, rbname, start, len)) { + if (ram_save_queue_pages(ms, rbname, start, len, false)) { mark_source_rp_bad(ms); } } diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 61392d3..2cf477d 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -543,13 +543,28 @@ static void *postcopy_ram_fault_thread(void *opaque) MigrationState *ms = container_of(us, MigrationState, userfault_state); ret = ram_save_queue_pages(ms, qemu_ram_get_idstr(rb), rb_offset, - hostpagesize); + hostpagesize, true); if (ret < 0) { error_report("%s: Save: %"PRIx64 " failed!", __func__, (uint64_t)msg.arg.pagefault.address); break; } + + /* Note: In the setup process, snapshot_thread may modify VM's + * write-protected pages, we should not block it there, or there + * will be an deadlock error. 
+ */ + if (migration_in_setup(ms)) { + uint64_t host = msg.arg.pagefault.address; + + host &= ~(hostpagesize - 1); + ret = ram_set_pages_wp(host, getpagesize(), true, + us->userfault_fd); + if (ret < 0) { + error_report("Remove page's write-protect failed"); + } + } } } trace_postcopy_ram_fault_thread_exit(); diff --git a/migration/ram.c b/migration/ram.c index 8656719..747f9aa 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -233,6 +233,7 @@ struct PageSearchStatus { ram_addr_t offset; /* Set once we wrap around */ bool complete_round; + uint8_t *pages_copy; }; typedef struct PageSearchStatus PageSearchStatus; @@ -742,7 +743,12 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, RAMBlock *block = pss->block; ram_addr_t offset = pss->offset; - p = block->host + offset; + /* If we have a copy of this page, use the backup page first */ + if (pss->pages_copy) { + p = pss->pages_copy; + } else { + p = block->host + offset; + } /* In doubt sent page as normal */ bytes_xmit = 0; @@ -926,7 +932,12 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, RAMBlock *block = pss->block; ram_addr_t offset = pss->offset; - p = block->host + offset; + /* If we have a copy of this page, use the backup first */ + if (pss->pages_copy) { + p = pss->pages_copy; + } else { + p = block->host + offset; + } bytes_xmit = 0; ret = ram_control_save_page(f, block->offset, @@ -1043,7 +1054,7 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, * Returns: block (or NULL if none available) */ static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, - ram_addr_t *ram_addr_abs) + ram_addr_t *ram_addr_abs, uint8 **pages_copy_addr) { RAMBlock *block = NULL; @@ -1055,7 +1066,7 @@ static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, *offset = entry->offset; *ram_addr_abs = (entry->offset + entry->rb->offset) & TARGET_PAGE_MASK; - + *pages_copy_addr = entry->pages_copy_addr; if (entry->len > TARGET_PAGE_SIZE) { entry->len 
-= TARGET_PAGE_SIZE; entry->offset += TARGET_PAGE_SIZE; @@ -1086,9 +1097,10 @@ static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, RAMBlock *block; ram_addr_t offset; bool dirty; + uint8 *pages_backup_addr = NULL; do { - block = unqueue_page(ms, &offset, ram_addr_abs); + block = unqueue_page(ms, &offset, ram_addr_abs, &pages_backup_addr); /* * We're sending this page, and since it's postcopy nothing else * will dirty it, and we must make sure it doesn't get sent again @@ -1130,6 +1142,7 @@ static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, */ pss->block = block; pss->offset = offset; + pss->pages_copy = pages_backup_addr; } return !!block; @@ -1166,7 +1179,7 @@ void flush_page_queue(MigrationState *ms) * Return: 0 on success */ int ram_save_queue_pages(MigrationState *ms, const char *rbname, - ram_addr_t start, ram_addr_t len) + ram_addr_t start, ram_addr_t len, bool copy_pages) { RAMBlock *ramblock; @@ -1206,6 +1219,17 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname, new_entry->rb = ramblock; new_entry->offset = start; new_entry->len = len; + if (copy_pages) { + /* Fix me: Better to realize a memory pool */ + new_entry->pages_copy_addr = g_try_malloc0(len); + + if (!new_entry->pages_copy_addr) { + error_report("%s: Failed to alloc memory", __func__); + return -1; + } + + memcpy(new_entry->pages_copy_addr, ramblock_ptr(ramblock, start), len); + } memory_region_ref(ramblock->mr); qemu_mutex_lock(&ms->src_page_req_mutex); @@ -1342,6 +1366,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, pss.block = last_seen_block; pss.offset = last_offset; pss.complete_round = false; + pss.pages_copy = NULL; if (!pss.block) { pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
If we modify the VM's RAM (pages) during the setup stage after enabling write-protect notification in the snapshot thread, the modification action will get stuck, because we only remove the page's write-protect in the savevm process, so it is blocked by itself. To fix this bug, we will remove the page's write-protect in the fault thread during the setup stage. Besides, we should not try to take the global lock after the setup stage, or there may be a deadlock error. Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> --- include/migration/migration.h | 4 ++-- migration/migration.c | 2 +- migration/postcopy-ram.c | 17 ++++++++++++++++- migration/ram.c | 37 +++++++++++++++++++++++++++++++------ 4 files changed, 50 insertions(+), 10 deletions(-)