diff mbox series

[PULL,132/136] mem-prealloc: optimize large guest startup

Message ID 1582632454-16491-30-git-send-email-pbonzini@redhat.com
State New
Headers show
Series None | expand

Commit Message

Paolo Bonzini Feb. 25, 2020, 12:07 p.m. UTC
From: bauerchen <bauerchen@tencent.com>

[desc]:
    Large memory VM starts slowly when using -mem-prealloc, and
    there are some areas to optimize in current method;

    1、mmap will be used to alloc threads stack during create page
    clearing threads, and it will attempt mm->mmap_sem for write
    lock, but clearing threads have hold read lock, this competition
    will cause threads createion very slow;

    2、methods of calcuating pages for per threads is not well;if we use
    64 threads to split 160 hugepage,63 threads clear 2page,1 thread
    clear 34 page,so the entire speed is very slow;

    to solve the first problem,we add a mutex in thread function,and
    start all threads when all threads finished createion;
    and the second problem, we spread remainder to other threads,in
    situation that 160 hugepage and 64 threads, there are 32 threads
    clear 3 pages,and 32 threads clear 2 pages.

[test]:
    320G 84c VM start time can be reduced to 10s
    680G 84c VM start time can be reduced to 18s

Signed-off-by: bauerchen <bauerchen@tencent.com>
Reviewed-by: Pan Rui <ruippan@tencent.com>
Reviewed-by: Ivan Ren <ivanren@tencent.com>
[Simplify computation of the number of pages per thread. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/oslib-posix.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

Comments

Laurent Vivier March 16, 2020, 8:42 a.m. UTC | #1
Hi,

a bug has been reported in launchpad for this patch:

[Regression]Powerpc kvm guest unable to start with hugepage backed
            memory
https://bugs.launchpad.net/qemu/+bug/1866962

Thanks,
Laurent

Le 25/02/2020 à 13:07, Paolo Bonzini a écrit :
> From: bauerchen <bauerchen@tencent.com>
> 
> [desc]:
>     Large memory VM starts slowly when using -mem-prealloc, and
>     there are some areas to optimize in current method;
> 
>     1、mmap will be used to alloc threads stack during create page
>     clearing threads, and it will attempt mm->mmap_sem for write
>     lock, but clearing threads have hold read lock, this competition
>     will cause threads createion very slow;
> 
>     2、methods of calcuating pages for per threads is not well;if we use
>     64 threads to split 160 hugepage,63 threads clear 2page,1 thread
>     clear 34 page,so the entire speed is very slow;
> 
>     to solve the first problem,we add a mutex in thread function,and
>     start all threads when all threads finished createion;
>     and the second problem, we spread remainder to other threads,in
>     situation that 160 hugepage and 64 threads, there are 32 threads
>     clear 3 pages,and 32 threads clear 2 pages.
> 
> [test]:
>     320G 84c VM start time can be reduced to 10s
>     680G 84c VM start time can be reduced to 18s
> 
> Signed-off-by: bauerchen <bauerchen@tencent.com>
> Reviewed-by: Pan Rui <ruippan@tencent.com>
> Reviewed-by: Ivan Ren <ivanren@tencent.com>
> [Simplify computation of the number of pages per thread. - Paolo]
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  util/oslib-posix.c | 32 ++++++++++++++++++++++++--------
>  1 file changed, 24 insertions(+), 8 deletions(-)
> 
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 5a291cc..897e8f3 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -76,6 +76,10 @@ static MemsetThread *memset_thread;
>  static int memset_num_threads;
>  static bool memset_thread_failed;
>  
> +static QemuMutex page_mutex;
> +static QemuCond page_cond;
> +static bool threads_created_flag;
> +
>  int qemu_get_thread_id(void)
>  {
>  #if defined(__linux__)
> @@ -403,6 +407,17 @@ static void *do_touch_pages(void *arg)
>      MemsetThread *memset_args = (MemsetThread *)arg;
>      sigset_t set, oldset;
>  
> +    /*
> +     * On Linux, the page faults from the loop below can cause mmap_sem
> +     * contention with allocation of the thread stacks.  Do not start
> +     * clearing until all threads have been created.
> +     */
> +    qemu_mutex_lock(&page_mutex);
> +    while(!threads_created_flag){
> +        qemu_cond_wait(&page_cond, &page_mutex);
> +    }
> +    qemu_mutex_unlock(&page_mutex);
> +
>      /* unblock SIGBUS */
>      sigemptyset(&set);
>      sigaddset(&set, SIGBUS);
> @@ -451,27 +466,28 @@ static inline int get_memset_num_threads(int smp_cpus)
>  static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>                              int smp_cpus)
>  {
> -    size_t numpages_per_thread;
> -    size_t size_per_thread;
> +    size_t numpages_per_thread, leftover;
>      char *addr = area;
>      int i = 0;
>  
>      memset_thread_failed = false;
> +    threads_created_flag = false;
>      memset_num_threads = get_memset_num_threads(smp_cpus);
>      memset_thread = g_new0(MemsetThread, memset_num_threads);
> -    numpages_per_thread = (numpages / memset_num_threads);
> -    size_per_thread = (hpagesize * numpages_per_thread);
> +    numpages_per_thread = numpages / memset_num_threads;
> +    leftover = numpages % memset_num_threads;
>      for (i = 0; i < memset_num_threads; i++) {
>          memset_thread[i].addr = addr;
> -        memset_thread[i].numpages = (i == (memset_num_threads - 1)) ?
> -                                    numpages : numpages_per_thread;
> +        memset_thread[i].numpages = numpages_per_thread + (i < leftover);
>          memset_thread[i].hpagesize = hpagesize;
>          qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
>                             do_touch_pages, &memset_thread[i],
>                             QEMU_THREAD_JOINABLE);
> -        addr += size_per_thread;
> -        numpages -= numpages_per_thread;
> +        addr += memset_thread[i].numpages * hpagesize;
>      }
> +    threads_created_flag = true;
> +    qemu_cond_broadcast(&page_cond);
> +
>      for (i = 0; i < memset_num_threads; i++) {
>          qemu_thread_join(&memset_thread[i].pgthread);
>      }
>
Paolo Bonzini March 16, 2020, 8:45 a.m. UTC | #2
On 16/03/20 09:42, Laurent Vivier wrote:
> Hi,
> 
> a bug has been reported in launchpad for this patch:
> 
> [Regression]Powerpc kvm guest unable to start with hugepage backed
>             memory
> https://bugs.launchpad.net/qemu/+bug/1866962

Indeed, I'm sending the pull request with the fix today.  Sorry for the
breakage.

Paolo
diff mbox series

Patch

diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 5a291cc..897e8f3 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -76,6 +76,10 @@  static MemsetThread *memset_thread;
 static int memset_num_threads;
 static bool memset_thread_failed;
 
+static QemuMutex page_mutex;
+static QemuCond page_cond;
+static bool threads_created_flag;
+
 int qemu_get_thread_id(void)
 {
 #if defined(__linux__)
@@ -403,6 +407,17 @@  static void *do_touch_pages(void *arg)
     MemsetThread *memset_args = (MemsetThread *)arg;
     sigset_t set, oldset;
 
+    /*
+     * On Linux, the page faults from the loop below can cause mmap_sem
+     * contention with allocation of the thread stacks.  Do not start
+     * clearing until all threads have been created.
+     */
+    qemu_mutex_lock(&page_mutex);
+    while(!threads_created_flag){
+        qemu_cond_wait(&page_cond, &page_mutex);
+    }
+    qemu_mutex_unlock(&page_mutex);
+
     /* unblock SIGBUS */
     sigemptyset(&set);
     sigaddset(&set, SIGBUS);
@@ -451,27 +466,28 @@  static inline int get_memset_num_threads(int smp_cpus)
 static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
                             int smp_cpus)
 {
-    size_t numpages_per_thread;
-    size_t size_per_thread;
+    size_t numpages_per_thread, leftover;
     char *addr = area;
     int i = 0;
 
     memset_thread_failed = false;
+    threads_created_flag = false;
     memset_num_threads = get_memset_num_threads(smp_cpus);
     memset_thread = g_new0(MemsetThread, memset_num_threads);
-    numpages_per_thread = (numpages / memset_num_threads);
-    size_per_thread = (hpagesize * numpages_per_thread);
+    numpages_per_thread = numpages / memset_num_threads;
+    leftover = numpages % memset_num_threads;
     for (i = 0; i < memset_num_threads; i++) {
         memset_thread[i].addr = addr;
-        memset_thread[i].numpages = (i == (memset_num_threads - 1)) ?
-                                    numpages : numpages_per_thread;
+        memset_thread[i].numpages = numpages_per_thread + (i < leftover);
         memset_thread[i].hpagesize = hpagesize;
         qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
                            do_touch_pages, &memset_thread[i],
                            QEMU_THREAD_JOINABLE);
-        addr += size_per_thread;
-        numpages -= numpages_per_thread;
+        addr += memset_thread[i].numpages * hpagesize;
     }
+    threads_created_flag = true;
+    qemu_cond_broadcast(&page_cond);
+
     for (i = 0; i < memset_num_threads; i++) {
         qemu_thread_join(&memset_thread[i].pgthread);
     }