diff mbox series

[5/5] linux-user: Do not align brk with host page size

Message ID 20230731080317.112658-6-akihiko.odaki@daynix.com
State New
Headers show
Series linux-user: brk/mmap fixes | expand

Commit Message

Akihiko Odaki July 31, 2023, 8:03 a.m. UTC
do_brk() minimizes calls into target_mmap() by aligning the address
with host page size, which is potentially larger than the target page
size. However, the current implementation of this optimization has two
bugs:

- The start of brk is rounded up with the host page size while brk
  advertises an address aligned with the target page size as the
  beginning of brk. This makes the beginning of brk unmapped.
- Content clearing after mapping is flawed. The size to clear is
  specified as HOST_PAGE_ALIGN(brk_page) - brk_page, but brk_page is
  aligned with the host page size so it is always zero.

This optimization actually has no practical benefit. It makes difference
when brk() is called multiple times with values in a range of the host
page size. However, sophisticated memory allocators try to avoid to
make such frequent brk() calls. For example, glibc 2.37 calls brk() to
shrink the heap only when there is a room more than 128 KiB. It is
rare to have a page size larger than 128 KiB if it happens.

Let's remove the optimization to fix the bugs and make the code simpler.

Fixes: 86f04735ac ("linux-user: Fix brk() to release pages")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1616
Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
---
 linux-user/elfload.c |  4 ++--
 linux-user/syscall.c | 54 ++++++++++----------------------------------
 2 files changed, 14 insertions(+), 44 deletions(-)

Comments

Helge Deller July 31, 2023, 3:51 p.m. UTC | #1
On 7/31/23 10:03, Akihiko Odaki wrote:
> do_brk() minimizes calls into target_mmap() by aligning the address
> with host page size, which is potentially larger than the target page
> size.

Keep in mind, that host page size can be smaller than target page size too.
That's the reason why host brk (brk_page) and target brk (target_brk)
needs to be tracked individually.
So, it's not an optimization, but required.
Btw, I think we have been there before with the idea of
just keeping track of host pages...

> However, the current implementation of this optimization has two
> bugs:
>
> - The start of brk is rounded up with the host page size while brk
>    advertises an address aligned with the target page size as the
>    beginning of brk. This makes the beginning of brk unmapped.
> - Content clearing after mapping is flawed. The size to clear is
>    specified as HOST_PAGE_ALIGN(brk_page) - brk_page, but brk_page is
>    aligned with the host page size so it is always zero.
>
> This optimization actually has no practical benefit. It makes difference
> when brk() is called multiple times with values in a range of the host
> page size. However, sophisticated memory allocators try to avoid to
> make such frequent brk() calls. For example, glibc 2.37 calls brk() to
> shrink the heap only when there is a room more than 128 KiB. It is
> rare to have a page size larger than 128 KiB if it happens.
>
> Let's remove the optimization to fix the bugs and make the code simpler.
>
> Fixes: 86f04735ac ("linux-user: Fix brk() to release pages")
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1616
> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> ---
>   linux-user/elfload.c |  4 ++--
>   linux-user/syscall.c | 54 ++++++++++----------------------------------
>   2 files changed, 14 insertions(+), 44 deletions(-)
>
> diff --git a/linux-user/elfload.c b/linux-user/elfload.c
> index 861ec07abc..2aee2298ec 100644
> --- a/linux-user/elfload.c
> +++ b/linux-user/elfload.c
> @@ -3678,8 +3678,8 @@ int load_elf_binary(struct linux_binprm *bprm, struct image_info *info)
>        * to mmap pages in this space.
>        */
>       if (info->reserve_brk) {
> -        abi_ulong start_brk = HOST_PAGE_ALIGN(info->brk);
> -        abi_ulong end_brk = HOST_PAGE_ALIGN(info->brk + info->reserve_brk);
> +        abi_ulong start_brk = TARGET_PAGE_ALIGN(info->brk);
> +        abi_ulong end_brk = TARGET_PAGE_ALIGN(info->brk + info->reserve_brk);

In my patchset I removed the reserve_brk stuff...

>           target_munmap(start_brk, end_brk - start_brk);
>       }
>
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index ebdc8c144c..475260b7ce 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -802,81 +802,51 @@ static inline int host_to_target_sock_type(int host_type)
>   }
>
>   static abi_ulong target_brk, initial_target_brk;
> -static abi_ulong brk_page;

with that you loose the ability to track the brk address on host.


>   void target_set_brk(abi_ulong new_brk)
>   {
>       target_brk = TARGET_PAGE_ALIGN(new_brk);
>       initial_target_brk = target_brk;
> -    brk_page = HOST_PAGE_ALIGN(target_brk);
>   }
>
>   /* do_brk() must return target values and target errnos. */
>   abi_long do_brk(abi_ulong brk_val)
>   {
>       abi_long mapped_addr;
> -    abi_ulong new_alloc_size;
> -    abi_ulong new_brk, new_host_brk_page;
> +    abi_ulong new_brk;
> +    abi_ulong old_brk;
>
>       /* brk pointers are always untagged */
>
> -    /* return old brk value if brk_val unchanged */
> -    if (brk_val == target_brk) {
> -        return target_brk;
> -    }
> -
>       /* do not allow to shrink below initial brk value */
>       if (brk_val < initial_target_brk) {
>           return target_brk;
>       }
>
>       new_brk = TARGET_PAGE_ALIGN(brk_val);
> -    new_host_brk_page = HOST_PAGE_ALIGN(brk_val);
> +    old_brk = TARGET_PAGE_ALIGN(target_brk);
>
> -    /* brk_val and old target_brk might be on the same page */
> -    if (new_brk == TARGET_PAGE_ALIGN(target_brk)) {
> -        /* empty remaining bytes in (possibly larger) host page */
> -        memset(g2h_untagged(new_brk), 0, new_host_brk_page - new_brk);

you can't drop that.
guest could have clobbered the upper parts, e.g. when it first mapped
all (e.g. 8k), wrote to it, then released upper half of the host page (4k) and
then remaps the upper 4k again.
In that case we need to ensure that the upper 4k gets cleaned. On a physical
machine the kernel would have requested a new page, but here we are stuck
with the bigger host page which we can't simply release.

Helge

> +    /* new and old target_brk might be on the same page */
> +    if (new_brk == old_brk) {
>           target_brk = brk_val;
>           return target_brk;
>       }
>
>       /* Release heap if necesary */
> -    if (new_brk < target_brk) {
> -        /* empty remaining bytes in (possibly larger) host page */
> -        memset(g2h_untagged(new_brk), 0, new_host_brk_page - new_brk);
> -
> -        /* free unused host pages and set new brk_page */
> -        target_munmap(new_host_brk_page, brk_page - new_host_brk_page);
> -        brk_page = new_host_brk_page;
> +    if (new_brk < old_brk) {
> +        target_munmap(new_brk, old_brk - new_brk);
>
>           target_brk = brk_val;
>           return target_brk;
>       }
>
> -    if (new_host_brk_page > brk_page) {
> -        new_alloc_size = new_host_brk_page - brk_page;
> -        mapped_addr = target_mmap(brk_page, new_alloc_size,
> -                                  PROT_READ | PROT_WRITE,
> -                                  MAP_FIXED_NOREPLACE | MAP_ANON | MAP_PRIVATE,
> -                                  0, 0);
> -    } else {
> -        new_alloc_size = 0;
> -        mapped_addr = brk_page;
> -    }
> -
> -    if (mapped_addr == brk_page) {
> -        /* Heap contents are initialized to zero, as for anonymous
> -         * mapped pages.  Technically the new pages are already
> -         * initialized to zero since they *are* anonymous mapped
> -         * pages, however we have to take care with the contents that
> -         * come from the remaining part of the previous page: it may
> -         * contains garbage data due to a previous heap usage (grown
> -         * then shrunken).  */
> -        memset(g2h_untagged(brk_page), 0, HOST_PAGE_ALIGN(brk_page) - brk_page);
> +    mapped_addr = target_mmap(old_brk, new_brk - old_brk,
> +                              PROT_READ | PROT_WRITE,
> +                              MAP_FIXED_NOREPLACE | MAP_ANON | MAP_PRIVATE,
> +                              0, 0);
>
> +    if (mapped_addr == old_brk) {
>           target_brk = brk_val;
> -        brk_page = new_host_brk_page;
>           return target_brk;
>       }
>
Akihiko Odaki July 31, 2023, 10:43 p.m. UTC | #2
On 2023/08/01 0:51, Helge Deller wrote:
> On 7/31/23 10:03, Akihiko Odaki wrote:
>> do_brk() minimizes calls into target_mmap() by aligning the address
>> with host page size, which is potentially larger than the target page
>> size.
> 
> Keep in mind, that host page size can be smaller than target page size too.
> That's the reason why host brk (brk_page) and target brk (target_brk)
> needs to be tracked individually.
> So, it's not an optimization, but required.
> Btw, I think we have been there before with the idea of
> just keeping track of host pages...

It does not matter even if the host page size is smaller or larger. 
do_brk() solely relies on target_mmap(), which works with target page size.

> 
>> However, the current implementation of this optimization has two
>> bugs:
>>
>> - The start of brk is rounded up with the host page size while brk
>>    advertises an address aligned with the target page size as the
>>    beginning of brk. This makes the beginning of brk unmapped.
>> - Content clearing after mapping is flawed. The size to clear is
>>    specified as HOST_PAGE_ALIGN(brk_page) - brk_page, but brk_page is
>>    aligned with the host page size so it is always zero.
>>
>> This optimization actually has no practical benefit. It makes difference
>> when brk() is called multiple times with values in a range of the host
>> page size. However, sophisticated memory allocators try to avoid to
>> make such frequent brk() calls. For example, glibc 2.37 calls brk() to
>> shrink the heap only when there is a room more than 128 KiB. It is
>> rare to have a page size larger than 128 KiB if it happens.
>>
>> Let's remove the optimization to fix the bugs and make the code simpler.
>>
>> Fixes: 86f04735ac ("linux-user: Fix brk() to release pages")
>> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1616
>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
>> ---
>>   linux-user/elfload.c |  4 ++--
>>   linux-user/syscall.c | 54 ++++++++++----------------------------------
>>   2 files changed, 14 insertions(+), 44 deletions(-)
>>
>> diff --git a/linux-user/elfload.c b/linux-user/elfload.c
>> index 861ec07abc..2aee2298ec 100644
>> --- a/linux-user/elfload.c
>> +++ b/linux-user/elfload.c
>> @@ -3678,8 +3678,8 @@ int load_elf_binary(struct linux_binprm *bprm, 
>> struct image_info *info)
>>        * to mmap pages in this space.
>>        */
>>       if (info->reserve_brk) {
>> -        abi_ulong start_brk = HOST_PAGE_ALIGN(info->brk);
>> -        abi_ulong end_brk = HOST_PAGE_ALIGN(info->brk + 
>> info->reserve_brk);
>> +        abi_ulong start_brk = TARGET_PAGE_ALIGN(info->brk);
>> +        abi_ulong end_brk = TARGET_PAGE_ALIGN(info->brk + 
>> info->reserve_brk);
> 
> In my patchset I removed the reserve_brk stuff...
> 
>>           target_munmap(start_brk, end_brk - start_brk);
>>       }
>>
>> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
>> index ebdc8c144c..475260b7ce 100644
>> --- a/linux-user/syscall.c
>> +++ b/linux-user/syscall.c
>> @@ -802,81 +802,51 @@ static inline int host_to_target_sock_type(int 
>> host_type)
>>   }
>>
>>   static abi_ulong target_brk, initial_target_brk;
>> -static abi_ulong brk_page;
> 
> with that you loose the ability to track the brk address on host.

brk_page actually doesn't track the address on the host. It's just an 
address on the target but aligned with host page size. And the alignment 
is purely for optimization.

> 
> 
>>   void target_set_brk(abi_ulong new_brk)
>>   {
>>       target_brk = TARGET_PAGE_ALIGN(new_brk);
>>       initial_target_brk = target_brk;
>> -    brk_page = HOST_PAGE_ALIGN(target_brk);
>>   }
>>
>>   /* do_brk() must return target values and target errnos. */
>>   abi_long do_brk(abi_ulong brk_val)
>>   {
>>       abi_long mapped_addr;
>> -    abi_ulong new_alloc_size;
>> -    abi_ulong new_brk, new_host_brk_page;
>> +    abi_ulong new_brk;
>> +    abi_ulong old_brk;
>>
>>       /* brk pointers are always untagged */
>>
>> -    /* return old brk value if brk_val unchanged */
>> -    if (brk_val == target_brk) {
>> -        return target_brk;
>> -    }
>> -
>>       /* do not allow to shrink below initial brk value */
>>       if (brk_val < initial_target_brk) {
>>           return target_brk;
>>       }
>>
>>       new_brk = TARGET_PAGE_ALIGN(brk_val);
>> -    new_host_brk_page = HOST_PAGE_ALIGN(brk_val);
>> +    old_brk = TARGET_PAGE_ALIGN(target_brk);
>>
>> -    /* brk_val and old target_brk might be on the same page */
>> -    if (new_brk == TARGET_PAGE_ALIGN(target_brk)) {
>> -        /* empty remaining bytes in (possibly larger) host page */
>> -        memset(g2h_untagged(new_brk), 0, new_host_brk_page - new_brk);
> 
> you can't drop that.
> guest could have clobbered the upper parts, e.g. when it first mapped
> all (e.g. 8k), wrote to it, then released upper half of the host page 
> (4k) and
> then remaps the upper 4k again.
> In that case we need to ensure that the upper 4k gets cleaned. On a 
> physical
> machine the kernel would have requested a new page, but here we are stuck
> with the bigger host page which we can't simply release.

Such a case should also be handled with target_mmap().

Regards,
Akihiko Odaki
diff mbox series

Patch

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 861ec07abc..2aee2298ec 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -3678,8 +3678,8 @@  int load_elf_binary(struct linux_binprm *bprm, struct image_info *info)
      * to mmap pages in this space.
      */
     if (info->reserve_brk) {
-        abi_ulong start_brk = HOST_PAGE_ALIGN(info->brk);
-        abi_ulong end_brk = HOST_PAGE_ALIGN(info->brk + info->reserve_brk);
+        abi_ulong start_brk = TARGET_PAGE_ALIGN(info->brk);
+        abi_ulong end_brk = TARGET_PAGE_ALIGN(info->brk + info->reserve_brk);
         target_munmap(start_brk, end_brk - start_brk);
     }
 
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index ebdc8c144c..475260b7ce 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -802,81 +802,51 @@  static inline int host_to_target_sock_type(int host_type)
 }
 
 static abi_ulong target_brk, initial_target_brk;
-static abi_ulong brk_page;
 
 void target_set_brk(abi_ulong new_brk)
 {
     target_brk = TARGET_PAGE_ALIGN(new_brk);
     initial_target_brk = target_brk;
-    brk_page = HOST_PAGE_ALIGN(target_brk);
 }
 
 /* do_brk() must return target values and target errnos. */
 abi_long do_brk(abi_ulong brk_val)
 {
     abi_long mapped_addr;
-    abi_ulong new_alloc_size;
-    abi_ulong new_brk, new_host_brk_page;
+    abi_ulong new_brk;
+    abi_ulong old_brk;
 
     /* brk pointers are always untagged */
 
-    /* return old brk value if brk_val unchanged */
-    if (brk_val == target_brk) {
-        return target_brk;
-    }
-
     /* do not allow to shrink below initial brk value */
     if (brk_val < initial_target_brk) {
         return target_brk;
     }
 
     new_brk = TARGET_PAGE_ALIGN(brk_val);
-    new_host_brk_page = HOST_PAGE_ALIGN(brk_val);
+    old_brk = TARGET_PAGE_ALIGN(target_brk);
 
-    /* brk_val and old target_brk might be on the same page */
-    if (new_brk == TARGET_PAGE_ALIGN(target_brk)) {
-        /* empty remaining bytes in (possibly larger) host page */
-        memset(g2h_untagged(new_brk), 0, new_host_brk_page - new_brk);
+    /* new and old target_brk might be on the same page */
+    if (new_brk == old_brk) {
         target_brk = brk_val;
         return target_brk;
     }
 
     /* Release heap if necesary */
-    if (new_brk < target_brk) {
-        /* empty remaining bytes in (possibly larger) host page */
-        memset(g2h_untagged(new_brk), 0, new_host_brk_page - new_brk);
-
-        /* free unused host pages and set new brk_page */
-        target_munmap(new_host_brk_page, brk_page - new_host_brk_page);
-        brk_page = new_host_brk_page;
+    if (new_brk < old_brk) {
+        target_munmap(new_brk, old_brk - new_brk);
 
         target_brk = brk_val;
         return target_brk;
     }
 
-    if (new_host_brk_page > brk_page) {
-        new_alloc_size = new_host_brk_page - brk_page;
-        mapped_addr = target_mmap(brk_page, new_alloc_size,
-                                  PROT_READ | PROT_WRITE,
-                                  MAP_FIXED_NOREPLACE | MAP_ANON | MAP_PRIVATE,
-                                  0, 0);
-    } else {
-        new_alloc_size = 0;
-        mapped_addr = brk_page;
-    }
-
-    if (mapped_addr == brk_page) {
-        /* Heap contents are initialized to zero, as for anonymous
-         * mapped pages.  Technically the new pages are already
-         * initialized to zero since they *are* anonymous mapped
-         * pages, however we have to take care with the contents that
-         * come from the remaining part of the previous page: it may
-         * contains garbage data due to a previous heap usage (grown
-         * then shrunken).  */
-        memset(g2h_untagged(brk_page), 0, HOST_PAGE_ALIGN(brk_page) - brk_page);
+    mapped_addr = target_mmap(old_brk, new_brk - old_brk,
+                              PROT_READ | PROT_WRITE,
+                              MAP_FIXED_NOREPLACE | MAP_ANON | MAP_PRIVATE,
+                              0, 0);
 
+    if (mapped_addr == old_brk) {
         target_brk = brk_val;
-        brk_page = new_host_brk_page;
         return target_brk;
     }