diff mbox series

memcontrol03: Account for process size in cgroup allocation

Message ID 20250505105310.15072-1-mdoucha@suse.cz
State Needs Review / ACK
Headers show
Series memcontrol03: Account for process size in cgroup allocation | expand

Checks

Context Check Description
ltpci/debian_stable_s390x-linux-gnu-gcc_s390x success success
ltpci/debian_stable_powerpc64le-linux-gnu-gcc_ppc64el success success
ltpci/debian_stable_aarch64-linux-gnu-gcc_arm64 success success
ltpci/debian_stable_gcc success success
ltpci/debian_stable_gcc success success
ltpci/debian_testing_gcc success success
ltpci/alpine_latest_gcc success success
ltpci/opensuse-leap_latest_gcc success success
ltpci/ubuntu_jammy_gcc success success
ltpci/opensuse-archive_42-2_gcc success success
ltpci/debian_oldstable_clang success success
ltpci/fedora_latest_clang success success
ltpci/quay-io-centos-centos_stream9_gcc success success
ltpci/debian_testing_clang success success
ltpci/ubuntu_bionic_gcc success success
ltpci/debian_oldstable_gcc success success

Commit Message

Martin Doucha May 5, 2025, 10:53 a.m. UTC
The first trunk_G allocation has 2MB safety margin to avoid triggering
OOM killer. However, on systems with 64K pagesize, this may not be enough.
Account for process size as reported by cgroup memory stats before
allocating memory in child processes.

Signed-off-by: Martin Doucha <mdoucha@suse.cz>
---
 .../kernel/controllers/memcg/memcontrol03.c   | 20 +++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

Comments

Li Wang May 6, 2025, 6:59 a.m. UTC | #1
On Mon, May 5, 2025 at 6:53 PM Martin Doucha <mdoucha@suse.cz> wrote:

> The first trunk_G allocation has 2MB safety margin to avoid triggering
> OOM killer. However, on systems with 64K pagesize, this may not be enough.
> Account for process size as reported by cgroup memory stats before
> allocating memory in child processes.
>
> Signed-off-by: Martin Doucha <mdoucha@suse.cz>
>

Reviewed-by: Li Wang <liwang@redhat.com>

---
>  .../kernel/controllers/memcg/memcontrol03.c   | 20 +++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/testcases/kernel/controllers/memcg/memcontrol03.c
> b/testcases/kernel/controllers/memcg/memcontrol03.c
> index b5bbb9954..d2e489ad6 100644
> --- a/testcases/kernel/controllers/memcg/memcontrol03.c
> +++ b/testcases/kernel/controllers/memcg/memcontrol03.c
> @@ -94,17 +94,23 @@ static void cleanup_sub_groups(void)
>  }
>
>  static void alloc_anon_in_child(const struct tst_cg_group *const cg,
> -                               const size_t size, const int expect_oom)
> +       size_t size, const int expect_oom)
>  {
>         int status;
>         const pid_t pid = SAFE_FORK();
> +       size_t cgmem;
>
>         if (!pid) {
>                 SAFE_CG_PRINTF(cg, "cgroup.procs", "%d", getpid());
> +               SAFE_CG_SCANF(cg, "memory.current", "%zu", &cgmem);
> +               size = size > cgmem ? size - cgmem : 0;
>
>                 tst_res(TINFO, "Child %d in %s: Allocating anon: %"PRIdPTR,
>                 getpid(), tst_cg_group_name(cg), size);
> -               alloc_anon(size);
> +
> +               if (size)
> +                       alloc_anon(size);
> +
>                 exit(0);
>         }
>
> @@ -128,9 +134,10 @@ static void alloc_anon_in_child(const struct
> tst_cg_group *const cg,
>  }
>
>  static void alloc_pagecache_in_child(const struct tst_cg_group *const cg,
> -                                    const size_t size)
> +       size_t size)
>  {
>         const pid_t pid = SAFE_FORK();
> +       size_t cgmem;
>
>         if (pid) {
>                 TST_CHECKPOINT_WAIT(CHILD_IDLE);
> @@ -138,10 +145,15 @@ static void alloc_pagecache_in_child(const struct
> tst_cg_group *const cg,
>         }
>
>         SAFE_CG_PRINTF(cg, "cgroup.procs", "%d", getpid());
> +       SAFE_CG_SCANF(cg, "memory.current", "%zu", &cgmem);
> +       size = size > cgmem ? size - cgmem : 0;
>
>         tst_res(TINFO, "Child %d in %s: Allocating pagecache: %"PRIdPTR,
>                 getpid(), tst_cg_group_name(cg), size);
> -       alloc_pagecache(fd, size);
> +
> +       if (size)
> +               alloc_pagecache(fd, size);
> +
>         SAFE_FSYNC(fd);
>
>         TST_CHECKPOINT_WAKE(CHILD_IDLE);
> --
> 2.49.0
>
>
> --
> Mailing list info: https://lists.linux.it/listinfo/ltp
>
>
Cyril Hrubis May 7, 2025, 2:23 p.m. UTC | #2
Hi!
> The first trunk_G allocation has 2MB safety margin to avoid triggering
> OOM killer. However, on systems with 64K pagesize, this may not be enough.
> Account for process size as reported by cgroup memory stats before
> allocating memory in child processes.

Is there a reason to keep the 2MB safety after this patch?

Or can we do:

diff --git a/testcases/kernel/controllers/memcg/memcontrol03.c b/testcases/kernel/controllers/memcg/memcontrol03.c
index b5bbb9954..e7f126880 100644
--- a/testcases/kernel/controllers/memcg/memcontrol03.c
+++ b/testcases/kernel/controllers/memcg/memcontrol03.c
@@ -200,7 +200,7 @@ static void test_memcg_min(void)
                sleep(1);
        }

-       alloc_anon_in_child(trunk_cg[G], MB(148), 0);
+       alloc_anon_in_child(trunk_cg[G], MB(150), 0);

        SAFE_CG_SCANF(trunk_cg[B], "memory.current", "%ld", c);
        TST_EXP_EXPR(values_close(c[0], MB(50), 5),

> --- a/testcases/kernel/controllers/memcg/memcontrol03.c
> +++ b/testcases/kernel/controllers/memcg/memcontrol03.c
> @@ -94,17 +94,23 @@ static void cleanup_sub_groups(void)
>  }
>  
>  static void alloc_anon_in_child(const struct tst_cg_group *const cg,
> -				const size_t size, const int expect_oom)
> +	size_t size, const int expect_oom)
>  {
>  	int status;
>  	const pid_t pid = SAFE_FORK();
> +	size_t cgmem;
>  
>  	if (!pid) {
>  		SAFE_CG_PRINTF(cg, "cgroup.procs", "%d", getpid());
> +		SAFE_CG_SCANF(cg, "memory.current", "%zu", &cgmem);
> +		size = size > cgmem ? size - cgmem : 0;

Here we depend on the fact that process memory has been properly
accounted for when it starts running its code. Are you sure that we can
rely on this or does this just happen to work?
Martin Doucha May 7, 2025, 3:36 p.m. UTC | #3
On 07. 05. 25 16:23, Cyril Hrubis wrote:
> Hi!
>> The first trunk_G allocation has 2MB safety margin to avoid triggering
>> OOM killer. However, on systems with 64K pagesize, this may not be enough.
>> Account for process size as reported by cgroup memory stats before
>> allocating memory in child processes.
> 
> Is there a reason to keep the 2MB safety after this patch?

I'd say there's no reason to remove it. On x86_64, the patch will 
increase the safety margin by only 256KB and that memory is already 
allocated to the cgroup. If we remove the safety margin, any additional 
buffer allocation in glibc may trigger OOM.

> Or can we do:
> 
> diff --git a/testcases/kernel/controllers/memcg/memcontrol03.c b/testcases/kernel/controllers/memcg/memcontrol03.c
> index b5bbb9954..e7f126880 100644
> --- a/testcases/kernel/controllers/memcg/memcontrol03.c
> +++ b/testcases/kernel/controllers/memcg/memcontrol03.c
> @@ -200,7 +200,7 @@ static void test_memcg_min(void)
>                  sleep(1);
>          }
> 
> -       alloc_anon_in_child(trunk_cg[G], MB(148), 0);
> +       alloc_anon_in_child(trunk_cg[G], MB(150), 0);
> 
>          SAFE_CG_SCANF(trunk_cg[B], "memory.current", "%ld", c);
>          TST_EXP_EXPR(values_close(c[0], MB(50), 5),
> 
>> --- a/testcases/kernel/controllers/memcg/memcontrol03.c
>> +++ b/testcases/kernel/controllers/memcg/memcontrol03.c
>> @@ -94,17 +94,23 @@ static void cleanup_sub_groups(void)
>>   }
>>   
>>   static void alloc_anon_in_child(const struct tst_cg_group *const cg,
>> -				const size_t size, const int expect_oom)
>> +	size_t size, const int expect_oom)
>>   {
>>   	int status;
>>   	const pid_t pid = SAFE_FORK();
>> +	size_t cgmem;
>>   
>>   	if (!pid) {
>>   		SAFE_CG_PRINTF(cg, "cgroup.procs", "%d", getpid());
>> +		SAFE_CG_SCANF(cg, "memory.current", "%zu", &cgmem);
>> +		size = size > cgmem ? size - cgmem : 0;
> 
> Here we depend on the fact that process memory has been properly
> accounted for when it starts running its code. Are you sure that we can
> rely on this or does this just happen to work?

Actually, my commit message is slightly misleading because the existing 
process memory does not get migrated to the new cgroup. But the cgroup 
itself may already have non-zero memory usage even when empty, likely 
for internal kernel structures. Any new allocations of kernel structures 
should also be finished when the process migration completes. So unless 
the migration behavior changes in the near future, we can rely on this.

This sentence in the commit message:
"Account for process size as reported by cgroup memory stats before..."
should be changed to:
"Account for existing cgroup memory usage before..."
Cyril Hrubis May 9, 2025, 9:21 a.m. UTC | #4
Hi!
> > Here we depend on the fact that process memory has been properly
> > accounted for when it starts running its code. Are you sure that we can
> > rely on this or does this just happen to work?
> 
> Actually, my commit message is slightly misleading because the existing 
> process memory does not get migrated to the new cgroup. But the cgroup 
> itself may already have non-zero memory usage even when empty, likely 
> for internal kernel structures. Any new allocations of kernel structures 
> should also be finished when the process migration completes. So unless 
> the migration behavior changes in the near future, we can rely on this.

I suppose that the cgroup is charged for the memory it needs to track
the resources, that makes sense. I wonder if we can read that once at
the start of the test when we create the cgroups and use that value
later on.

> This sentence in the commit message:
> "Account for process size as reported by cgroup memory stats before..."
> should be changed to:
> "Account for existing cgroup memory usage before..."

That sounds better. I suppose that we can get this merged with this
change.

Reviewed-by: Cyril Hrubis <chrubis@suse.cz>
Martin Doucha May 9, 2025, 9:40 a.m. UTC | #5
On 09. 05. 25 11:21, Cyril Hrubis wrote:
> Hi!
>>> Here we depend on the fact that process memory has been properly
>>> accounted for when it starts running its code. Are you sure that we can
>>> rely on this or does this just happen to work?
>>
>> Actually, my commit message is slightly misleading because the existing
>> process memory does not get migrated to the new cgroup. But the cgroup
>> itself may already have non-zero memory usage even when empty, likely
>> for internal kernel structures. Any new allocations of kernel structures
>> should also be finished when the process migration completes. So unless
>> the migration behavior changes in the near future, we can rely on this.
> 
> I suppose that the cgroup is charged for the memory it needs to track
> the resources, that makes sense. I wonder if we can read that once at
> the start of the test when we create the cgroups and use that value
> later on.

Unfortunately, we can't. I've tested this and memory.current can change 
a lot during the first process migration.
Cyril Hrubis May 9, 2025, 10:01 a.m. UTC | #6
Hi!
> >>> Here we depend on the fact that process memory has been properly
> >>> accounted for when it starts running its code. Are you sure that we can
> >>> rely on this or does this just happen to work?
> >>
> >> Actually, my commit message is slightly misleading because the existing
> >> process memory does not get migrated to the new cgroup. But the cgroup
> >> itself may already have non-zero memory usage even when empty, likely
> >> for internal kernel structures. Any new allocations of kernel structures
> >> should also be finished when the process migration completes. So unless
> >> the migration behavior changes in the near future, we can rely on this.
> > 
> > I suppose that the cgroup is charged for the memory it needs to track
> > the resources, that makes sense. I wonder if we can read that once at
> > the start of the test when we create the cgroups and use that value
> > later on.
> 
> Unfortunately, we can't. I've tested this and memory.current can change 
> a lot during the first process migration.

That does sound strange. @Michal any idea what happens here?
Martin Doucha May 9, 2025, 10:11 a.m. UTC | #7
On 09. 05. 25 12:01, Cyril Hrubis wrote:
> Hi!
>>>>> Here we depend on the fact that process memory has been properly
>>>>> accounted for when it starts running its code. Are you sure that we can
>>>>> rely on this or does this just happen to work?
>>>>
>>>> Actually, my commit message is slightly misleading because the existing
>>>> process memory does not get migrated to the new cgroup. But the cgroup
>>>> itself may already have non-zero memory usage even when empty, likely
>>>> for internal kernel structures. Any new allocations of kernel structures
>>>> should also be finished when the process migration completes. So unless
>>>> the migration behavior changes in the near future, we can rely on this.
>>>
>>> I suppose that the cgroup is charged for the memory it needs to track
>>> the resources, that makes sense. I wonder if we can read that once at
>>> the start of the test when we create the cgroups and use that value
>>> later on.
>>
>> Unfortunately, we can't. I've tested this and memory.current can change
>> a lot during the first process migration.
> 
> That does sound strange. @Michal any idea what happens here?

My guess is that the kernel structure allocation is just lazy. The 
cgroup memory counter usually starts at zero. Then it allocates 
structures on the first process migration and keeps them until the 
cgroup gets destroyed.
Martin Doucha May 9, 2025, 2:41 p.m. UTC | #8
On 09. 05. 25 16:11, Michal Koutný wrote:
> On Fri, May 09, 2025 at 12:01:47PM +0200, Cyril Hrubis <chrubis@suse.cz> wrote:
>>> Unfortunately, we can't. I've tested this and memory.current can change
>>> a lot during the first process migration.
>>
>> That does sound strange. @Michal any idea what happens here?
> 
> [Process migrates itself (echo 0 >$target_cg/cgroup.procs) or] it's
> otherwise active during the migration?
> 
> (Also, the apparent increase of memory.current may be amplified because
> of MEMCG_CHARGE_BATCH even with initially small allocation.)

The process migrates itself:
SAFE_CG_PRINTF(cg, "cgroup.procs", "%d", getpid());

We're dealing with an issue where the test has 2MB safety margin from 
triggering OOM but immediately after the process migrates itself into 
the cgroup on PPC64LE, memory.current will be ~4MB and the process will 
randomly trigger OOM anyway. So we're increasing the safety margin by 
whatever memory.current says immediately after the migration.
diff mbox series

Patch

diff --git a/testcases/kernel/controllers/memcg/memcontrol03.c b/testcases/kernel/controllers/memcg/memcontrol03.c
index b5bbb9954..d2e489ad6 100644
--- a/testcases/kernel/controllers/memcg/memcontrol03.c
+++ b/testcases/kernel/controllers/memcg/memcontrol03.c
@@ -94,17 +94,23 @@  static void cleanup_sub_groups(void)
 }
 
 static void alloc_anon_in_child(const struct tst_cg_group *const cg,
-				const size_t size, const int expect_oom)
+	size_t size, const int expect_oom)
 {
 	int status;
 	const pid_t pid = SAFE_FORK();
+	size_t cgmem;
 
 	if (!pid) {
 		SAFE_CG_PRINTF(cg, "cgroup.procs", "%d", getpid());
+		SAFE_CG_SCANF(cg, "memory.current", "%zu", &cgmem);
+		size = size > cgmem ? size - cgmem : 0;
 
 		tst_res(TINFO, "Child %d in %s: Allocating anon: %"PRIdPTR,
 		getpid(), tst_cg_group_name(cg), size);
-		alloc_anon(size);
+
+		if (size)
+			alloc_anon(size);
+
 		exit(0);
 	}
 
@@ -128,9 +134,10 @@  static void alloc_anon_in_child(const struct tst_cg_group *const cg,
 }
 
 static void alloc_pagecache_in_child(const struct tst_cg_group *const cg,
-				     const size_t size)
+	size_t size)
 {
 	const pid_t pid = SAFE_FORK();
+	size_t cgmem;
 
 	if (pid) {
 		TST_CHECKPOINT_WAIT(CHILD_IDLE);
@@ -138,10 +145,15 @@  static void alloc_pagecache_in_child(const struct tst_cg_group *const cg,
 	}
 
 	SAFE_CG_PRINTF(cg, "cgroup.procs", "%d", getpid());
+	SAFE_CG_SCANF(cg, "memory.current", "%zu", &cgmem);
+	size = size > cgmem ? size - cgmem : 0;
 
 	tst_res(TINFO, "Child %d in %s: Allocating pagecache: %"PRIdPTR,
 		getpid(), tst_cg_group_name(cg), size);
-	alloc_pagecache(fd, size);
+
+	if (size)
+		alloc_pagecache(fd, size);
+
 	SAFE_FSYNC(fd);
 
 	TST_CHECKPOINT_WAKE(CHILD_IDLE);