diff mbox series

[v2] move_pages12: handle errno EBUSY for madvise(..., MADV_SOFT_OFFLINE)

Message ID 20190725033713.8551-1-liwang@redhat.com
State Superseded
Headers show
Series [v2] move_pages12: handle errno EBUSY for madvise(..., MADV_SOFT_OFFLINE) | expand

Commit Message

Li Wang July 25, 2019, 3:37 a.m. UTC
The test#2 is going to simulate the race condition, where move_pages()
and soft offline are called on a single hugetlb page concurrently. But,
it return EBUSY and report FAIL in soft-offline a moving hugepage as a
result sometimes.

The root cause seems a call to page_huge_active return false, then the
soft offline action will failed to isolate hugepage with EBUSY return as
below call trace:

In Parent:
  madvise(..., MADV_SOFT_OFFLINE)
  ...
    soft_offline_page
      soft_offline_in_use_page
        soft_offline_huge_page
          isolate_huge_page
            page_huge_active  --> return false at here

In Child:
  move_pages()
  ...
    do_move_pages
      do_move_pages_to_node
        add_page_for_migration
          isolate_huge_page   --> it has already isolated the hugepage

In this patch, I simply regard the returned EBUSY as a normal situation and
mask it in error handler. Because move_pages is calling add_page_for_migration
to isolate hugepage before do migration, so that's very possible to hit the
collision and return EBUSY on the same page.

Error log:
----------
move_pages12.c:235: INFO: Free RAM 8386256 kB
move_pages12.c:253: INFO: Increasing 2048kB hugepages pool on node 0 to 4
move_pages12.c:263: INFO: Increasing 2048kB hugepages pool on node 1 to 6
move_pages12.c:179: INFO: Allocating and freeing 4 hugepages on node 0
move_pages12.c:179: INFO: Allocating and freeing 4 hugepages on node 1
move_pages12.c:169: PASS: Bug not reproduced
move_pages12.c:81: FAIL: madvise failed: SUCCESS
move_pages12.c:81: FAIL: madvise failed: SUCCESS
move_pages12.c:143: BROK: mmap((nil),4194304,3,262178,-1,0) failed: ENOMEM
move_pages12.c:114: FAIL: move_pages failed: EINVAL

Dmesg:
------
[165435.492170] soft offline: 0x61c00 hugepage failed to isolate
[165435.590252] soft offline: 0x61c00 hugepage failed to isolate
[165435.725493] soft offline: 0x61400 hugepage failed to isolate

Other two fixes in this patch:
 * use TERRNO(but not TTERRNO) to catch madvise(..., MADV_SOFT_OFFLINE) errno
 * go out test when hugepage allocating failed with ENOMEM

Signed-off-by: Li Wang <liwang@redhat.com>
---

Notes:
    v1 --> v2
       attempt to retry the mmap() on ENOMEM.

 .../kernel/syscalls/move_pages/move_pages12.c | 38 ++++++++++++++-----
 1 file changed, 29 insertions(+), 9 deletions(-)

Comments

Cyril Hrubis July 26, 2019, 12:12 p.m. UTC | #1
Hi!
>  static void do_test(unsigned int n)
>  {
> -	int i;
> +	int i, ret, retrys;
> +	void *ptr;
>  	pid_t cpid = -1;
>  	int status;
>  	unsigned int twenty_percent = (tst_timeout_remaining() / 5);
> @@ -135,19 +136,37 @@ static void do_test(unsigned int n)
>  	if (cpid == 0)
>  		do_child(tcases[n].tpages);
>  
> -	for (i = 0; i < LOOPS; i++) {
> -		void *ptr;
> +	for (i = 0; i < LOOPS; retrys = 0, i++) {
> +retry:
> +		ptr = mmap(NULL, tcases[n].tpages * hpsz,
> +				PROT_READ | PROT_WRITE,
> +				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
> +		if (ptr == MAP_FAILED) {
> +			if (errno == ENOMEM) {
> +				if (retrys < LOOPS) {
> +					retrys++;
> +					usleep(1000);
> +					goto retry;
> +				}
> +
> +				if (i > 0) {
> +					tst_res(TINFO, "Test run %d times", i);
> +					goto out;
> +				}
> +			}
> +
> +			tst_brk(TBROK | TERRNO, "Cannot allocate hugepage");
> +		}

Why so complicated?

What about just doing usleep() and continue in case of the failure?


	for (i = 0; i < LOOPS; i++) {
		ptr = mmap(...);
		if (ptr == MAP_FAILED) {
			if (errno == ENOMEM) {
				usleep(1000);
				continue;
			}

			tst_brk(...);
		}

		...
	}


> -		ptr = SAFE_MMAP(NULL, tcases[n].tpages * hpsz,
> -			PROT_READ | PROT_WRITE,
> -			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
>  		if (ptr != addr)
>  			tst_brk(TBROK, "Failed to mmap at desired addr");
>  
>  		memset(addr, 0, tcases[n].tpages * hpsz);
>  
>  		if (tcases[n].offline) {
> -			if (do_soft_offline(tcases[n].tpages) == EINVAL) {
> +			ret = do_soft_offline(tcases[n].tpages);
> +
> +			if (ret == EINVAL) {
>  				SAFE_KILL(cpid, SIGKILL);
>  				SAFE_WAITPID(cpid, &status, 0);
>  				SAFE_MUNMAP(addr, tcases[n].tpages * hpsz);
> @@ -163,6 +182,7 @@ static void do_test(unsigned int n)
>  			break;
>  	}
>  
> +out:
>  	SAFE_KILL(cpid, SIGKILL);
>  	SAFE_WAITPID(cpid, &status, 0);
>  	if (!WIFEXITED(status))
> -- 
> 2.20.1
>
Li Wang July 26, 2019, 1:21 p.m. UTC | #2
Hi Cyril,

> Why so complicated?
> What about just doing usleep() and continue in case of the failure?

Seems I was falling too deep in this bug itself:).

I thought there are two situations we might hit ENOMEM:
1. In the first loop(i == 0), this should be a different issue and
better to break the test.
2. the test hit EBUSY and race condition results in ENOMEM in a
general loop(i >= 1), here we should retry to do mmap(). Even the
retries get arrive LOOPS the test should goto exit and report PASS
with print run times.

To be honest, your simple way also works for those two situations, we
just need to add one more sentence to judge if 'i == 0' before the
break.

So, I agree with you suggest here.


New found by this test:
===================

----------Patch V1---------
I run the patch v1(with EBUSY ignored and no mmap() retry on ENOMEM)
and always get PASS.

# ./move_pages12
tst_test.c:1100: INFO: Timeout per run is 0h 05m 00s
move_pages12.c:251: INFO: Free RAM 189624232 kB
move_pages12.c:269: INFO: Increasing 2048kB hugepages pool on node 0 to 6
move_pages12.c:279: INFO: Increasing 2048kB hugepages pool on node 1 to 4
move_pages12.c:195: INFO: Allocating and freeing 4 hugepages on node 0
move_pages12.c:195: INFO: Allocating and freeing 4 hugepages on node 1
move_pages12.c:185: PASS: Bug not reproduced
i = 4
move_pages12.c:185: PASS: Bug not reproduced

Summary:
passed   2
failed   0
skipped  0
warnings 0

----------Patch V2---------------
But patch v2(no matter go with my complicated retries or you simple
method), It always gets killed by SIGBUS in the retrys mmap() on
ENOMEM, I guess it is a new kernel problem (not same as the first
SIGBUS without commit 6bc9b56433b).

# ./move_pages12
tst_test.c:1100: INFO: Timeout per run is 0h 05m 00s
move_pages12.c:251: INFO: Free RAM 194212832 kB
move_pages12.c:269: INFO: Increasing 2048kB hugepages pool on node 0 to 4
move_pages12.c:279: INFO: Increasing 2048kB hugepages pool on node 1 to 6
move_pages12.c:195: INFO: Allocating and freeing 4 hugepages on node 0
move_pages12.c:195: INFO: Allocating and freeing 4 hugepages on node 1
move_pages12.c:185: PASS: Bug not reproduced
i = 4
i = 6
i = 8
...
i = 136
i = 138
i = 140
tst_test.c:1145: BROK: Test killed by SIGBUS!

Summary:
passed   1
failed   0
skipped  0
warnings 0
move_pages12.c:114: FAIL: move_pages failed: ESRCH


-----system env-----
# uname -r
5.3.0-rc1+

# numactl -H
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 16 17 18 19
node 0 size: 32171 MB
node 0 free: 25358 MB
node 1 cpus: 4 5 6 7 20 21 22 23
node 1 size: 16125 MB
node 1 free: 5565 MB
node 2 cpus: 8 9 10 11 24 25 26 27
node 2 size: 96765 MB
node 2 free: 90646 MB
node 3 cpus: 12 13 14 15 28 29 30 31
node 3 size: 64482 MB
node 3 free: 60820 MB
node distances:
node   0   1   2   3
  0:  10  11  11  11
  1:  11  10  11  11
  2:  11  11  10  11
  3:  11  11  11  10

So, maybe we have to re-evaluate this patch V2 and to figure out why
the retry mmap() hitting SIGBUS fails.

--
Regards,
Li Wang
Cyril Hrubis July 26, 2019, 1:31 p.m. UTC | #3
Hi!
> -----system env-----
> # uname -r
> 5.3.0-rc1+
> 
> # numactl -H
> available: 4 nodes (0-3)
> node 0 cpus: 0 1 2 3 16 17 18 19
> node 0 size: 32171 MB
> node 0 free: 25358 MB
> node 1 cpus: 4 5 6 7 20 21 22 23
> node 1 size: 16125 MB
> node 1 free: 5565 MB
> node 2 cpus: 8 9 10 11 24 25 26 27
> node 2 size: 96765 MB
> node 2 free: 90646 MB
> node 3 cpus: 12 13 14 15 28 29 30 31
> node 3 size: 64482 MB
> node 3 free: 60820 MB
> node distances:
> node   0   1   2   3
>   0:  10  11  11  11
>   1:  11  10  11  11
>   2:  11  11  10  11
>   3:  11  11  11  10
> 
> So, maybe we have to re-evaluate this patch V2 and to figure out why
> the retry mmap() hitting SIGBUS fails.

One possibility would be that the numa_move_pages() triggers SIGBUS
while we do the usleep() before we attempt to retry the mmap(). In that
case the race was present in the test all the time but couldn't be
triggered because the window where the memory is unmapped was very
short. If that is the case we should as well set up a handler to SIGBUS
and ignore it as well.
Li Wang July 29, 2019, 4:53 a.m. UTC | #4
On Fri, Jul 26, 2019 at 9:31 PM Cyril Hrubis <chrubis@suse.cz> wrote:

> > So, maybe we have to re-evaluate this patch V2 and to figure out why
> > the retry mmap() hitting SIGBUS fails.
>
> One possibility would be that the numa_move_pages() triggers SIGBUS
> while we do the usleep() before we attempt to retry the mmap(). In that
> case the race was present in the test all the time but couldn't be
> triggered because the window where the memory is unmapped was very
> short. If that is the case we should as well set up a handler to SIGBUS
> and ignore it as well.

No, It's not like the numa_move_pages() triggers SIGBUS because in the
end child print:
    move_pages12.c:114: FAIL: move_pages failed: ESRCH
that ESRCH means the child is still alive and detect ppid is not available.

It's more like to retry mmap() triggers SIGBUS while doing the
numa_move_pages() in background. That is very similar to the kernel
bug which was mentioned by commit 6bc9b56433b76e40d(mm: fix race on
soft-offlining ). A race condition between soft offline and
hugetlb_fault which causes unexpected process SIGBUS killing.

And, I will send an email to linux-mm@ to RFC.

--
Regards,
Li Wang
diff mbox series

Patch

diff --git a/testcases/kernel/syscalls/move_pages/move_pages12.c b/testcases/kernel/syscalls/move_pages/move_pages12.c
index 964b712fb..203402cfb 100644
--- a/testcases/kernel/syscalls/move_pages/move_pages12.c
+++ b/testcases/kernel/syscalls/move_pages/move_pages12.c
@@ -77,8 +77,8 @@  static void *addr;
 static int do_soft_offline(int tpgs)
 {
 	if (madvise(addr, tpgs * hpsz, MADV_SOFT_OFFLINE) == -1) {
-		if (errno != EINVAL)
-			tst_res(TFAIL | TTERRNO, "madvise failed");
+		if (errno != EINVAL && errno != EBUSY)
+			tst_res(TFAIL | TERRNO, "madvise failed");
 		return errno;
 	}
 	return 0;
@@ -121,7 +121,8 @@  static void do_child(int tpgs)
 
 static void do_test(unsigned int n)
 {
-	int i;
+	int i, ret, retrys;
+	void *ptr;
 	pid_t cpid = -1;
 	int status;
 	unsigned int twenty_percent = (tst_timeout_remaining() / 5);
@@ -135,19 +136,37 @@  static void do_test(unsigned int n)
 	if (cpid == 0)
 		do_child(tcases[n].tpages);
 
-	for (i = 0; i < LOOPS; i++) {
-		void *ptr;
+	for (i = 0; i < LOOPS; retrys = 0, i++) {
+retry:
+		ptr = mmap(NULL, tcases[n].tpages * hpsz,
+				PROT_READ | PROT_WRITE,
+				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
+		if (ptr == MAP_FAILED) {
+			if (errno == ENOMEM) {
+				if (retrys < LOOPS) {
+					retrys++;
+					usleep(1000);
+					goto retry;
+				}
+
+				if (i > 0) {
+					tst_res(TINFO, "Test run %d times", i);
+					goto out;
+				}
+			}
+
+			tst_brk(TBROK | TERRNO, "Cannot allocate hugepage");
+		}
 
-		ptr = SAFE_MMAP(NULL, tcases[n].tpages * hpsz,
-			PROT_READ | PROT_WRITE,
-			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 		if (ptr != addr)
 			tst_brk(TBROK, "Failed to mmap at desired addr");
 
 		memset(addr, 0, tcases[n].tpages * hpsz);
 
 		if (tcases[n].offline) {
-			if (do_soft_offline(tcases[n].tpages) == EINVAL) {
+			ret = do_soft_offline(tcases[n].tpages);
+
+			if (ret == EINVAL) {
 				SAFE_KILL(cpid, SIGKILL);
 				SAFE_WAITPID(cpid, &status, 0);
 				SAFE_MUNMAP(addr, tcases[n].tpages * hpsz);
@@ -163,6 +182,7 @@  static void do_test(unsigned int n)
 			break;
 	}
 
+out:
 	SAFE_KILL(cpid, SIGKILL);
 	SAFE_WAITPID(cpid, &status, 0);
 	if (!WIFEXITED(status))