[2/2,WORK-IN-PROGRESS] lib/tst_test: Dump stack for test processes stuck in kernel

Message ID 20180627152217.7067-2-chrubis@suse.cz
State New
Headers show
Series
  • Untitled series #52434
Related show

Commit Message

Cyril Hrubis June 27, 2018, 3:22 p.m.
This commit adds a small helper library that finds the process(es) belonging
to a given process group ID and dumps their kernel stacks.

Example output:

$ ./shmctl05
tst_test.c:1015: INFO: Timeout per run is 0h 00m 10s
Test timeouted, sending SIGKILL!
tst_test.c:1059: TFAIL: Test process child stuck in the kernel!
tst_find_pid.c:90: INFO: Pid 1272 stuck in kernel!
Kernel stacktrace follows:
[<ffffffffa3c12564>] __switch_to_asm+0x34/0x70
[<ffffffffa3c12570>] __switch_to_asm+0x40/0x70
[<ffffffffa3625761>] __switch_to+0x2c1/0x6e0
[<ffffffffa393e194>] call_rwsem_down_read_failed+0x14/0x30
[<ffffffffa3704802>] acct_collect+0x42/0x1a0
[<ffffffffa367d36a>] do_exit+0x74a/0xaf0
[<ffffffffa3c13d27>] rewind_stack_do_exit+0x17/0x20
[<ffffffffffffffff>] 0xffffffffffffffff
tst_test.c:1061: FAIL: Congratulation, likely test hit a kernel bug.

TODO: The main test process uses a signal handler and alarm() to call _exit()
      if the child process that executes the actual test times out. We need to
      redesign this if we want to dump the stack in that case as well.

Signed-off-by: Cyril Hrubis <chrubis@suse.cz>
CC: Jan Stancek <jstancek@redhat.com>
---
 include/tst_dump_stacks.h |  25 +++++++++++
 lib/tst_dump_stacks.c     | 108 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/tst_test.c            |   3 +-
 3 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 include/tst_dump_stacks.h
 create mode 100644 lib/tst_dump_stacks.c

Comments

Jan Stancek June 28, 2018, 1:05 p.m. | #1
----- Original Message -----
> This commit adds a small helper library to find a process(es) given a
> process group ID and dump their stacks.
> 
> Example output:
> 
> $ ./shmctl05
> tst_test.c:1015: INFO: Timeout per run is 0h 00m 10s
> Test timeouted, sending SIGKILL!
> tst_test.c:1059: TFAIL: Test process child stuck in the kernel!
> tst_find_pid.c:90: INFO: Pid 1272 stuck in kernel!
> Kernel stacktrace follows:
> [<ffffffffa3c12564>] __switch_to_asm+0x34/0x70
> [<ffffffffa3c12570>] __switch_to_asm+0x40/0x70
> [<ffffffffa3625761>] __switch_to+0x2c1/0x6e0
> [<ffffffffa393e194>] call_rwsem_down_read_failed+0x14/0x30
> [<ffffffffa3704802>] acct_collect+0x42/0x1a0
> [<ffffffffa367d36a>] do_exit+0x74a/0xaf0
> [<ffffffffa3c13d27>] rewind_stack_do_exit+0x17/0x20
> [<ffffffffffffffff>] 0xffffffffffffffff
> tst_test.c:1061: FAIL: Congratulation, likely test hit a kernel bug.
> 
> TODO: The main test process uses a signal handler and alarm() to call _exit()
>       if the child process that executes the actual test times out. We need
>       to redesign this if we want to dump the stack in that case as well.

Hi,

What if we dropped _exit() from the signal handler and left all the
killing to the code added in patch 1/2 of this series?

The signal handler would then only note that we hit the timeout:

static void alarm_handler(int sig LTP_ATTRIBUTE_UNUSED)
{
        WRITE_MSG("Test timed out!\n");
        ++timeout_hit;
}

and fork_testrun() would periodically check for it:

do {
    usleep(10000);
    ret = SAFE_WAITPID(test_pid, &status, WNOHANG);
} while (ret == 0 || timeout_hit == 0);

// try to kill process group here

> 
> Signed-off-by: Cyril Hrubis <chrubis@suse.cz>
> CC: Jan Stancek <jstancek@redhat.com>
> ---
>  include/tst_dump_stacks.h |  25 +++++++++++
>  lib/tst_dump_stacks.c     | 108
>  ++++++++++++++++++++++++++++++++++++++++++++++
>  lib/tst_test.c            |   3 +-
>  3 files changed, 135 insertions(+), 1 deletion(-)
>  create mode 100644 include/tst_dump_stacks.h
>  create mode 100644 lib/tst_dump_stacks.c
> 
> diff --git a/include/tst_dump_stacks.h b/include/tst_dump_stacks.h
> new file mode 100644
> index 000000000..643cc58a8
> --- /dev/null
> +++ b/include/tst_dump_stacks.h
> @@ -0,0 +1,25 @@
> +/*
> + * Copyright (c) 2018 Cyril Hrubis <chrubis@suse.cz>
> + *
> + * This program is free software: you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation, either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef TST_DUMP_STACKS__
> +#define TST_DUMP_STACKS__
> +
> +void tst_dump_stacks_by_pgid(pid_t pgid);
> +
> +void tst_dump_stack_by_pid(pid_t pid);
> +
> +#endif /* TST_DUMP_STACKS__ */
> diff --git a/lib/tst_dump_stacks.c b/lib/tst_dump_stacks.c
> new file mode 100644
> index 000000000..aa97c6820
> --- /dev/null
> +++ b/lib/tst_dump_stacks.c
> @@ -0,0 +1,108 @@
> +/*
> + * Copyright (c) 2018 Cyril Hrubis <chrubis@suse.cz>
> + *
> + * This program is free software: you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation, either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <ctype.h>
> +#include <stdio.h>
> +
> +#define TST_NO_DEFAULT_MAIN 1
> +#include "tst_test.h"
> +
> +static void *process_search_init(void)
> +{
> +	DIR *dir = SAFE_OPENDIR("/proc/");
> +
> +	return dir;
> +}
> +
> +static int is_number(const char *str)
> +{
> +	do {
> +		if (!isdigit(*str))
> +			return 0;
> +	} while (*(++str));
> +
> +	return 1;
> +}
> +
> +static int process_search_pgid_next(void *pid_search, pid_t pgid)
> +{
> +	struct dirent *ent;
> +	DIR *dir = pid_search;
> +	char path[1024];
> +	int ppgid, pid;
> +	FILE *f;
> +
> +	while ((ent = readdir(dir))) {
> +		if (ent->d_type != DT_DIR)
> +			continue;
> +		if (!is_number(ent->d_name))
> +			continue;
> +
> +		snprintf(path, sizeof(path), "/proc/%s/stat", ent->d_name);
> +
> +		f = fopen(path, "r");
> +		if (!f)
> +			continue;
> +
> +		if (fscanf(f, "%i %*s %*c %*i %i", &pid, &ppgid) != 2) {
> +			tst_res(TWARN, "fscanf(%s) failed!", ent->d_name);
> +			fclose(f);
> +			continue;
> +		}
> +
> +		fclose(f);
> +
> +		if (ppgid == pgid)
> +			break;
> +	}
> +
> +	if (ent)
> +		return pid;
> +
> +	closedir(dir);
> +	return -1;
> +}
> +
> +void tst_dump_stack_by_pid(pid_t pid)
> +{
> +	int fd, len;
> +	char buf[512];
> +	char path[1024];
> +
> +	tst_res(TINFO, "Pid %i stuck in kernel!", pid);
> +
> +	fprintf(stderr, "Kernel stacktrace follows:\n");
> +	fflush(stderr);
> +
> +	snprintf(path, sizeof(path), "/proc/%i/stack", pid);
> +
> +	fd = SAFE_OPEN(path, O_RDONLY);
> +
> +	while ((len = SAFE_READ(0, fd, buf, sizeof(buf))) > 0)
> +		SAFE_WRITE(1, 2, buf, len);
> +
> +	SAFE_CLOSE(fd);
> +}
> +
> +void tst_dump_stacks_by_pgid(pid_t pgid)
> +{
> +	void *ps = process_search_init();
> +	int pid;
> +
> +	while ((pid = process_search_pgid_next(ps, pgid)) != -1)
> +		tst_dump_stack_by_pid(pid);
> +}
> diff --git a/lib/tst_test.c b/lib/tst_test.c
> index 329168a24..d9476c02c 100644
> --- a/lib/tst_test.c
> +++ b/lib/tst_test.c
> @@ -1058,7 +1058,8 @@ static int fork_testrun(void)
>  		if (retries++ <= 14)
>  			continue;
>  
> -		tst_res(TFAIL, "Test process child stuck in the kernel!");
> +		tst_res(TFAIL, "Test process child(ren) stuck in the kernel!");
> +		tst_dump_stacks_by_pgid(test_pid);
>  		tst_brk(TFAIL, "Congratulation, likely test hit a kernel bug.");
>  	}

Looks good to me.

Regards,
Jan

Patch

diff --git a/include/tst_dump_stacks.h b/include/tst_dump_stacks.h
new file mode 100644
index 000000000..643cc58a8
--- /dev/null
+++ b/include/tst_dump_stacks.h
@@ -0,0 +1,25 @@ 
+/*
+ * Copyright (c) 2018 Cyril Hrubis <chrubis@suse.cz>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TST_DUMP_STACKS__
+#define TST_DUMP_STACKS__
+
+void tst_dump_stacks_by_pgid(pid_t pgid);
+
+void tst_dump_stack_by_pid(pid_t pid);
+
+#endif /* TST_DUMP_STACKS__ */
diff --git a/lib/tst_dump_stacks.c b/lib/tst_dump_stacks.c
new file mode 100644
index 000000000..aa97c6820
--- /dev/null
+++ b/lib/tst_dump_stacks.c
@@ -0,0 +1,108 @@ 
+/*
+ * Copyright (c) 2018 Cyril Hrubis <chrubis@suse.cz>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+
+#define TST_NO_DEFAULT_MAIN 1
+#include "tst_test.h"
+
+static void *process_search_init(void)
+{
+	DIR *dir = SAFE_OPENDIR("/proc/");
+
+	return dir;
+}
+
+static int is_number(const char *str)
+{
+	do {
+		if (!isdigit(*str))
+			return 0;
+	} while (*(++str));
+
+	return 1;
+}
+
+static int process_search_pgid_next(void *pid_search, pid_t pgid)
+{
+	struct dirent *ent;
+	DIR *dir = pid_search;
+	char path[1024];
+	int ppgid, pid;
+	FILE *f;
+
+	while ((ent = readdir(dir))) {
+		if (ent->d_type != DT_DIR)
+			continue;
+		if (!is_number(ent->d_name))
+			continue;
+
+		snprintf(path, sizeof(path), "/proc/%s/stat", ent->d_name);
+
+		f = fopen(path, "r");
+		if (!f)
+			continue;
+
+		if (fscanf(f, "%i %*s %*c %*i %i", &pid, &ppgid) != 2) {
+			tst_res(TWARN, "fscanf(%s) failed!", ent->d_name);
+			fclose(f);
+			continue;
+		}
+
+		fclose(f);
+
+		if (ppgid == pgid)
+			break;
+	}
+
+	if (ent)
+		return pid;
+
+	closedir(dir);
+	return -1;
+}
+
+void tst_dump_stack_by_pid(pid_t pid)
+{
+	int fd, len;
+	char buf[512];
+	char path[1024];
+
+	tst_res(TINFO, "Pid %i stuck in kernel!", pid);
+
+	fprintf(stderr, "Kernel stacktrace follows:\n");
+	fflush(stderr);
+
+	snprintf(path, sizeof(path), "/proc/%i/stack", pid);
+
+	fd = SAFE_OPEN(path, O_RDONLY);
+
+	while ((len = SAFE_READ(0, fd, buf, sizeof(buf))) > 0)
+		SAFE_WRITE(1, 2, buf, len);
+
+	SAFE_CLOSE(fd);
+}
+
+void tst_dump_stacks_by_pgid(pid_t pgid)
+{
+	void *ps = process_search_init();
+	int pid;
+
+	while ((pid = process_search_pgid_next(ps, pgid)) != -1)
+		tst_dump_stack_by_pid(pid);
+}
diff --git a/lib/tst_test.c b/lib/tst_test.c
index 329168a24..d9476c02c 100644
--- a/lib/tst_test.c
+++ b/lib/tst_test.c
@@ -1058,7 +1058,8 @@  static int fork_testrun(void)
 		if (retries++ <= 14)
 			continue;
 
-		tst_res(TFAIL, "Test process child stuck in the kernel!");
+		tst_res(TFAIL, "Test process child(ren) stuck in the kernel!");
+		tst_dump_stacks_by_pgid(test_pid);
 		tst_brk(TFAIL, "Congratulation, likely test hit a kernel bug.");
 	}