diff mbox series

cpuset_regression_test: Fix for already existing cpusets

Message ID 20191115101039.43386-1-lkml@jv-coder.de
State Superseded
Headers show
Series cpuset_regression_test: Fix for already existing cpusets | expand

Commit Message

Joerg Vehlow Nov. 15, 2019, 10:10 a.m. UTC
From: Joerg Vehlow <joerg.vehlow@aox-tech.de>

If there are already cpusets defined on the system, that use cpu 0-1,
the test fails, because it tries to exclusively use cpu 0-1 for the
testcase.

The fix sets the cpuset for all cgroups to 0 and disables exclusive
cpu usage for the duration of the test and restores it on cleanup.
For the test only cpu 1 is set as exclusive. This is enough to
trigger the bug this regression test was designed for.
This was tested by reverting the commit mentioned in the testcase.

Signed-off-by: Joerg Vehlow <joerg.vehlow@aox-tech.de>
---
 .../cpuset/cpuset_regression_test.sh          | 58 +++++++++++++++++--
 1 file changed, 54 insertions(+), 4 deletions(-)

Comments

Joerg Vehlow Nov. 16, 2020, 11:58 a.m. UTC | #1
Hi,

a ping for this patch? Is something like that (workaround environment 
issues) not wanted in ltp?

Greets
Jörg

On 11/15/2019 11:10 AM, Joerg Vehlow wrote:
> From: Joerg Vehlow <joerg.vehlow@aox-tech.de>
>
> If there are already cpusets defined on the system, that use cpu 0-1,
> the test fails, because it tries to exclusively use cpu 0-1 for the
> testcase.
>
> The fix sets the cpuset for all cgroups to 0 and disables exclusive
> cpu usage for the duration of the test and restores it on cleanup.
> For the test only cpu 1 is set as exclusive. This is enough to
> trigger the bug this regression test was designed for.
> This was tested by reverting the commit mentioned in the testcase.
>
> Signed-off-by: Joerg Vehlow <joerg.vehlow@aox-tech.de>
> ---
>   .../cpuset/cpuset_regression_test.sh          | 58 +++++++++++++++++--
>   1 file changed, 54 insertions(+), 4 deletions(-)
>
> diff --git a/testcases/kernel/controllers/cpuset/cpuset_regression_test.sh b/testcases/kernel/controllers/cpuset/cpuset_regression_test.sh
> index dccfd91cd..ed5e30f2a 100755
> --- a/testcases/kernel/controllers/cpuset/cpuset_regression_test.sh
> +++ b/testcases/kernel/controllers/cpuset/cpuset_regression_test.sh
> @@ -26,6 +26,49 @@ TCID=cpuset_regression_test
>   TST_TOTAL=1
>   . test.sh
>   
> +# cpuset_backup_and_update <backup_dir> <what> <value>
> +# Create backup of the values of a specific file (<what>)
> +# in all cpuset groups and set the value to <value>
> +# The backup is written to <backup_dir> in the same structure
> +# as in the cpuset filesystem
> +cpuset_backup_and_update()
> +{
> +	local backup_dir=$1
> +	local what=$2
> +	local value=$3
> +	local old_dir=$(pwd)
> +
> +	cd ${root_cpuset_dir}
> +	find . -mindepth 2 -name ${what} -print0 |
> +	while IFS= read -r -d '' file; do
> +		tst_resm TINFO "Backup ${file} ($(cat "${file}"))"
> +		mkdir -p "$(dirname "${backup_dir}/${file}")"
> +		cat "${file}" > "${backup_dir}/${file}"
> +		echo "${value}" > "${file}"
> +	done
> +
> +	cd ${old_dir}
> +}
> +
> +# cpuset_restore <backup_dir> <what>
> +# Restores the value of a file (<what>) in all cpuset
> +# groups from the backup created by cpuset_backup_and_update
> +cpuset_restore()
> +{
> +	local backup_dir=$1
> +	local what=$2
> +	local old_dir=$(pwd)
> +
> +	cd ${backup_dir}
> +	find . -mindepth 2 -name ${what} -print0 |
> +	while IFS= read -r -d '' file; do
> +		tst_resm TINFO "Restore ${file} ($(cat "${file}"))"
> +		cat "${file}" > "${root_cpuset_dir}/${file}"
> +	done
> +
> +	cd ${old_dir}
> +}
> +
>   setup()
>   {
>   	tst_require_root
> @@ -69,6 +112,10 @@ setup()
>   			       "do not exist."
>   	fi
>   
> +	mkdir cpuset_backup
> +	cpuset_backup_and_update "$(pwd)/cpuset_backup" ${cpu_exclusive} 0
> +	cpuset_backup_and_update "$(pwd)/cpuset_backup" cpuset.cpus 0
> +
>   	cpu_exclusive_value=$(cat ${root_cpuset_dir}/${cpu_exclusive})
>   	if [ "${cpu_exclusive_value}" != "1" ];then
>   		echo 1 > ${root_cpuset_dir}/${cpu_exclusive}
> @@ -86,6 +133,9 @@ cleanup()
>   		rmdir ${root_cpuset_dir}/testdir
>   	fi
>   
> +	cpuset_restore "$(pwd)/cpuset_backup" cpuset.cpus
> +	cpuset_restore "$(pwd)/cpuset_backup" ${cpu_exclusive}
> +
>   	if [ "$cpu_exclusive_value" != 1 ]; then
>   		# Need to flush, or may be output:
>   		# "write error: Device or resource busy"
> @@ -129,15 +179,15 @@ cpuset_test()
>   	fi
>   
>   	# ${cpus} is empty at the begin, that maybe make the system *crash*.
> -	echo 0-1 > ${root_cpuset_dir}/testdir/${cpus}
> +	echo 1 > ${root_cpuset_dir}/testdir/${cpus}
>   	if [ $? -ne 0 ]; then
> -		tst_brkm TFAIL "'echo 0-1 >" \
> +		tst_brkm TFAIL "'echo 1 >" \
>   			       "${root_cpuset_dir}/testdir/${cpus}' failed"
>   	fi
>   
>   	local cpus_value=$(cat ${root_cpuset_dir}/testdir/${cpus})
> -	if [ "${cpus_value}" != "0-1" ]; then
> -		tst_brkm TFAIL "${cpus} is '${cpus_value}', expected '0-1'"
> +	if [ "${cpus_value}" != "1" ]; then
> +		tst_brkm TFAIL "${cpus} is '${cpus_value}', expected '1'"
>   	fi
>   
>   	tst_resm TPASS "Bug is not reproduced"
Richard Palethorpe Nov. 16, 2020, 2:46 p.m. UTC | #2
Hello,

Joerg Vehlow <lkml@jv-coder.de> writes:

> Hi,
>
> a ping for this patch? Is something like that (workaround environment 
> issues) not wanted in ltp?

Generally speaking, yes, again you are right to bump it and these tests
are in need of more attention. However I have a couple of concerns about
this.

>
> Greets
> Jörg
>
> On 11/15/2019 11:10 AM, Joerg Vehlow wrote:
>> From: Joerg Vehlow <joerg.vehlow@aox-tech.de>
>>
>> If there are already cpusets defined on the system, that use cpu 0-1,
>> the test fails, because it tries to exclusively use cpu 0-1 for the
>> testcase.
>>
>> The fix sets the cpuset for all cgroups to 0 and disables exclusive
>> cpu usage for the duration of the test and restores it on cleanup.
>> For the test only cpu 1 is set as exclusive. This is enough to
>> trigger the bug this regression test was designed for.
>> This was tested by reverting the commit mentioned in the testcase.

If the system has already set exclusive cpus then it is unlikely this
regression effects it. Either the kernel has been patched or the system
manager configures the cpus first before setting the exclusive knob.

Normally I would say the test should try to run anyway, but you are
having to make some intrusive changes to the cgroup setup which could
lead to other problems.

So why not just call 'tst_brk TCONF' if the system already has exclusive
cpus configured?
Joerg Vehlow Dec. 4, 2020, 10:32 a.m. UTC | #3
Hi,
On 11/16/2020 3:46 PM, Richard Palethorpe wrote:
> If the system has already set exclusive cpus then it is unlikely this
> regression effects it. Either the kernel has been patched or the system
> manager configures the cpus first before setting the exclusive knob.
Yes "either or". If the system manager or whatever configured the 
cgroups did it in the
"right" order, that cannot trigger the bug, we do not know, if the bug 
still exists.

> Normally I would say the test should try to run anyway, but you are
> having to make some intrusive changes to the cgroup setup which could
> lead to other problems.
>
> So why not just call 'tst_brk TCONF' if the system already has exclusive
> cpus configured?
The question is, should ltp try hard to run a test or not. You may be right,
that this could have other effects, but ltp tests can crash a system anyway,
so I wouldn't worry about that. Of course TCONF would be simpler, but it 
would
also skip the test...

Do you have a scenario in mind, where changing the cpusets could potentially
cause problems? This would require a system, where something meaningful is
running, that requires specific cpu time or a specific cpu. But if that 
would
be the case, all ltp tests could interfere with it right?

Jörg
Richard Palethorpe Dec. 7, 2020, 10:41 a.m. UTC | #4
Hello Joerg,

Joerg Vehlow <lkml@jv-coder.de> writes:

> Hi,
> On 11/16/2020 3:46 PM, Richard Palethorpe wrote:
>> If the system has already set exclusive cpus then it is unlikely this
>> regression effects it. Either the kernel has been patched or the system
>> manager configures the cpus first before setting the exclusive knob.
> Yes "either or". If the system manager or whatever configured the
> cgroups did it in the
> "right" order, that cannot trigger the bug, we do not know, if the bug
> still exists.

Yes and this is why I would normally say we should still try to find the
bug.

>
>> Normally I would say the test should try to run anyway, but you are
>> having to make some intrusive changes to the cgroup setup which could
>> lead to other problems.
>>
>> So why not just call 'tst_brk TCONF' if the system already has exclusive
>> cpus configured?
> The question is, should ltp try hard to run a test or not. You may be right,
> that this could have other effects, but ltp tests can crash a system anyway,
> so I wouldn't worry about that. Of course TCONF would be simpler, but
> it would
> also skip the test...

In general we have the rule that tests should try to leave the system in
a working state. Sometimes that is not possible, but that is usually
only if a test triggers a serious issue.

>
> Do you have a scenario in mind, where changing the cpusets could potentially
> cause problems? This would require a system, where something meaningful is
> running, that requires specific cpu time or a specific cpu. But if
> that would
> be the case, all ltp tests could interfere with it right?
>
> Jörg

If we assume there is a good reason for having exclusive cpusets, even
if we don't know that reason, then we can't just remove them and expect
the system to continue working. Possibly it will even cause errors in
later unrelated tests and it will take some time for somebody to figure
out that it is due to a process running on the wrong CPU.

I assume that if a particular CGroup has exclusive CPU access then
processes in the root CGroup will not run in it. However if they do then
the user may run LTP tests in a leaf CGroup. So you can't assume all
tests would break such a system.

OTOH TCONF is often ignored, but this seems like quite a small and
tricky corner case that we are adding complexity for.
diff mbox series

Patch

diff --git a/testcases/kernel/controllers/cpuset/cpuset_regression_test.sh b/testcases/kernel/controllers/cpuset/cpuset_regression_test.sh
index dccfd91cd..ed5e30f2a 100755
--- a/testcases/kernel/controllers/cpuset/cpuset_regression_test.sh
+++ b/testcases/kernel/controllers/cpuset/cpuset_regression_test.sh
@@ -26,6 +26,49 @@  TCID=cpuset_regression_test
 TST_TOTAL=1
 . test.sh
 
+# cpuset_backup_and_update <backup_dir> <what> <value>
+# Create backup of the values of a specific file (<what>)
+# in all cpuset groups and set the value to <value>
+# The backup is written to <backup_dir> in the same structure
+# as in the cpuset filesystem
+cpuset_backup_and_update()
+{
+	local backup_dir=$1
+	local what=$2
+	local value=$3
+	local old_dir=$(pwd)
+
+	cd ${root_cpuset_dir}
+	find . -mindepth 2 -name ${what} -print0 |
+	while IFS= read -r -d '' file; do
+		tst_resm TINFO "Backup ${file} ($(cat "${file}"))"
+		mkdir -p "$(dirname "${backup_dir}/${file}")"
+		cat "${file}" > "${backup_dir}/${file}"
+		echo "${value}" > "${file}"
+	done
+
+	cd ${old_dir}
+}
+
+# cpuset_restore <backup_dir> <what>
+# Restores the value of a file (<what>) in all cpuset
+# groups from the backup created by cpuset_backup_and_update
+cpuset_restore()
+{
+	local backup_dir=$1
+	local what=$2
+	local old_dir=$(pwd)
+
+	cd ${backup_dir}
+	find . -mindepth 2 -name ${what} -print0 |
+	while IFS= read -r -d '' file; do
+		tst_resm TINFO "Restore ${file} ($(cat "${file}"))"
+		cat "${file}" > "${root_cpuset_dir}/${file}"
+	done
+
+	cd ${old_dir}
+}
+
 setup()
 {
 	tst_require_root
@@ -69,6 +112,10 @@  setup()
 			       "do not exist."
 	fi
 
+	mkdir cpuset_backup
+	cpuset_backup_and_update "$(pwd)/cpuset_backup" ${cpu_exclusive} 0
+	cpuset_backup_and_update "$(pwd)/cpuset_backup" cpuset.cpus 0
+
 	cpu_exclusive_value=$(cat ${root_cpuset_dir}/${cpu_exclusive})
 	if [ "${cpu_exclusive_value}" != "1" ];then
 		echo 1 > ${root_cpuset_dir}/${cpu_exclusive}
@@ -86,6 +133,9 @@  cleanup()
 		rmdir ${root_cpuset_dir}/testdir
 	fi
 
+	cpuset_restore "$(pwd)/cpuset_backup" cpuset.cpus
+	cpuset_restore "$(pwd)/cpuset_backup" ${cpu_exclusive}
+
 	if [ "$cpu_exclusive_value" != 1 ]; then
 		# Need to flush, or may be output:
 		# "write error: Device or resource busy"
@@ -129,15 +179,15 @@  cpuset_test()
 	fi
 
 	# ${cpus} is empty at the begin, that maybe make the system *crash*.
-	echo 0-1 > ${root_cpuset_dir}/testdir/${cpus}
+	echo 1 > ${root_cpuset_dir}/testdir/${cpus}
 	if [ $? -ne 0 ]; then
-		tst_brkm TFAIL "'echo 0-1 >" \
+		tst_brkm TFAIL "'echo 1 >" \
 			       "${root_cpuset_dir}/testdir/${cpus}' failed"
 	fi
 
 	local cpus_value=$(cat ${root_cpuset_dir}/testdir/${cpus})
-	if [ "${cpus_value}" != "0-1" ]; then
-		tst_brkm TFAIL "${cpus} is '${cpus_value}', expected '0-1'"
+	if [ "${cpus_value}" != "1" ]; then
+		tst_brkm TFAIL "${cpus} is '${cpus_value}', expected '1'"
 	fi
 
 	tst_resm TPASS "Bug is not reproduced"