diff mbox series

Add test for possible writev() issues with NULL buffer in iovec

Message ID 20210219125728.18580-1-mdoucha@suse.cz
State Changes Requested
Headers show
Series Add test for possible writev() issues with NULL buffer in iovec | expand

Commit Message

Martin Doucha Feb. 19, 2021, 12:57 p.m. UTC
Fixes #790

Signed-off-by: Martin Doucha <mdoucha@suse.cz>
---

This test triggers temporary write of invalid data into test file on some
file systems on kernel 4.4.21 and older.

 runtest/syscalls                            |   1 +
 testcases/kernel/syscalls/writev/.gitignore |   1 +
 testcases/kernel/syscalls/writev/Makefile   |   3 +
 testcases/kernel/syscalls/writev/writev03.c | 142 ++++++++++++++++++++
 4 files changed, 147 insertions(+)
 create mode 100644 testcases/kernel/syscalls/writev/writev03.c

Comments

Richard Palethorpe Feb. 19, 2021, 2:35 p.m. UTC | #1
Hello,

Martin Doucha <mdoucha@suse.cz> writes:

> Fixes #790
>
> Signed-off-by: Martin Doucha <mdoucha@suse.cz>
> ---
>
> This test triggers temporary write of invalid data into test file on some
> file systems on kernel 4.4.21 and older.
>
>  runtest/syscalls                            |   1 +
>  testcases/kernel/syscalls/writev/.gitignore |   1 +
>  testcases/kernel/syscalls/writev/Makefile   |   3 +
>  testcases/kernel/syscalls/writev/writev03.c | 142 ++++++++++++++++++++
>  4 files changed, 147 insertions(+)
>  create mode 100644 testcases/kernel/syscalls/writev/writev03.c
>
> diff --git a/runtest/syscalls b/runtest/syscalls
> index ae47a6d5e..f01d94540 100644
> --- a/runtest/syscalls
> +++ b/runtest/syscalls
> @@ -1675,6 +1675,7 @@ write05 write05
>  
>  writev01 writev01
>  writev02 writev02
> +writev03 writev03
>  writev05 writev05
>  writev06 writev06
>  writev07 writev07
> diff --git a/testcases/kernel/syscalls/writev/.gitignore b/testcases/kernel/syscalls/writev/.gitignore
> index d60da0f43..167779736 100644
> --- a/testcases/kernel/syscalls/writev/.gitignore
> +++ b/testcases/kernel/syscalls/writev/.gitignore
> @@ -1,5 +1,6 @@
>  /writev01
>  /writev02
> +/writev03
>  /writev05
>  /writev06
>  /writev07
> diff --git a/testcases/kernel/syscalls/writev/Makefile b/testcases/kernel/syscalls/writev/Makefile
> index 4844a6910..6627abaed 100644
> --- a/testcases/kernel/syscalls/writev/Makefile
> +++ b/testcases/kernel/syscalls/writev/Makefile
> @@ -9,4 +9,7 @@ endif
>  
>  include $(top_srcdir)/include/mk/testcases.mk
>  
> +writev03: CFLAGS += -pthread
> +writev03: LDLIBS += -lrt
> +
>  include $(top_srcdir)/include/mk/generic_leaf_target.mk
> diff --git a/testcases/kernel/syscalls/writev/writev03.c b/testcases/kernel/syscalls/writev/writev03.c
> new file mode 100644
> index 000000000..f48efccf2
> --- /dev/null
> +++ b/testcases/kernel/syscalls/writev/writev03.c
> @@ -0,0 +1,142 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2021 SUSE LLC <mdoucha@suse.cz>
> + *
> + * Check for potential issues in writev() if the first iovec entry is NULL
> + * and the next one is not present in RAM. This can result in a brief window
> + * where writev() first writes uninitialized data into the file (possibly
> + * exposing internal kernel structures) and then overwrites it with the real
> + * iovec contents later. Bugs fixed in:
> + *
> + *  commit d4690f1e1cdabb4d61207b6787b1605a0dc0aeab
> + *  Author: Al Viro <viro@ZenIV.linux.org.uk>
> + *  Date:   Fri Sep 16 00:11:45 2016 +0100
> + *
> + *  fix iov_iter_fault_in_readable()
> + */
> +
> +#include <sys/uio.h>
> +#include "tst_test.h"
> +#include "tst_fuzzy_sync.h"
> +
> +#define CHUNK_SIZE 256
> +#define BUF_SIZE (2 * CHUNK_SIZE)
> +#define MNTPOINT "mntpoint"
> +#define TEMPFILE MNTPOINT "/test_file"
> +#define MAPFILE MNTPOINT "/map_file"
> +
> +static unsigned char buf[BUF_SIZE], *map_ptr;
> +static int mapfd = -1, writefd = -1;
> +static volatile ssize_t written;

written should be atomic type (see below)

> +static struct tst_fzsync_pair fzsync_pair;
> +struct iovec iov[5];
> +
> +static void setup(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < BUF_SIZE; i++)
> +		buf[i] = i & 0xff;
> +
> +	mapfd = SAFE_OPEN(MAPFILE, O_CREAT|O_RDWR|O_TRUNC, 0644);
> +	SAFE_WRITE(1, mapfd, buf, BUF_SIZE);
> +
> +	tst_fzsync_pair_init(&fzsync_pair);
> +}
> +
> +static void *thread_run(void *arg)
> +{
> +	while (tst_fzsync_run_b(&fzsync_pair)) {
> +		writefd = SAFE_OPEN(TEMPFILE, O_CREAT|O_WRONLY|O_TRUNC, 0644);
> +		written = BUF_SIZE;
> +		tst_fzsync_wait_b(&fzsync_pair);
> +
> +		/*
> +		 * Do *NOT* preload the data using MAP_POPULATE or touching
> +		 * the mapped range. We're testing whether writev() handles
> +		 * fault-in correctly.
> +		 */
> +		map_ptr = SAFE_MMAP(NULL, BUF_SIZE, PROT_READ, MAP_SHARED,
> +			mapfd, 0);

Possibly, instead of recreating the mapping each loop you could call
madvise MADV_DONTNEED on the mapping. In which case it may also be best
to use MAP_PRIVATE as well. Of courese if it is already fast then this
does not matter.

> +		iov[1].iov_base = map_ptr;
> +		iov[1].iov_len = CHUNK_SIZE;
> +		iov[3].iov_base = map_ptr + CHUNK_SIZE;
> +		iov[3].iov_len = CHUNK_SIZE;
> +
> +		tst_fzsync_start_race_b(&fzsync_pair);
> +		written = writev(writefd, iov, ARRAY_SIZE(iov));

To be on the safe side we should write to written with the atomic
functions (see below).

> +		tst_fzsync_end_race_b(&fzsync_pair);
> +
> +		SAFE_MUNMAP(map_ptr, BUF_SIZE);
> +		map_ptr = NULL;
> +		SAFE_CLOSE(writefd);
> +	}
> +
> +	return arg;
> +}
> +
> +static void run(void)
> +{
> +	int fd, failed = 0;
> +	ssize_t total_read;
> +	unsigned char readbuf[BUF_SIZE];
> +
> +	tst_fzsync_pair_reset(&fzsync_pair, thread_run);
> +
> +	while (!failed && tst_fzsync_run_a(&fzsync_pair)) {
> +		tst_fzsync_wait_a(&fzsync_pair);
> +		fd = SAFE_OPEN(TEMPFILE, O_RDONLY);
> +		tst_fzsync_start_race_a(&fzsync_pair);
> +
> +		for (total_read = 0; total_read < written;) {

Also read from written with the tst_atomic functions. This is especially
important for weak memory models because written may not be synchronised
straight away. Then it could block on read().

There is also a small chance some architecture will update ssize_t
non-atomically so written is smaller than expected. This would lead to a
false positive.

I suppose an alternative would be to complete writing the data just using
an ordinary write() call or however you want.

> +			total_read += SAFE_READ(0, fd, readbuf + total_read,
> +				BUF_SIZE - total_read);
> +		}
> +
> +		tst_fzsync_end_race_a(&fzsync_pair);
> +		SAFE_CLOSE(fd);
> +
> +		if (total_read > BUF_SIZE)
> +			tst_brk(TBROK, "read() returned too much data");
> +
> +		if (total_read <= 0)
> +			continue;
> +
> +		if (memcmp(readbuf, buf, (size_t)total_read)) {
> +			tst_res(TFAIL, "writev() wrote invalid data");
> +			failed = 1;
> +			break;
> +		}
> +	}
> +
> +	if (!failed)
> +		tst_res(TPASS, "writev() handles page fault-in correctly");
> +}
> +
> +static void cleanup(void)
> +{
> +	if (map_ptr && map_ptr != MAP_FAILED)
> +		SAFE_MUNMAP(map_ptr, BUF_SIZE);
> +
> +	if (mapfd >= 0)
> +		SAFE_CLOSE(mapfd);
> +
> +	if (writefd >= 0)
> +		SAFE_CLOSE(writefd);
> +
> +	tst_fzsync_pair_cleanup(&fzsync_pair);
> +}
> +
> +static struct tst_test test = {
> +	.test_all = run,
> +	.needs_root = 1,
> +	.mount_device = 1,
> +	.mntpoint = MNTPOINT,
> +	.all_filesystems = 1,
> +	.setup = setup,
> +	.cleanup = cleanup,
> +	.tags = (const struct tst_tag[]) {
> +		{"linux-git", "d4690f1e1cda"},

I guess CVE is on the way?

> +		{}
> +	}
> +};
> -- 
> 2.30.0

Apart from the volatile variable looks good!
Martin Doucha Feb. 19, 2021, 5:37 p.m. UTC | #2
On 19. 02. 21 15:35, Richard Palethorpe wrote:
>> +static void *thread_run(void *arg)
>> +{
>> +	while (tst_fzsync_run_b(&fzsync_pair)) {
>> +		writefd = SAFE_OPEN(TEMPFILE, O_CREAT|O_WRONLY|O_TRUNC, 0644);
>> +		written = BUF_SIZE;
>> +		tst_fzsync_wait_b(&fzsync_pair);
>> +
>> +		/*
>> +		 * Do *NOT* preload the data using MAP_POPULATE or touching
>> +		 * the mapped range. We're testing whether writev() handles
>> +		 * fault-in correctly.
>> +		 */
>> +		map_ptr = SAFE_MMAP(NULL, BUF_SIZE, PROT_READ, MAP_SHARED,
>> +			mapfd, 0);
> 
> Possibly, instead of recreating the mapping each loop you could call
> madvise MADV_DONTNEED on the mapping. In which case it may also be best
> to use MAP_PRIVATE as well. Of courese if it is already fast then this
> does not matter.

I considered using madvise(MADV_DONTNEED) but decided against it because
it might only mark the memory page as stale but otherwise leave the
contents untouched. That would result in writev() always writing the
correct data after the first iteration even if the page-in is completely
broken and the first iteration only passed due to lucky timing.
Recreating the mapping is fast enough and more reliable for reproducing
the expected bugs.

>> +static void run(void)
>> +{
>> +	int fd, failed = 0;
>> +	ssize_t total_read;
>> +	unsigned char readbuf[BUF_SIZE];
>> +
>> +	tst_fzsync_pair_reset(&fzsync_pair, thread_run);
>> +
>> +	while (!failed && tst_fzsync_run_a(&fzsync_pair)) {
>> +		tst_fzsync_wait_a(&fzsync_pair);
>> +		fd = SAFE_OPEN(TEMPFILE, O_RDONLY);
>> +		tst_fzsync_start_race_a(&fzsync_pair);
>> +
>> +		for (total_read = 0; total_read < written;) {
> 
> Also read from written with the tst_atomic functions. This is especially
> important for weak memory models because written may not be synchronised
> straight away. Then it could block on read().
> 
> There is also a small chance some architecture will update ssize_t
> non-atomically so written is smaller than expected. This would lead to a
> false positive.
> 
> I suppose an alternative would be to complete writing the data just using
> an ordinary write() call or however you want.

This test does not care whether writev() actually writes anything to the
test file. Returning -1 is perfectly valid result and the test will
simply skip to the next iteration. The only failure states are:
- read() fails (returning 0 is OK)
- read() finds too much data in the file (I should adjust readbuf size
to actually detect that)
- read() loads something that doesn't match what was supposed to be
written (and in case of short write, we care only about the portion that
was actually written)

> I guess CVE is on the way?

I'm not aware of any existing CVE and the bugfix is almost 4 years old
so I don't expect any.

I'll resubmit on Monday after some improvements, including atomic
load/store.
diff mbox series

Patch

diff --git a/runtest/syscalls b/runtest/syscalls
index ae47a6d5e..f01d94540 100644
--- a/runtest/syscalls
+++ b/runtest/syscalls
@@ -1675,6 +1675,7 @@  write05 write05
 
 writev01 writev01
 writev02 writev02
+writev03 writev03
 writev05 writev05
 writev06 writev06
 writev07 writev07
diff --git a/testcases/kernel/syscalls/writev/.gitignore b/testcases/kernel/syscalls/writev/.gitignore
index d60da0f43..167779736 100644
--- a/testcases/kernel/syscalls/writev/.gitignore
+++ b/testcases/kernel/syscalls/writev/.gitignore
@@ -1,5 +1,6 @@ 
 /writev01
 /writev02
+/writev03
 /writev05
 /writev06
 /writev07
diff --git a/testcases/kernel/syscalls/writev/Makefile b/testcases/kernel/syscalls/writev/Makefile
index 4844a6910..6627abaed 100644
--- a/testcases/kernel/syscalls/writev/Makefile
+++ b/testcases/kernel/syscalls/writev/Makefile
@@ -9,4 +9,7 @@  endif
 
 include $(top_srcdir)/include/mk/testcases.mk
 
+writev03: CFLAGS += -pthread
+writev03: LDLIBS += -lrt
+
 include $(top_srcdir)/include/mk/generic_leaf_target.mk
diff --git a/testcases/kernel/syscalls/writev/writev03.c b/testcases/kernel/syscalls/writev/writev03.c
new file mode 100644
index 000000000..f48efccf2
--- /dev/null
+++ b/testcases/kernel/syscalls/writev/writev03.c
@@ -0,0 +1,142 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2021 SUSE LLC <mdoucha@suse.cz>
+ *
+ * Check for potential issues in writev() if the first iovec entry is NULL
+ * and the next one is not present in RAM. This can result in a brief window
+ * where writev() first writes uninitialized data into the file (possibly
+ * exposing internal kernel structures) and then overwrites it with the real
+ * iovec contents later. Bugs fixed in:
+ *
+ *  commit d4690f1e1cdabb4d61207b6787b1605a0dc0aeab
+ *  Author: Al Viro <viro@ZenIV.linux.org.uk>
+ *  Date:   Fri Sep 16 00:11:45 2016 +0100
+ *
+ *  fix iov_iter_fault_in_readable()
+ */
+
+#include <sys/uio.h>
+#include "tst_test.h"
+#include "tst_fuzzy_sync.h"
+
+#define CHUNK_SIZE 256
+#define BUF_SIZE (2 * CHUNK_SIZE)
+#define MNTPOINT "mntpoint"
+#define TEMPFILE MNTPOINT "/test_file"
+#define MAPFILE MNTPOINT "/map_file"
+
+static unsigned char buf[BUF_SIZE], *map_ptr;
+static int mapfd = -1, writefd = -1;
+static volatile ssize_t written;
+static struct tst_fzsync_pair fzsync_pair;
+struct iovec iov[5];
+
+static void setup(void)
+{
+	int i;
+
+	for (i = 0; i < BUF_SIZE; i++)
+		buf[i] = i & 0xff;
+
+	mapfd = SAFE_OPEN(MAPFILE, O_CREAT|O_RDWR|O_TRUNC, 0644);
+	SAFE_WRITE(1, mapfd, buf, BUF_SIZE);
+
+	tst_fzsync_pair_init(&fzsync_pair);
+}
+
+static void *thread_run(void *arg)
+{
+	while (tst_fzsync_run_b(&fzsync_pair)) {
+		writefd = SAFE_OPEN(TEMPFILE, O_CREAT|O_WRONLY|O_TRUNC, 0644);
+		written = BUF_SIZE;
+		tst_fzsync_wait_b(&fzsync_pair);
+
+		/*
+		 * Do *NOT* preload the data using MAP_POPULATE or touching
+		 * the mapped range. We're testing whether writev() handles
+		 * fault-in correctly.
+		 */
+		map_ptr = SAFE_MMAP(NULL, BUF_SIZE, PROT_READ, MAP_SHARED,
+			mapfd, 0);
+		iov[1].iov_base = map_ptr;
+		iov[1].iov_len = CHUNK_SIZE;
+		iov[3].iov_base = map_ptr + CHUNK_SIZE;
+		iov[3].iov_len = CHUNK_SIZE;
+
+		tst_fzsync_start_race_b(&fzsync_pair);
+		written = writev(writefd, iov, ARRAY_SIZE(iov));
+		tst_fzsync_end_race_b(&fzsync_pair);
+
+		SAFE_MUNMAP(map_ptr, BUF_SIZE);
+		map_ptr = NULL;
+		SAFE_CLOSE(writefd);
+	}
+
+	return arg;
+}
+
+static void run(void)
+{
+	int fd, failed = 0;
+	ssize_t total_read;
+	unsigned char readbuf[BUF_SIZE];
+
+	tst_fzsync_pair_reset(&fzsync_pair, thread_run);
+
+	while (!failed && tst_fzsync_run_a(&fzsync_pair)) {
+		tst_fzsync_wait_a(&fzsync_pair);
+		fd = SAFE_OPEN(TEMPFILE, O_RDONLY);
+		tst_fzsync_start_race_a(&fzsync_pair);
+
+		for (total_read = 0; total_read < written;) {
+			total_read += SAFE_READ(0, fd, readbuf + total_read,
+				BUF_SIZE - total_read);
+		}
+
+		tst_fzsync_end_race_a(&fzsync_pair);
+		SAFE_CLOSE(fd);
+
+		if (total_read > BUF_SIZE)
+			tst_brk(TBROK, "read() returned too much data");
+
+		if (total_read <= 0)
+			continue;
+
+		if (memcmp(readbuf, buf, (size_t)total_read)) {
+			tst_res(TFAIL, "writev() wrote invalid data");
+			failed = 1;
+			break;
+		}
+	}
+
+	if (!failed)
+		tst_res(TPASS, "writev() handles page fault-in correctly");
+}
+
+static void cleanup(void)
+{
+	if (map_ptr && map_ptr != MAP_FAILED)
+		SAFE_MUNMAP(map_ptr, BUF_SIZE);
+
+	if (mapfd >= 0)
+		SAFE_CLOSE(mapfd);
+
+	if (writefd >= 0)
+		SAFE_CLOSE(writefd);
+
+	tst_fzsync_pair_cleanup(&fzsync_pair);
+}
+
+static struct tst_test test = {
+	.test_all = run,
+	.needs_root = 1,
+	.mount_device = 1,
+	.mntpoint = MNTPOINT,
+	.all_filesystems = 1,
+	.setup = setup,
+	.cleanup = cleanup,
+	.tags = (const struct tst_tag[]) {
+		{"linux-git", "d4690f1e1cda"},
+		{}
+	}
+};