mbox series

[0/4,SRU,BIONIC] block: fix silent corruption in Linux kernel 4.15

Message ID 20181009160442.19894-1-colin.king@canonical.com
Headers show
Series block: fix silent corruption in Linux kernel 4.15 | expand

Message

Colin Ian King Oct. 9, 2018, 4:04 p.m. UTC
From: Colin Ian King <colin.king@canonical.com>

Buglink: https://bugs.launchpad.net/bugs/1796542

== SRU Justification ==

A silent data corruption was introduced in v4.10-rc1 with commit
72ecad22d9f198aafee64218512e02ffa7818671 and was fixed in v4.18-rc7
with commit 17d51b10d7773e4618bcac64648f30f12d4078fb. It affects
users of O_DIRECT, in our case a KVM virtual machine with drives
which use qemu's "cache=none" option.

== Fix ==

Upstream commits:

0aa69fd32a5f766e997ca8ab4723c5a1146efa8b
  block: add a lower-level bio_add_page interface

b403ea2404889e1227812fa9657667a1deb9c694
  block: bio_iov_iter_get_pages: fix size of last iovec

9362dd1109f87a9d0a798fbc890cb339c171ed35 
  blkdev: __blkdev_direct_IO_simple: fix leak in error case

17d51b10d7773e4618bcac64648f30f12d4078fb
  block: bio_iov_iter_get_pages: pin more pages for multi-segment IOs

The first 3 patches are required for a clean application of the final
patch that actually addresses the problem with a fix to this known
issue.

== Regression Potential ==

This touches the block layer, so there is risk potential in data
corruption. The fixes have several weeks in the upstream kernel and
so far, I see no subsequent fixes required.

== Test Case ==

Build the program listed below [1]
kudos to Jan Kara, and run with:

dd if=/dev/zero if=loop.img bs=1M count=2048
sudo losetup /dev/loop0 loop.img

./blkdev-dio-test /dev/loop0 0 &
./blkdev-dio-test /dev/loop0 2048 &

Without the fix, ones lost writes fairly soon.  Without the fix, this
runs without any losy write messages.

blkdev-dio-test.c:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <sys/uio.h>

#define PAGE_SIZE 4096
#define SECT_SIZE 512
#define BUF_OFF (2*SECT_SIZE)

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_RDWR | O_DIRECT);
	int ret;
	char *buf;
	loff_t off;
	struct iovec iov[2];
	unsigned int seq;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	off = strtol(argv[2], NULL, 10);

	buf = aligned_alloc(PAGE_SIZE, PAGE_SIZE);

	iov[0].iov_base = buf;
	iov[0].iov_len = SECT_SIZE;
	iov[1].iov_base = buf + BUF_OFF;
	iov[1].iov_len = SECT_SIZE;

	seq = 0;
	memset(buf, 0, PAGE_SIZE);
	while (1) {
		*(unsigned int *)buf = seq;
		*(unsigned int *)(buf + BUF_OFF) = seq;
		ret = pwritev(fd, iov, 2, off);
		if (ret < 0) {
			perror("pwritev");
			return 1;
		}
		if (ret != 2*SECT_SIZE) {
			fprintf(stderr, "Short pwritev: %d\n", ret);
			return 1;
		}
		ret = pread(fd, buf, PAGE_SIZE, off);
		if (ret < 0) {
			perror("pread");
			return 1;
		}
		if (ret != PAGE_SIZE) {
			fprintf(stderr, "Short read: %d\n", ret);
			return 1;
		}
		if (*(unsigned int *)buf != seq ||
		    *(unsigned int *)(buf + SECT_SIZE) != seq) {
			printf("Lost write %u: %u %u\n", seq, *(unsigned int *)buf, *(unsigned int *)(buf + SECT_SIZE));
			return 1;
		}
		seq++;
	}

	return 0;
}

References:
[1] https://www.spinics.net/lists/linux-block/msg28507.html

---

Christoph Hellwig (1):
  block: add a lower-level bio_add_page interface

Martin Wilck (3):
  block: bio_iov_iter_get_pages: fix size of last iovec
  blkdev: __blkdev_direct_IO_simple: fix leak in error case
  block: bio_iov_iter_get_pages: pin more pages for multi-segment IOs

 block/bio.c         | 149 ++++++++++++++++++++++++++++++--------------
 fs/block_dev.c      |   9 +--
 include/linux/bio.h |   9 +++
 3 files changed, 117 insertions(+), 50 deletions(-)

Comments

Stefan Bader Oct. 9, 2018, 5:08 p.m. UTC | #1
On 09.10.2018 18:04, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Buglink: https://bugs.launchpad.net/bugs/1796542
> 
> == SRU Justification ==
> 
> A silent data corruption was introduced in v4.10-rc1 with commit
> 72ecad22d9f198aafee64218512e02ffa7818671 and was fixed in v4.18-rc7
> with commit 17d51b10d7773e4618bcac64648f30f12d4078fb. It affects
> users of O_DIRECT, in our case a KVM virtual machine with drives
> which use qemu's "cache=none" option.
> 
> == Fix ==
> 
> Upstream commits:
> 
> 0aa69fd32a5f766e997ca8ab4723c5a1146efa8b
>   block: add a lower-level bio_add_page interface
> 
> b403ea2404889e1227812fa9657667a1deb9c694
>   block: bio_iov_iter_get_pages: fix size of last iovec
> 
> 9362dd1109f87a9d0a798fbc890cb339c171ed35 
>   blkdev: __blkdev_direct_IO_simple: fix leak in error case
> 
> 17d51b10d7773e4618bcac64648f30f12d4078fb
>   block: bio_iov_iter_get_pages: pin more pages for multi-segment IOs
> 
> The first 3 patches are required for a clean application of the final
> patch that actually addresses the problem with a fix to this known
> issue.
> 
> == Regression Potential ==
> 
> This touches the block layer, so there is risk potential in data
> corruption. The fixes have several weeks in the upstream kernel and
> so far, I see no subsequent fixes required.
> 
> == Test Case ==
> 
> Build the program listed below [1]
> kudos to Jan Kara, and run with:
> 
> dd if=/dev/zero if=loop.img bs=1M count=2048
> sudo losetup /dev/loop0 loop.img
> 
> ./blkdev-dio-test /dev/loop0 0 &
> ./blkdev-dio-test /dev/loop0 2048 &
> 
> Without the fix, ones lost writes fairly soon.  Without the fix, this
> runs without any losy write messages.
> 
> blkdev-dio-test.c:
> 
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <string.h>
> #include <stdlib.h>
> #include <sys/uio.h>
> 
> #define PAGE_SIZE 4096
> #define SECT_SIZE 512
> #define BUF_OFF (2*SECT_SIZE)
> 
> int main(int argc, char **argv)
> {
> 	int fd = open(argv[1], O_RDWR | O_DIRECT);
> 	int ret;
> 	char *buf;
> 	loff_t off;
> 	struct iovec iov[2];
> 	unsigned int seq;
> 
> 	if (fd < 0) {
> 		perror("open");
> 		return 1;
> 	}
> 
> 	off = strtol(argv[2], NULL, 10);
> 
> 	buf = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
> 
> 	iov[0].iov_base = buf;
> 	iov[0].iov_len = SECT_SIZE;
> 	iov[1].iov_base = buf + BUF_OFF;
> 	iov[1].iov_len = SECT_SIZE;
> 
> 	seq = 0;
> 	memset(buf, 0, PAGE_SIZE);
> 	while (1) {
> 		*(unsigned int *)buf = seq;
> 		*(unsigned int *)(buf + BUF_OFF) = seq;
> 		ret = pwritev(fd, iov, 2, off);
> 		if (ret < 0) {
> 			perror("pwritev");
> 			return 1;
> 		}
> 		if (ret != 2*SECT_SIZE) {
> 			fprintf(stderr, "Short pwritev: %d\n", ret);
> 			return 1;
> 		}
> 		ret = pread(fd, buf, PAGE_SIZE, off);
> 		if (ret < 0) {
> 			perror("pread");
> 			return 1;
> 		}
> 		if (ret != PAGE_SIZE) {
> 			fprintf(stderr, "Short read: %d\n", ret);
> 			return 1;
> 		}
> 		if (*(unsigned int *)buf != seq ||
> 		    *(unsigned int *)(buf + SECT_SIZE) != seq) {
> 			printf("Lost write %u: %u %u\n", seq, *(unsigned int *)buf, *(unsigned int *)(buf + SECT_SIZE));
> 			return 1;
> 		}
> 		seq++;
> 	}
> 
> 	return 0;
> }
> 
> References:
> [1] https://www.spinics.net/lists/linux-block/msg28507.html
> 
> ---
> 
> Christoph Hellwig (1):
>   block: add a lower-level bio_add_page interface
> 
> Martin Wilck (3):
>   block: bio_iov_iter_get_pages: fix size of last iovec
>   blkdev: __blkdev_direct_IO_simple: fix leak in error case
>   block: bio_iov_iter_get_pages: pin more pages for multi-segment IOs
> 
>  block/bio.c         | 149 ++++++++++++++++++++++++++++++--------------
>  fs/block_dev.c      |   9 +--
>  include/linux/bio.h |   9 +++
>  3 files changed, 117 insertions(+), 50 deletions(-)
> 
Successful testing and so far changes had no further follow-up changes.

Acked-by: Stefan Bader <stefan.bader@canonical.com>
Kleber Sacilotto de Souza Oct. 10, 2018, 7:24 a.m. UTC | #2
On 10/09/18 18:04, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Buglink: https://bugs.launchpad.net/bugs/1796542
> 
> == SRU Justification ==
> 
> A silent data corruption was introduced in v4.10-rc1 with commit
> 72ecad22d9f198aafee64218512e02ffa7818671 and was fixed in v4.18-rc7
> with commit 17d51b10d7773e4618bcac64648f30f12d4078fb. It affects
> users of O_DIRECT, in our case a KVM virtual machine with drives
> which use qemu's "cache=none" option.
> 
> == Fix ==
> 
> Upstream commits:
> 
> 0aa69fd32a5f766e997ca8ab4723c5a1146efa8b
>   block: add a lower-level bio_add_page interface
> 
> b403ea2404889e1227812fa9657667a1deb9c694
>   block: bio_iov_iter_get_pages: fix size of last iovec
> 
> 9362dd1109f87a9d0a798fbc890cb339c171ed35 
>   blkdev: __blkdev_direct_IO_simple: fix leak in error case
> 
> 17d51b10d7773e4618bcac64648f30f12d4078fb
>   block: bio_iov_iter_get_pages: pin more pages for multi-segment IOs
> 
> The first 3 patches are required for a clean application of the final
> patch that actually addresses the problem with a fix to this known
> issue.
> 
> == Regression Potential ==
> 
> This touches the block layer, so there is risk potential in data
> corruption. The fixes have several weeks in the upstream kernel and
> so far, I see no subsequent fixes required.
> 
> == Test Case ==
> 
> Build the program listed below [1]
> kudos to Jan Kara, and run with:
> 
> dd if=/dev/zero if=loop.img bs=1M count=2048
> sudo losetup /dev/loop0 loop.img
> 
> ./blkdev-dio-test /dev/loop0 0 &
> ./blkdev-dio-test /dev/loop0 2048 &
> 
> Without the fix, ones lost writes fairly soon.  Without the fix, this
> runs without any losy write messages.
> 
> blkdev-dio-test.c:
> 
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <string.h>
> #include <stdlib.h>
> #include <sys/uio.h>
> 
> #define PAGE_SIZE 4096
> #define SECT_SIZE 512
> #define BUF_OFF (2*SECT_SIZE)
> 
> int main(int argc, char **argv)
> {
> 	int fd = open(argv[1], O_RDWR | O_DIRECT);
> 	int ret;
> 	char *buf;
> 	loff_t off;
> 	struct iovec iov[2];
> 	unsigned int seq;
> 
> 	if (fd < 0) {
> 		perror("open");
> 		return 1;
> 	}
> 
> 	off = strtol(argv[2], NULL, 10);
> 
> 	buf = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
> 
> 	iov[0].iov_base = buf;
> 	iov[0].iov_len = SECT_SIZE;
> 	iov[1].iov_base = buf + BUF_OFF;
> 	iov[1].iov_len = SECT_SIZE;
> 
> 	seq = 0;
> 	memset(buf, 0, PAGE_SIZE);
> 	while (1) {
> 		*(unsigned int *)buf = seq;
> 		*(unsigned int *)(buf + BUF_OFF) = seq;
> 		ret = pwritev(fd, iov, 2, off);
> 		if (ret < 0) {
> 			perror("pwritev");
> 			return 1;
> 		}
> 		if (ret != 2*SECT_SIZE) {
> 			fprintf(stderr, "Short pwritev: %d\n", ret);
> 			return 1;
> 		}
> 		ret = pread(fd, buf, PAGE_SIZE, off);
> 		if (ret < 0) {
> 			perror("pread");
> 			return 1;
> 		}
> 		if (ret != PAGE_SIZE) {
> 			fprintf(stderr, "Short read: %d\n", ret);
> 			return 1;
> 		}
> 		if (*(unsigned int *)buf != seq ||
> 		    *(unsigned int *)(buf + SECT_SIZE) != seq) {
> 			printf("Lost write %u: %u %u\n", seq, *(unsigned int *)buf, *(unsigned int *)(buf + SECT_SIZE));
> 			return 1;
> 		}
> 		seq++;
> 	}
> 
> 	return 0;
> }
> 
> References:
> [1] https://www.spinics.net/lists/linux-block/msg28507.html
> 
> ---
> 
> Christoph Hellwig (1):
>   block: add a lower-level bio_add_page interface
> 
> Martin Wilck (3):
>   block: bio_iov_iter_get_pages: fix size of last iovec
>   blkdev: __blkdev_direct_IO_simple: fix leak in error case
>   block: bio_iov_iter_get_pages: pin more pages for multi-segment IOs
> 
>  block/bio.c         | 149 ++++++++++++++++++++++++++++++--------------
>  fs/block_dev.c      |   9 +--
>  include/linux/bio.h |   9 +++
>  3 files changed, 117 insertions(+), 50 deletions(-)
> 

Acked-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
Stefan Bader Oct. 10, 2018, 9:18 a.m. UTC | #3
On 09.10.2018 18:04, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Buglink: https://bugs.launchpad.net/bugs/1796542
> 
> == SRU Justification ==
> 
> A silent data corruption was introduced in v4.10-rc1 with commit
> 72ecad22d9f198aafee64218512e02ffa7818671 and was fixed in v4.18-rc7
> with commit 17d51b10d7773e4618bcac64648f30f12d4078fb. It affects
> users of O_DIRECT, in our case a KVM virtual machine with drives
> which use qemu's "cache=none" option.
> 
> == Fix ==
> 
> Upstream commits:
> 
> 0aa69fd32a5f766e997ca8ab4723c5a1146efa8b
>   block: add a lower-level bio_add_page interface
> 
> b403ea2404889e1227812fa9657667a1deb9c694
>   block: bio_iov_iter_get_pages: fix size of last iovec
> 
> 9362dd1109f87a9d0a798fbc890cb339c171ed35 
>   blkdev: __blkdev_direct_IO_simple: fix leak in error case
> 
> 17d51b10d7773e4618bcac64648f30f12d4078fb
>   block: bio_iov_iter_get_pages: pin more pages for multi-segment IOs
> 
> The first 3 patches are required for a clean application of the final
> patch that actually addresses the problem with a fix to this known
> issue.
> 
> == Regression Potential ==
> 
> This touches the block layer, so there is risk potential in data
> corruption. The fixes have several weeks in the upstream kernel and
> so far, I see no subsequent fixes required.
> 
> == Test Case ==
> 
> Build the program listed below [1]
> kudos to Jan Kara, and run with:
> 
> dd if=/dev/zero if=loop.img bs=1M count=2048
> sudo losetup /dev/loop0 loop.img
> 
> ./blkdev-dio-test /dev/loop0 0 &
> ./blkdev-dio-test /dev/loop0 2048 &
> 
> Without the fix, ones lost writes fairly soon.  Without the fix, this
> runs without any losy write messages.
> 
> blkdev-dio-test.c:
> 
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <string.h>
> #include <stdlib.h>
> #include <sys/uio.h>
> 
> #define PAGE_SIZE 4096
> #define SECT_SIZE 512
> #define BUF_OFF (2*SECT_SIZE)
> 
> int main(int argc, char **argv)
> {
> 	int fd = open(argv[1], O_RDWR | O_DIRECT);
> 	int ret;
> 	char *buf;
> 	loff_t off;
> 	struct iovec iov[2];
> 	unsigned int seq;
> 
> 	if (fd < 0) {
> 		perror("open");
> 		return 1;
> 	}
> 
> 	off = strtol(argv[2], NULL, 10);
> 
> 	buf = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
> 
> 	iov[0].iov_base = buf;
> 	iov[0].iov_len = SECT_SIZE;
> 	iov[1].iov_base = buf + BUF_OFF;
> 	iov[1].iov_len = SECT_SIZE;
> 
> 	seq = 0;
> 	memset(buf, 0, PAGE_SIZE);
> 	while (1) {
> 		*(unsigned int *)buf = seq;
> 		*(unsigned int *)(buf + BUF_OFF) = seq;
> 		ret = pwritev(fd, iov, 2, off);
> 		if (ret < 0) {
> 			perror("pwritev");
> 			return 1;
> 		}
> 		if (ret != 2*SECT_SIZE) {
> 			fprintf(stderr, "Short pwritev: %d\n", ret);
> 			return 1;
> 		}
> 		ret = pread(fd, buf, PAGE_SIZE, off);
> 		if (ret < 0) {
> 			perror("pread");
> 			return 1;
> 		}
> 		if (ret != PAGE_SIZE) {
> 			fprintf(stderr, "Short read: %d\n", ret);
> 			return 1;
> 		}
> 		if (*(unsigned int *)buf != seq ||
> 		    *(unsigned int *)(buf + SECT_SIZE) != seq) {
> 			printf("Lost write %u: %u %u\n", seq, *(unsigned int *)buf, *(unsigned int *)(buf + SECT_SIZE));
> 			return 1;
> 		}
> 		seq++;
> 	}
> 
> 	return 0;
> }
> 
> References:
> [1] https://www.spinics.net/lists/linux-block/msg28507.html
> 
> ---
> 
> Christoph Hellwig (1):
>   block: add a lower-level bio_add_page interface
> 
> Martin Wilck (3):
>   block: bio_iov_iter_get_pages: fix size of last iovec
>   blkdev: __blkdev_direct_IO_simple: fix leak in error case
>   block: bio_iov_iter_get_pages: pin more pages for multi-segment IOs
> 
>  block/bio.c         | 149 ++++++++++++++++++++++++++++++--------------
>  fs/block_dev.c      |   9 +--
>  include/linux/bio.h |   9 +++
>  3 files changed, 117 insertions(+), 50 deletions(-)
> 
Applied to bionic/master-next for re-spin. Thanks.

-Stefan