diff mbox

[BUG] ext2/3/4: dio reads stale data when we do some append dio writes

Message ID 20131119095302.GA4534@gmail.com
State New, archived
Headers show

Commit Message

Zheng Liu Nov. 19, 2013, 9:53 a.m. UTC
Hi all,

Currently we check i_size in buffered read path after we know the page
is updated.  But it could return some zero-filled pages to the userspace
when we do some append dio writes meanwhile we do some dio reads.  We
could use the following code snippet to reproduce this problem in a
ext2/3/4 filesystem.  In the mean time, I run this code snippet against
xfs and btrfs, and they can survive.  Furthermore, if we do some buffered
read, xfs also has the same problem.  So I guess that this might be a
generic problem.

code snippet:
  #define _GNU_SOURCE

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <memory.h>

  #include <unistd.h>
  #include <fcntl.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <errno.h>

  #include <pthread.h>

  #define BUF_ALIGN	1024

  struct writer_data {
  	int fd;
  	size_t blksize;
  	char *buf;
  };

  static void *writer(void *arg)
  {
  	struct writer_data *data = (struct writer_data *)arg;
  	int ret;

  	ret = write(data->fd, data->buf, data->blksize);
  	if (ret < 0)
  		fprintf(stderr, "write file failed: %s\n", strerror(errno));

  	return NULL;
  }

  int main(int argc, char *argv[])
  {
  	pthread_t tid;
  	struct writer_data wdata;
  	size_t max_blocks = 10 * 1024;
  	size_t blksize = 1 * 1024 * 1024;
  	char *rbuf, *wbuf;
  	int readfd, writefd;
  	int i, j;

  	if (argc < 2) {
  		fprintf(stderr, "usage: %s [filename]\n", argv[0]);
  		exit(1);
  	}

  	writefd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
  	if (writefd < 0) {
  		fprintf(stderr, "failed to open wfile: %s\n", strerror(errno));
  		exit(1);
  	}
  	readfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU);
  	if (readfd < 0) {
  		fprintf(stderr, "failed to open rfile: %s\n", strerror(errno));
  		exit(1);
  	}

  	if (posix_memalign((void **)&wbuf, BUF_ALIGN, blksize)) {
  		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
  		exit(1);
  	}

  	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
  		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
  		exit(1);
  	}

  	memset(wbuf, 'a', blksize);

  	wdata.fd = writefd;
  	wdata.blksize = blksize;
  	wdata.buf = wbuf;

  	for (i = 0; i < max_blocks; i++) {
  		void *retval;
  		int ret;

  		ret = pthread_create(&tid, NULL, writer, &wdata);
  		if (ret) {
  			fprintf(stderr, "create thread failed: %s\n", strerror(errno));
  			exit(1);
  		}

  		memset(rbuf, 'b', blksize);
  		do {
  			ret = pread(readfd, rbuf, blksize, i * blksize);
  		} while (ret <= 0);

  		if (ret < 0) {
  			fprintf(stderr, "read file failed: %s\n", strerror(errno));
  			exit(1);
  		}

  		if (pthread_join(tid, &retval)) {
  			fprintf(stderr, "pthread join failed: %s\n", strerror(errno));
  			exit(1);
  		}

  		if (ret >= 0) {
  			for (j = 0; j < ret; j++) {
  				if (rbuf[i] != 'a') {
  					fprintf(stderr, "encounter an error: offset %ld\n",
  						i);
  					goto err;
  				}
  			}
  		}
  	}

  err:
  	free(wbuf);
  	free(rbuf);

  	return 0;
  }

build & run:
  $ gcc code.c -o code -lpthread # build
  $ ./code ${filename} # run

As we expected, we should read nothing or data with 'a'.  But now we
read data with '0'.  I take a closer look at the code and it seems that
there is a bug in vfs.  Let me describe my found here.

  reader					writer
                                                generic_file_aio_write()
                                                ->__generic_file_aio_write()
                                                  ->generic_file_direct_write()
  generic_file_aio_read()
  ->do_generic_file_read()
    [fallback to buffered read]

    ->find_get_page()
    ->page_cache_sync_readahead()
    ->find_get_page()
    [in find_page label, we couldn't find a
     page before and after calling
     page_cache_sync_readahead().  So go to
     no_cached_page label]

    ->page_cache_alloc_cold()
    ->add_to_page_cache_lru()
    [in no_cached_page label, we alloc a page
     and goto readpage label.]

    ->aops->readpage()
    [in readpage label, readpage() callback
     is called and mpage_readpage() return a
     zero-filled page (e.g. ext3/4), and go
     to page_ok label]

                                                  ->a_ops->direct_IO()
                                                  ->i_size_write()
                                                  [we enlarge the i_size]

    Here we check i_size
    [in page_ok label, we check i_size but
     it has been enlarged.  Thus, we pass
     the check and return a zero-filled page]

I attach a patch below to fix this problem in vfs.  However, to be honest, the
fix is very dirty.  But frankly I haven't had a good solution until now.  So I
send this mail to talk about this problem.  Please let me know if I miss
something.

Any comment and idea are always welcome.

Thanks,
						- Zheng

From: Zheng Liu <wenqing.lz@taobao.com>

Subject: [PATCH] vfs: check i_size at the beginning of do_generic_file_read()

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
---
 mm/filemap.c |   13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

Comments

Christoph Hellwig Nov. 19, 2013, 10:22 a.m. UTC | #1
On Tue, Nov 19, 2013 at 05:53:03PM +0800, Zheng Liu wrote:
> code snippet:

Can you please wire this up as a regression test for xfstests?

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zheng Liu Nov. 19, 2013, 10:45 a.m. UTC | #2
On Tue, Nov 19, 2013 at 02:22:35AM -0800, Christoph Hellwig wrote:
> On Tue, Nov 19, 2013 at 05:53:03PM +0800, Zheng Liu wrote:
> > code snippet:
> 
> Can you please wire this up as a regression test for xfstests?
> 

No problem.  I am happy to do that.

BTW, that would be great if you could give me some comments about this
problem itself.  Is it a generic problem in vfs?  Or just a feature by
design?

Thanks,
                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitry Monakhov Nov. 19, 2013, 10:54 a.m. UTC | #3
On Tue, 19 Nov 2013 17:53:03 +0800, Zheng Liu <gnehzuil.liu@gmail.com> wrote:
> Hi all,
> 
> Currently we check i_size in buffered read path after we know the page
> is updated.  But it could return some zero-filled pages to the userspace
> when we do some append dio writes meanwhile we do some dio reads.  We
> could use the following code snippet to reproduce this problem in a
> ext2/3/4 filesystem.  In the mean time, I run this code snippet against
> xfs and btrfs, and they can survive.  Furthermore, if we do some buffered
> read, xfs also has the same problem.  So I guess that this might be a
> generic problem.
> 
> code snippet:
>   #define _GNU_SOURCE
> 
>   #include <stdio.h>
>   #include <stdlib.h>
>   #include <string.h>
>   #include <memory.h>
> 
>   #include <unistd.h>
>   #include <fcntl.h>
>   #include <sys/types.h>
>   #include <sys/stat.h>
>   #include <errno.h>
> 
>   #include <pthread.h>
> 
>   #define BUF_ALIGN	1024
> 
>   struct writer_data {
>   	int fd;
>   	size_t blksize;
>   	char *buf;
>   };
> 
>   static void *writer(void *arg)
>   {
>   	struct writer_data *data = (struct writer_data *)arg;
>   	int ret;
> 
>   	ret = write(data->fd, data->buf, data->blksize);
>   	if (ret < 0)
>   		fprintf(stderr, "write file failed: %s\n", strerror(errno));
> 
>   	return NULL;
>   }
> 
>   int main(int argc, char *argv[])
>   {
>   	pthread_t tid;
>   	struct writer_data wdata;
>   	size_t max_blocks = 10 * 1024;
>   	size_t blksize = 1 * 1024 * 1024;
>   	char *rbuf, *wbuf;
>   	int readfd, writefd;
>   	int i, j;
> 
>   	if (argc < 2) {
>   		fprintf(stderr, "usage: %s [filename]\n", argv[0]);
>   		exit(1);
>   	}
> 
>   	writefd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
>   	if (writefd < 0) {
>   		fprintf(stderr, "failed to open wfile: %s\n", strerror(errno));
>   		exit(1);
>   	}
>   	readfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU);
>   	if (readfd < 0) {
>   		fprintf(stderr, "failed to open rfile: %s\n", strerror(errno));
>   		exit(1);
>   	}
> 
>   	if (posix_memalign((void **)&wbuf, BUF_ALIGN, blksize)) {
>   		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
>   		exit(1);
>   	}
> 
>   	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
>   		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
>   		exit(1);
>   	}
> 
>   	memset(wbuf, 'a', blksize);
> 
>   	wdata.fd = writefd;
>   	wdata.blksize = blksize;
>   	wdata.buf = wbuf;
> 
>   	for (i = 0; i < max_blocks; i++) {
>   		void *retval;
>   		int ret;
> 
>   		ret = pthread_create(&tid, NULL, writer, &wdata);
>   		if (ret) {
>   			fprintf(stderr, "create thread failed: %s\n", strerror(errno));
>   			exit(1);
>   		}
> 
>   		memset(rbuf, 'b', blksize);
>   		do {
>   			ret = pread(readfd, rbuf, blksize, i * blksize);
>   		} while (ret <= 0);
> 
>   		if (ret < 0) {
>   			fprintf(stderr, "read file failed: %s\n", strerror(errno));
>   			exit(1);
>   		}
> 
>   		if (pthread_join(tid, &retval)) {
>   			fprintf(stderr, "pthread join failed: %s\n", strerror(errno));
>   			exit(1);
>   		}
> 
>   		if (ret >= 0) {
>   			for (j = 0; j < ret; j++) {
>   				if (rbuf[i] != 'a') {
>   					fprintf(stderr, "encounter an error: offset %ld\n",
>   						i);
>   					goto err;
>   				}
>   			}
>   		}
>   	}
> 
>   err:
>   	free(wbuf);
>   	free(rbuf);
> 
>   	return 0;
>   }
> 
> build & run:
>   $ gcc code.c -o code -lpthread # build
>   $ ./code ${filename} # run
> 
> As we expected, we should read nothing or data with 'a'.  But now we
> read data with '0'.  I take a closer look at the code and it seems that
> there is a bug in vfs.  Let me describe my found here.
> 
>   reader					writer
>                                                 generic_file_aio_write()
>                                                 ->__generic_file_aio_write()
>                                                   ->generic_file_direct_write()
>   generic_file_aio_read()
>   ->do_generic_file_read()
>     [fallback to buffered read]
> 
>     ->find_get_page()
>     ->page_cache_sync_readahead()
>     ->find_get_page()
>     [in find_page label, we couldn't find a
>      page before and after calling
>      page_cache_sync_readahead().  So go to
>      no_cached_page label]
> 
>     ->page_cache_alloc_cold()
>     ->add_to_page_cache_lru()
>     [in no_cached_page label, we alloc a page
>      and goto readpage label.]
> 
>     ->aops->readpage()
>     [in readpage label, readpage() callback
>      is called and mpage_readpage() return a
>      zero-filled page (e.g. ext3/4), and go
>      to page_ok label]
> 
>                                                   ->a_ops->direct_IO()
>                                                   ->i_size_write()
>                                                   [we enlarge the i_size]
> 
>     Here we check i_size
>     [in page_ok label, we check i_size but
>      it has been enlarged.  Thus, we pass
>      the check and return a zero-filled page]
> 
> I attach a patch below to fix this problem in vfs.  However, to be honest, the
> fix is very dirty.  But frankly I haven't had a good solution until now.  So I
> send this mail to talk about this problem.  Please let me know if I miss
> something.
Looks sane because in orrder to get correct result we have to read
variables in opposite order in comparison to update order.
> 
> Any comment and idea are always welcome.
> 
> Thanks,
> 						- Zheng
> 
> From: Zheng Liu <wenqing.lz@taobao.com>
> 
> Subject: [PATCH] vfs: check i_size at the beginning of do_generic_file_read()
> 
> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> ---
>  mm/filemap.c |   13 +++++++++++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1e6aec4..9de2ad8 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1107,6 +1107,8 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
>  	pgoff_t index;
>  	pgoff_t last_index;
>  	pgoff_t prev_index;
> +	pgoff_t end_index;
> +	loff_t isize;
>  	unsigned long offset;      /* offset into pagecache page */
>  	unsigned int prev_offset;
>  	int error;
> @@ -1117,10 +1119,17 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
>  	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
>  	offset = *ppos & ~PAGE_CACHE_MASK;
>  
> +	/*
> +	 * We must check i_size at the beginning of this function to avoid to return
> +	 * zero-filled page to userspace when the application does append dio writes.
> +	 */
> +	isize = i_size_read(inode);
> +	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
> +	if (unlikely(!isize || index > end_index))
> +		goto out;
> +
This not sufficient because protect from append case only.
following scenario still can result in zefo-filed pages
Let inode has i_size=8192
task1                               task2
pread(, off=4096,sz=4096)           truncate(4096)
 ->do_generic_file_read()
     check i_size OK
                                     ->i_size_write()
                                     ->truncate_pagecache()
     ->page_cache_alloc_cold                        
       ->aops->readpage() ->ZERO
                                    pwrite(off=4096,sz=4096)
                                    ->generic_file_direct_write()
                                    ->i_size_write()
         check i_size OK     
>  	for (;;) {
>  		struct page *page;
> -		pgoff_t end_index;
> -		loff_t isize;
>  		unsigned long nr, ret;
>  
>  		cond_resched();
> -- 
> 1.7.9.7
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 19, 2013, 11:01 a.m. UTC | #4
On Tue, Nov 19, 2013 at 06:45:08PM +0800, Zheng Liu wrote:
> BTW, that would be great if you could give me some comments about this
> problem itself.  Is it a generic problem in vfs?  Or just a feature by
> design?

It seems like a fundamental issue with ext4 (and most older Linux
filesystems) trying to do dio reads without locking out writers.

If you have a shared/exclusive lock to protect dio reads against
buffered or appending writers like XFS does this issue is properly
prevented.
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 19, 2013, 11:18 a.m. UTC | #5
On Tue, Nov 19, 2013 at 07:19:47PM +0800, Zheng Liu wrote:
> Yes, I know that XFS has a shared/exclusive lock.  I guess that is why
> it can pass the test.  But another question is why xfs fails when we do
> some append dio writes with doing buffered read.

Can you provide a test case for that issue?

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zheng Liu Nov. 19, 2013, 11:19 a.m. UTC | #6
On Tue, Nov 19, 2013 at 03:01:47AM -0800, Christoph Hellwig wrote:
> On Tue, Nov 19, 2013 at 06:45:08PM +0800, Zheng Liu wrote:
> > BTW, that would be great if you could give me some comments about this
> > problem itself.  Is it a generic problem in vfs?  Or just a feature by
> > design?
> 
> It seems like a fundamental issue with ext4 (and most older Linux
> filesystems) trying to do dio reads without locking out writers.

Ah, that is a long story for me.  Anyway, thanks for your comment.

> 
> If you have a shared/exclusive lock to protect dio reads against
> buffered or appending writers like XFS does this issue is properly
> prevented.

Yes, I know that XFS has a shared/exclusive lock.  I guess that is why
it can pass the test.  But another question is why xfs fails when we do
some append dio writes with doing buffered read.

Regards,
                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zheng Liu Nov. 19, 2013, 11:45 a.m. UTC | #7
On Tue, Nov 19, 2013 at 02:54:49PM +0400, Dmitry Monakhov wrote:
> On Tue, 19 Nov 2013 17:53:03 +0800, Zheng Liu <gnehzuil.liu@gmail.com> wrote:
[...]
> > As we expected, we should read nothing or data with 'a'.  But now we
> > read data with '0'.  I take a closer look at the code and it seems that
> > there is a bug in vfs.  Let me describe my found here.
> > 
> >   reader					writer
> >                                                 generic_file_aio_write()
> >                                                 ->__generic_file_aio_write()
> >                                                   ->generic_file_direct_write()
> >   generic_file_aio_read()
> >   ->do_generic_file_read()
> >     [fallback to buffered read]
> > 
> >     ->find_get_page()
> >     ->page_cache_sync_readahead()
> >     ->find_get_page()
> >     [in find_page label, we couldn't find a
> >      page before and after calling
> >      page_cache_sync_readahead().  So go to
> >      no_cached_page label]
> > 
> >     ->page_cache_alloc_cold()
> >     ->add_to_page_cache_lru()
> >     [in no_cached_page label, we alloc a page
> >      and goto readpage label.]
> > 
> >     ->aops->readpage()
> >     [in readpage label, readpage() callback
> >      is called and mpage_readpage() return a
> >      zero-filled page (e.g. ext3/4), and go
> >      to page_ok label]
> > 
> >                                                   ->a_ops->direct_IO()
> >                                                   ->i_size_write()
> >                                                   [we enlarge the i_size]
> > 
> >     Here we check i_size
> >     [in page_ok label, we check i_size but
> >      it has been enlarged.  Thus, we pass
> >      the check and return a zero-filled page]
> > 
> > I attach a patch below to fix this problem in vfs.  However, to be honest, the
> > fix is very dirty.  But frankly I haven't had a good solution until now.  So I
> > send this mail to talk about this problem.  Please let me know if I miss
> > something.
> Looks sane because in orrder to get correct result we have to read
> variables in opposite order in comparison to update order.
> > 
> > Any comment and idea are always welcome.
> > 
> > Thanks,
> > 						- Zheng
> > 
> > From: Zheng Liu <wenqing.lz@taobao.com>
> > 
> > Subject: [PATCH] vfs: check i_size at the beginning of do_generic_file_read()
> > 
> > Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> > ---
> >  mm/filemap.c |   13 +++++++++++--
> >  1 file changed, 11 insertions(+), 2 deletions(-)
> > 
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 1e6aec4..9de2ad8 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -1107,6 +1107,8 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
> >  	pgoff_t index;
> >  	pgoff_t last_index;
> >  	pgoff_t prev_index;
> > +	pgoff_t end_index;
> > +	loff_t isize;
> >  	unsigned long offset;      /* offset into pagecache page */
> >  	unsigned int prev_offset;
> >  	int error;
> > @@ -1117,10 +1119,17 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
> >  	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
> >  	offset = *ppos & ~PAGE_CACHE_MASK;
> >  
> > +	/*
> > +	 * We must check i_size at the beginning of this function to avoid to return
> > +	 * zero-filled page to userspace when the application does append dio writes.
> > +	 */
> > +	isize = i_size_read(inode);
> > +	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
> > +	if (unlikely(!isize || index > end_index))
> > +		goto out;
> > +
> This not sufficient because protect from append case only.
> following scenario still can result in zefo-filed pages

If I understand correctly, it couldn't happen.

> Let inode has i_size=8192
> task1                               task2
> pread(, off=4096,sz=4096)           truncate(4096)
>  ->do_generic_file_read()
>      check i_size OK
>                                      ->i_size_write()
>                                      ->truncate_pagecache()
>      ->page_cache_alloc_cold                        
>        ->aops->readpage() ->ZERO
>                                     pwrite(off=4096,sz=4096)
>                                     ->generic_file_direct_write()

                                        ->invalidate_inode_pages2_range()
                                        ->direct_IO()
                                        ->invalidate_inode_pages2_range()
                                        So the page should be invalidated

                                                - Zheng

>                                     ->i_size_write()
>          check i_size OK     
> >  	for (;;) {
> >  		struct page *page;
> > -		pgoff_t end_index;
> > -		loff_t isize;
> >  		unsigned long nr, ret;
> >  
> >  		cond_resched();
> > -- 
> > 1.7.9.7
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zheng Liu Nov. 19, 2013, 11:51 a.m. UTC | #8
On Tue, Nov 19, 2013 at 03:18:26AM -0800, Christoph Hellwig wrote:
> On Tue, Nov 19, 2013 at 07:19:47PM +0800, Zheng Liu wrote:
> > Yes, I know that XFS has a shared/exclusive lock.  I guess that is why
> > it can pass the test.  But another question is why xfs fails when we do
> > some append dio writes with doing buffered read.
> 
> Can you provide a test case for that issue?

Simple.  Reader just need to open this file without O_DIRECT flag.  I
paste the full code snippet below.  Please take care of this line:
	readfd = open(argv[1], /*O_DIRECT|*/O_RDONLY, S_IRWXU);

The result of this program on my own sand box looks like below:
        encounter an error: offset 0

                                                - Zheng

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <memory.h>

#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>

#include <pthread.h>

#define BUF_ALIGN	1024

struct writer_data {
	int fd;
	size_t blksize;
	char *buf;
};

static void *writer(void *arg)
{
	struct writer_data *data = (struct writer_data *)arg;
	int ret;

	ret = write(data->fd, data->buf, data->blksize);
	if (ret < 0)
		fprintf(stderr, "write file failed: %s\n", strerror(errno));

	return NULL;
}

int main(int argc, char *argv[])
{
	pthread_t tid;
	struct writer_data wdata;
	size_t max_blocks = 10 * 1024;
	size_t blksize = 1 * 1024 * 1024;
	char *rbuf, *wbuf;
	int readfd, writefd;
	int i, j;

	if (argc < 2) {
		fprintf(stderr, "usage: %s [filename]\n", argv[0]);
		exit(1);
	}

	writefd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
	if (writefd < 0) {
		fprintf(stderr, "failed to open wfile: %s\n", strerror(errno));
		exit(1);
	}
	readfd = open(argv[1], /*O_DIRECT|*/O_RDONLY, S_IRWXU);
	if (readfd < 0) {
		fprintf(stderr, "failed to open rfile: %s\n", strerror(errno));
		exit(1);
	}

	if (posix_memalign((void **)&wbuf, BUF_ALIGN, blksize)) {
		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
		exit(1);
	}

	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
		exit(1);
	}

	memset(wbuf, 'a', blksize);

	wdata.fd = writefd;
	wdata.blksize = blksize;
	wdata.buf = wbuf;

	for (i = 0; i < max_blocks; i++) {
		void *retval;
		int ret;

		ret = pthread_create(&tid, NULL, writer, &wdata);
		if (ret) {
			fprintf(stderr, "create thread failed: %s\n", strerror(errno));
			exit(1);
		}

		memset(rbuf, 'b', blksize);
		do {
			ret = pread(readfd, rbuf, blksize, i * blksize);
		} while (ret <= 0);

		if (ret < 0) {
			fprintf(stderr, "read file failed: %s\n", strerror(errno));
			exit(1);
		}

		if (pthread_join(tid, &retval)) {
			fprintf(stderr, "pthread join failed: %s\n", strerror(errno));
			exit(1);
		}

		if (ret >= 0) {
			for (j = 0; j < ret; j++) {
				if (rbuf[i] != 'a') {
					fprintf(stderr, "encounter an error: offset %ld\n",
						i);
					goto err;
				}
			}
		}
	}

err:
	free(wbuf);
	free(rbuf);

	return 0;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Nov. 19, 2013, 12:01 p.m. UTC | #9
On Tue, Nov 19, 2013 at 03:18:26AM -0800, Christoph Hellwig wrote:
> On Tue, Nov 19, 2013 at 07:19:47PM +0800, Zheng Liu wrote:
> > Yes, I know that XFS has a shared/exclusive lock.  I guess that is why
> > it can pass the test.  But another question is why xfs fails when we do
> > some append dio writes with doing buffered read.
> 
> Can you provide a test case for that issue?

For XFS, appending direct IO writes only hold the IOLOCK exclusive
for as long as it takes to guarantee that the the region between the
old EOF and the new EOF is full of zeros before it is demoted.  i.e.
once the region is guaranteed not to expose stale data, the
exclusive IO lock is demoted to to a shared lock and a buffered read
is then allowed to proceed concurrently with the DIO write.

Hence even appending writes occur concurrently with buffered reads,
and if the read overlaps the block at the old EOF then the page
brought into the page cache will have zeros in it.

FWIW, there's a wonderful comment in generic_file_direct_write()
that pretty much covers this case:

        /*
         * Finally, try again to invalidate clean pages which might have been
         * cached by non-direct readahead, or faulted in by get_user_pages()
         * if the source of the write was an mmap'ed region of the file
         * we're writing.  Either one is a pretty crazy thing to do,
         * so we don't support it 100%.  If this invalidation
         * fails, tough, the write still worked...
         */

The kernel code simply does not have the exclusion mechanisms to
make concurrent buffered and direct IO robust. This is one of the
problems (amongst many) that we've been looking to solve with an VFS
level IO range lock of some kind....

Cheers,

Dave.
Dave Chinner Nov. 19, 2013, 12:09 p.m. UTC | #10
On Tue, Nov 19, 2013 at 07:51:22PM +0800, Zheng Liu wrote:
> On Tue, Nov 19, 2013 at 03:18:26AM -0800, Christoph Hellwig wrote:
> > On Tue, Nov 19, 2013 at 07:19:47PM +0800, Zheng Liu wrote:
> > > Yes, I know that XFS has a shared/exclusive lock.  I guess that is why
> > > it can pass the test.  But another question is why xfs fails when we do
> > > some append dio writes with doing buffered read.
> > 
> > Can you provide a test case for that issue?
> 
> Simple.  Reader just need to open this file without O_DIRECT flag.  I
> paste the full code snippet below.  Please take care of this line:
> 	readfd = open(argv[1], /*O_DIRECT|*/O_RDONLY, S_IRWXU);
> 
> The result of this program on my own sand box looks like below:
>         encounter an error: offset 0
....
> 		if (ret >= 0) {
> 			for (j = 0; j < ret; j++) {
> 				if (rbuf[i] != 'a') {
> 					fprintf(stderr, "encounter an error: offset %ld\n",
> 						i);
> 					goto err;

Should be checking rbuf[j], perhaps?

Cheers,

Dave.
Zheng Liu Nov. 19, 2013, 12:18 p.m. UTC | #11
On Tue, Nov 19, 2013 at 11:09:29PM +1100, Dave Chinner wrote:
> On Tue, Nov 19, 2013 at 07:51:22PM +0800, Zheng Liu wrote:
> > On Tue, Nov 19, 2013 at 03:18:26AM -0800, Christoph Hellwig wrote:
> > > On Tue, Nov 19, 2013 at 07:19:47PM +0800, Zheng Liu wrote:
> > > > Yes, I know that XFS has a shared/exclusive lock.  I guess that is why
> > > > it can pass the test.  But another question is why xfs fails when we do
> > > > some append dio writes with doing buffered read.
> > > 
> > > Can you provide a test case for that issue?
> > 
> > Simple.  Reader just need to open this file without O_DIRECT flag.  I
> > paste the full code snippet below.  Please take care of this line:
> > 	readfd = open(argv[1], /*O_DIRECT|*/O_RDONLY, S_IRWXU);
> > 
> > The result of this program on my own sand box looks like below:
> >         encounter an error: offset 0
> ....
> > 		if (ret >= 0) {
> > 			for (j = 0; j < ret; j++) {
> > 				if (rbuf[i] != 'a') {
> > 					fprintf(stderr, "encounter an error: offset %ld\n",
> > 						i);
> > 					goto err;
> 
> Should be checking rbuf[j], perhaps?

Oops, it's my fault.  Yes. it should check rbuf[j].

Thanks,
                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zheng Liu Nov. 19, 2013, 12:20 p.m. UTC | #12
On Tue, Nov 19, 2013 at 11:01:12PM +1100, Dave Chinner wrote:
> On Tue, Nov 19, 2013 at 03:18:26AM -0800, Christoph Hellwig wrote:
> > On Tue, Nov 19, 2013 at 07:19:47PM +0800, Zheng Liu wrote:
> > > Yes, I know that XFS has a shared/exclusive lock.  I guess that is why
> > > it can pass the test.  But another question is why xfs fails when we do
> > > some append dio writes with doing buffered read.
> > 
> > Can you provide a test case for that issue?
> 
> For XFS, appending direct IO writes only hold the IOLOCK exclusive
> for as long as it takes to guarantee that the the region between the
> old EOF and the new EOF is full of zeros before it is demoted.  i.e.
> once the region is guaranteed not to expose stale data, the
> exclusive IO lock is demoted to to a shared lock and a buffered read
> is then allowed to proceed concurrently with the DIO write.
> 
> Hence even appending writes occur concurrently with buffered reads,
> and if the read overlaps the block at the old EOF then the page
> brought into the page cache will have zeros in it.
> 
> FWIW, there's a wonderful comment in generic_file_direct_write()
> that pretty much covers this case:
> 
>         /*
>          * Finally, try again to invalidate clean pages which might have been
>          * cached by non-direct readahead, or faulted in by get_user_pages()
>          * if the source of the write was an mmap'ed region of the file
>          * we're writing.  Either one is a pretty crazy thing to do,
>          * so we don't support it 100%.  If this invalidation
>          * fails, tough, the write still worked...
>          */
> 
> The kernel code simply does not have the exclusion mechanisms to
> make concurrent buffered and direct IO robust. This is one of the
> problems (amongst many) that we've been looking to solve with an VFS
> level IO range lock of some kind....

Thanks for pointing it out.

                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Nov. 27, 2013, 11:01 p.m. UTC | #13
Hello,

On Tue 19-11-13 17:53:03, Zheng Liu wrote:
> Currently we check i_size in buffered read path after we know the page
> is updated.  But it could return some zero-filled pages to the userspace
> when we do some append dio writes meanwhile we do some dio reads.  We
> could use the following code snippet to reproduce this problem in a
> ext2/3/4 filesystem.  In the mean time, I run this code snippet against
> xfs and btrfs, and they can survive.  Furthermore, if we do some buffered
> read, xfs also has the same problem.  So I guess that this might be a
> generic problem.
<snip code>
> As we expected, we should read nothing or data with 'a'.  But now we
> read data with '0'.  I take a closer look at the code and it seems that
> there is a bug in vfs.  Let me describe my found here.
> 
>   reader					writer
>                                                 generic_file_aio_write()
>                                                 ->__generic_file_aio_write()
>                                                   ->generic_file_direct_write()
>   generic_file_aio_read()
>   ->do_generic_file_read()
>     [fallback to buffered read]
> 
>     ->find_get_page()
>     ->page_cache_sync_readahead()
>     ->find_get_page()
>     [in find_page label, we couldn't find a
>      page before and after calling
>      page_cache_sync_readahead().  So go to
>      no_cached_page label]
> 
>     ->page_cache_alloc_cold()
>     ->add_to_page_cache_lru()
>     [in no_cached_page label, we alloc a page
>      and goto readpage label.]
> 
>     ->aops->readpage()
>     [in readpage label, readpage() callback
>      is called and mpage_readpage() return a
>      zero-filled page (e.g. ext3/4), and go
>      to page_ok label]
> 
>                                                   ->a_ops->direct_IO()
>                                                   ->i_size_write()
>                                                   [we enlarge the i_size]
> 
>     Here we check i_size
>     [in page_ok label, we check i_size but
>      it has been enlarged.  Thus, we pass
>      the check and return a zero-filled page]
  Yeah, this looks like it can cause the problem you see.

> I attach a patch below to fix this problem in vfs.  However, to be honest, the
> fix is very dirty.  But frankly I haven't had a good solution until now.  So I
> send this mail to talk about this problem.  Please let me know if I miss
> something.
> 
> Any comment and idea are always welcome.
  Well, your patch will fix only the easy scenario but there are other
scenarios which could result in a similar problem - e.g. if the writer
does:
  pwrite(fd, buf, blksize, 0);
  ftruncate(fd, 0);
  pwrite(fd, buf, blksize, 0);
the reader even with the additional check can get unlucky and read zeros
(the first i_size check happens before truncate, the second check happens
after the second pwrite).

To solve this reliably you'd need some lock serializing dio and buffered io
on the file range. The mapping range locks I'm working on will provide
this synchronization but that's not a solution I have now. So until then
some imperfect but simple solution might be OK - but instead of what you
do, I'd prefer to trim desc->count at the beginning of
do_generic_file_read() so that *ppos+desc->count <= i_size. Because your
check also has the disadvantage that it doesn't work when the read doesn't
read just the last page in the file.

								Honza

> From: Zheng Liu <wenqing.lz@taobao.com>
> 
> Subject: [PATCH] vfs: check i_size at the beginning of do_generic_file_read()
> 
> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> ---
>  mm/filemap.c |   13 +++++++++++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1e6aec4..9de2ad8 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1107,6 +1107,8 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
>  	pgoff_t index;
>  	pgoff_t last_index;
>  	pgoff_t prev_index;
> +	pgoff_t end_index;
> +	loff_t isize;
>  	unsigned long offset;      /* offset into pagecache page */
>  	unsigned int prev_offset;
>  	int error;
> @@ -1117,10 +1119,17 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
>  	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
>  	offset = *ppos & ~PAGE_CACHE_MASK;
>  
> +	/*
> +	 * We must check i_size at the beginning of this function to avoid to return
> +	 * zero-filled page to userspace when the application does append dio writes.
> +	 */
> +	isize = i_size_read(inode);
> +	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
> +	if (unlikely(!isize || index > end_index))
> +		goto out;
> +
>  	for (;;) {
>  		struct page *page;
> -		pgoff_t end_index;
> -		loff_t isize;
>  		unsigned long nr, ret;
>  
>  		cond_resched();
> -- 
> 1.7.9.7
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4..9de2ad8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1107,6 +1107,8 @@  static void do_generic_file_read(struct file *filp, loff_t *ppos,
 	pgoff_t index;
 	pgoff_t last_index;
 	pgoff_t prev_index;
+	pgoff_t end_index;
+	loff_t isize;
 	unsigned long offset;      /* offset into pagecache page */
 	unsigned int prev_offset;
 	int error;
@@ -1117,10 +1119,17 @@  static void do_generic_file_read(struct file *filp, loff_t *ppos,
 	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 
+	/*
+	 * We must check i_size at the beginning of this function to avoid to return
+	 * zero-filled page to userspace when the application does append dio writes.
+	 */
+	isize = i_size_read(inode);
+	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+	if (unlikely(!isize || index > end_index))
+		goto out;
+
 	for (;;) {
 		struct page *page;
-		pgoff_t end_index;
-		loff_t isize;
 		unsigned long nr, ret;
 
 		cond_resched();