[v2] iomap: report collisions between directio and buffered writes to userspace

Message ID 20171117193925.GM5119@magnolia
State New
Headers show
Series
  • [v2] iomap: report collisions between directio and buffered writes to userspace
Related show

Commit Message

Darrick J. Wong Nov. 17, 2017, 7:39 p.m.
From: Darrick J. Wong <darrick.wong@oracle.com>

If two programs simultaneously try to write to the same part of a file
via direct IO and buffered IO, there's a chance that the post-diowrite
pagecache invalidation will fail on the dirty page.  When this happens,
the dio write succeeded, which means that the page cache is no longer
coherent with the disk!

Programs are not supposed to mix IO types and this is a clear case of
data corruption, so store an EIO which will be reflected to userspace
during the next fsync.  Replace the WARN_ON with a ratelimited pr_crit
so that the developers have /some/ kind of breadcrumb to track down the
offending program(s) and file(s) involved.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/direct-io.c     |   24 +++++++++++++++++++++++-
 fs/iomap.c         |   12 ++++++++++--
 include/linux/fs.h |    1 +
 3 files changed, 34 insertions(+), 3 deletions(-)

Comments

Liu Bo Nov. 17, 2017, 10:56 p.m. | #1
On Fri, Nov 17, 2017 at 11:39:25AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> If two programs simultaneously try to write to the same part of a file
> via direct IO and buffered IO, there's a chance that the post-diowrite
> pagecache invalidation will fail on the dirty page.  When this happens,
> the dio write succeeded, which means that the page cache is no longer
> coherent with the disk!
> 
> Programs are not supposed to mix IO types and this is a clear case of
> data corruption, so store an EIO which will be reflected to userspace
> during the next fsync.  Replace the WARN_ON with a ratelimited pr_crit
> so that the developers have /some/ kind of breadcrumb to track down the
> offending program(s) and file(s) involved.
>

Looks good to me, thanks for addressing the warning.

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>

Thanks,

-liubo

> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/direct-io.c     |   24 +++++++++++++++++++++++-
>  fs/iomap.c         |   12 ++++++++++--
>  include/linux/fs.h |    1 +
>  3 files changed, 34 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 98fe132..ef5d12a 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -219,6 +219,27 @@ static inline struct page *dio_get_page(struct dio *dio,
>  	return dio->pages[sdio->head];
>  }
>  
> +/*
> + * Warn about a page cache invalidation failure during a direct io write.
> + */
> +void dio_warn_stale_pagecache(struct file *filp)
> +{
> +	static DEFINE_RATELIMIT_STATE(_rs, 30 * HZ, DEFAULT_RATELIMIT_BURST);
> +	char pathname[128];
> +	struct inode *inode = file_inode(filp);
> +	char *path;
> +
> +	errseq_set(&inode->i_mapping->wb_err, -EIO);
> +	if (__ratelimit(&_rs)) {
> +		path = file_path(filp, pathname, sizeof(pathname));
> +		if (IS_ERR(path))
> +			path = "(unknown)";
> +		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
> +		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
> +			current->comm);
> +	}
> +}
> +
>  /**
>   * dio_complete() - called when all DIO BIO I/O has been completed
>   * @offset: the byte offset in the file of the completed operation
> @@ -290,7 +311,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
>  		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
>  					offset >> PAGE_SHIFT,
>  					(offset + ret - 1) >> PAGE_SHIFT);
> -		WARN_ON_ONCE(err);
> +		if (err)
> +			dio_warn_stale_pagecache(dio->iocb->ki_filp);
>  	}
>  
>  	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 5011a96..028f329 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -753,7 +753,8 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  		err = invalidate_inode_pages2_range(inode->i_mapping,
>  				offset >> PAGE_SHIFT,
>  				(offset + dio->size - 1) >> PAGE_SHIFT);
> -		WARN_ON_ONCE(err);
> +		if (err)
> +			dio_warn_stale_pagecache(iocb->ki_filp);
>  	}
>  
>  	inode_dio_end(file_inode(iocb->ki_filp));
> @@ -1012,9 +1013,16 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	if (ret)
>  		goto out_free_dio;
>  
> +	/*
> +	 * Try to invalidate cache pages for the range we're direct
> +	 * writing.  If this invalidation fails, tough, the write will
> +	 * still work, but racing two incompatible write paths is a
> +	 * pretty crazy thing to do, so we don't support it 100%.
> +	 */
>  	ret = invalidate_inode_pages2_range(mapping,
>  			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> -	WARN_ON_ONCE(ret);
> +	if (ret)
> +		dio_warn_stale_pagecache(iocb->ki_filp);
>  	ret = 0;
>  
>  	if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 2690864..0e5f060 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2976,6 +2976,7 @@ enum {
>  };
>  
>  void dio_end_io(struct bio *bio);
> +void dio_warn_stale_pagecache(struct file *filp);
>  
>  ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  			     struct block_device *bdev, struct iov_iter *iter,
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matthew Wilcox Nov. 20, 2017, 4:18 p.m. | #2
On Fri, Nov 17, 2017 at 11:39:25AM -0800, Darrick J. Wong wrote:
> If two programs simultaneously try to write to the same part of a file
> via direct IO and buffered IO, there's a chance that the post-diowrite
> pagecache invalidation will fail on the dirty page.  When this happens,
> the dio write succeeded, which means that the page cache is no longer
> coherent with the disk!

This seems like a good opportunity to talk about what I've been working
on for solving this problem.  The XArray is going to introduce a set
of entries which can be stored to locations in the page cache that I'm
calling 'wait entries'.

When starting direct IO, the page cache will insert a wait entry in each
of the indices covered by the I/O (and do the usual invalidation dance).
At the end of direct IO, it will remove the wait entry and wake up any
threads waiting on those entries.

During lookup, any thread which encounters a wait entry (pagefaults,
buffered reads, buffered writes) will sleep on the indicated wait queue.

I don't know what a direct IO thread should do when encountering a wait
entry.  Fail the I/O?  Sleep like any other lookup would?  Go ahead and
issue the I/O anyway?  (uhm, I see a lot of problems with implementing
that third option.)

I'm currently planning on using 256 'wait entries', one for each
PAGE_WAIT_TABLE_SIZE, and sharing those wait queue heads.  Maybe that's
overengineering the solution.

Anyway, I need to get the basics of the XArray finished off before I do
this, but it'd be great if somebody pointed out why this doesn't work
before I implement it.
Dave Chinner Nov. 20, 2017, 8:26 p.m. | #3
On Mon, Nov 20, 2017 at 08:18:29AM -0800, Matthew Wilcox wrote:
> On Fri, Nov 17, 2017 at 11:39:25AM -0800, Darrick J. Wong wrote:
> > If two programs simultaneously try to write to the same part of a file
> > via direct IO and buffered IO, there's a chance that the post-diowrite
> > pagecache invalidation will fail on the dirty page.  When this happens,
> > the dio write succeeded, which means that the page cache is no longer
> > coherent with the disk!
> 
> This seems like a good opportunity to talk about what I've been working
> on for solving this problem.  The XArray is going to introduce a set
> of entries which can be stored to locations in the page cache that I'm
> calling 'wait entries'.

What's this XArray thing you speak of?

Cheers,

Dave.
Matthew Wilcox Nov. 20, 2017, 9:51 p.m. | #4
On Tue, Nov 21, 2017 at 07:26:06AM +1100, Dave Chinner wrote:
> On Mon, Nov 20, 2017 at 08:18:29AM -0800, Matthew Wilcox wrote:
> > On Fri, Nov 17, 2017 at 11:39:25AM -0800, Darrick J. Wong wrote:
> > > If two programs simultaneously try to write to the same part of a file
> > > via direct IO and buffered IO, there's a chance that the post-diowrite
> > > pagecache invalidation will fail on the dirty page.  When this happens,
> > > the dio write succeeded, which means that the page cache is no longer
> > > coherent with the disk!
> > 
> > This seems like a good opportunity to talk about what I've been working
> > on for solving this problem.  The XArray is going to introduce a set
> > of entries which can be stored to locations in the page cache that I'm
> > calling 'wait entries'.
> 
> What's this XArray thing you speak of?

Ah, right, you were on sabbatical at LSFMM this year where I talked
about it.  Briefly, it's a new API for the radix tree.  The data structure
is essentially unchanged (minor enhancements), but I'm rationalising
existing functionality and adding new abilities.  And getting rid of
misfeatures like the preload API and implicit GFP flags.

My current working tree is here:

http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/xarray-2017-11-20

Ignoring the prep patches, the excitement is all to be found with the
commits which start 'xarray:'

If you want an example of it in use, I'm pretty happy with this patch
that switches the brd driver entirely from the radix tree API to the
xarray API:

http://git.infradead.org/users/willy/linux-dax.git/commitdiff/dbf96ae943e43563cbbaa26e21b656b6fe8f4b0f

I've been pretty liberal with the kernel-doc, but I haven't written out
a good .rst file to give an overview of how to use it.
Dave Chinner Nov. 20, 2017, 10:27 p.m. | #5
On Mon, Nov 20, 2017 at 01:51:00PM -0800, Matthew Wilcox wrote:
> On Tue, Nov 21, 2017 at 07:26:06AM +1100, Dave Chinner wrote:
> > On Mon, Nov 20, 2017 at 08:18:29AM -0800, Matthew Wilcox wrote:
> > > On Fri, Nov 17, 2017 at 11:39:25AM -0800, Darrick J. Wong wrote:
> > > > If two programs simultaneously try to write to the same part of a file
> > > > via direct IO and buffered IO, there's a chance that the post-diowrite
> > > > pagecache invalidation will fail on the dirty page.  When this happens,
> > > > the dio write succeeded, which means that the page cache is no longer
> > > > coherent with the disk!
> > > 
> > > This seems like a good opportunity to talk about what I've been working
> > > on for solving this problem.  The XArray is going to introduce a set
> > > of entries which can be stored to locations in the page cache that I'm
> > > calling 'wait entries'.
> > 
> > What's this XArray thing you speak of?
> 
> Ah, right, you were on sabbatical at LSFMM this year where I talked
> about it.  Briefly, it's a new API for the radix tree.  The data structure
> is essentially unchanged (minor enhancements), but I'm rationalising
> existing functionality and adding new abilities.  And getting rid of
> misfeatures like the preload API and implicit GFP flags.
> 
> My current working tree is here:
> 
> http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/xarray-2017-11-20

First thing I noticed was that "xa" as a prefix is already quite
widely used in XFS - it's shorthand for "XFS AIL". Indeed, xa_lock
already exists and is quite widely used, so having a generic
interface using the same prefixes and lock names is going to be
quite confusing in the XFS code. Especially considering there's
fair bit of radix tree use in XFS (e.g. the internal inode and
dquot caches).

FYI, from fs/xfs/xfs_trans_priv.h:

/*
 * Private AIL structures.
 *
 * Eventually we need to drive the locking in here as well.
 */
struct xfs_ail {
        struct xfs_mount        *xa_mount;
        struct task_struct      *xa_task;
        struct list_head        xa_ail;
        xfs_lsn_t               xa_target;
        xfs_lsn_t               xa_target_prev;
        struct list_head        xa_cursors;
        spinlock_t              xa_lock;
        xfs_lsn_t               xa_last_pushed_lsn;
        int                     xa_log_flush;
        struct list_head        xa_buf_list;
        wait_queue_head_t       xa_empty;
};


> Ignoring the prep patches, the excitement is all to be found with the
> commits which start 'xarray:'

FWIW, why is it named "XArray"?  "X" stands for what?  It still
looks like a tree structure to me, but without a design doc I'm a
bit lost to how it differs to the radix tree (apart from the API)
and why it's considered an "array".

> If you want an example of it in use, I'm pretty happy with this patch
> that switches the brd driver entirely from the radix tree API to the
> xarray API:
> 
> http://git.infradead.org/users/willy/linux-dax.git/commitdiff/dbf96ae943e43563cbbaa26e21b656b6fe8f4b0f

Looks pretty neat, but I'll reserve judgement for when I see the
conversion of the XFS radix tree code....

> I've been pretty liberal with the kernel-doc, but I haven't written out
> a good .rst file to give an overview of how to use it.

Let me know when you've written it :)

Cheers,

Dave.
Darrick J. Wong Nov. 21, 2017, 1:37 a.m. | #6
On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> On Mon, Nov 20, 2017 at 01:51:00PM -0800, Matthew Wilcox wrote:
> > On Tue, Nov 21, 2017 at 07:26:06AM +1100, Dave Chinner wrote:
> > > On Mon, Nov 20, 2017 at 08:18:29AM -0800, Matthew Wilcox wrote:
> > > > On Fri, Nov 17, 2017 at 11:39:25AM -0800, Darrick J. Wong wrote:
> > > > > If two programs simultaneously try to write to the same part of a file
> > > > > via direct IO and buffered IO, there's a chance that the post-diowrite
> > > > > pagecache invalidation will fail on the dirty page.  When this happens,
> > > > > the dio write succeeded, which means that the page cache is no longer
> > > > > coherent with the disk!
> > > > 
> > > > This seems like a good opportunity to talk about what I've been working
> > > > on for solving this problem.  The XArray is going to introduce a set
> > > > of entries which can be stored to locations in the page cache that I'm
> > > > calling 'wait entries'.
> > > 
> > > What's this XArray thing you speak of?
> > 
> > Ah, right, you were on sabbatical at LSFMM this year where I talked
> > about it.  Briefly, it's a new API for the radix tree.  The data structure
> > is essentially unchanged (minor enhancements), but I'm rationalising
> > existing functionality and adding new abilities.  And getting rid of
> > misfeatures like the preload API and implicit GFP flags.
> > 
> > My current working tree is here:
> > 
> > http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/xarray-2017-11-20
> 
> First thing I noticed was that "xa" as a prefix is already quite
> widely used in XFS - it's shorthand for "XFS AIL". Indeed, xa_lock
> already exists and is quite widely used, so having a generic
> interface using the same prefixes and lock names is going to be
> quite confusing in the XFS code. Especially considering there's
> fair bit of radix tree use in XFS (e.g. the internal inode and
> dquot caches).
> 
> FYI, from fs/xfs/xfs_trans_priv.h:
> 
> /*
>  * Private AIL structures.
>  *
>  * Eventually we need to drive the locking in here as well.
>  */
> struct xfs_ail {
>         struct xfs_mount        *xa_mount;
>         struct task_struct      *xa_task;
>         struct list_head        xa_ail;
>         xfs_lsn_t               xa_target;
>         xfs_lsn_t               xa_target_prev;
>         struct list_head        xa_cursors;
>         spinlock_t              xa_lock;
>         xfs_lsn_t               xa_last_pushed_lsn;
>         int                     xa_log_flush;
>         struct list_head        xa_buf_list;
>         wait_queue_head_t       xa_empty;
> };
> 
> 
> > Ignoring the prep patches, the excitement is all to be found with the
> > commits which start 'xarray:'
> 
> FWIW, why is it named "XArray"?  "X" stands for what?  It still
> looks like a tree structure to me, but without a design doc I'm a
> bit lost to how it differs to the radix tree (apart from the API)
> and why it's considered an "array".

/me nominates 'xarr' for the prefix because pirates. :P

--D

> > If you want an example of it in use, I'm pretty happy with this patch
> > that switches the brd driver entirely from the radix tree API to the
> > xarray API:
> > 
> > http://git.infradead.org/users/willy/linux-dax.git/commitdiff/dbf96ae943e43563cbbaa26e21b656b6fe8f4b0f
> 
> Looks pretty neat, but I'll reserve judgement for when I see the
> conversion of the XFS radix tree code....
> 
> > I've been pretty liberal with the kernel-doc, but I haven't written out
> > a good .rst file to give an overview of how to use it.
> 
> Let me know when you've written it :)
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Matthew Wilcox Nov. 21, 2017, 4:32 a.m. | #7
On Mon, Nov 20, 2017 at 05:37:53PM -0800, Darrick J. Wong wrote:
> On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> > First thing I noticed was that "xa" as a prefix is already quite
> > widely used in XFS - it's shorthand for "XFS AIL". Indeed, xa_lock
> > already exists and is quite widely used, so having a generic
> > interface using the same prefixes and lock names is going to be
> > quite confusing in the XFS code. Especially considering there's
> > fair bit of radix tree use in XFS (e.g. the internal inode and
> > dquot caches).
> > 
> > FYI, from fs/xfs/xfs_trans_priv.h:
> > 
> > /*
> >  * Private AIL structures.
> >  *
> >  * Eventually we need to drive the locking in here as well.
> >  */
> > struct xfs_ail {
> >         struct xfs_mount        *xa_mount;
> >         struct task_struct      *xa_task;
> >         struct list_head        xa_ail;
> >         xfs_lsn_t               xa_target;
> >         xfs_lsn_t               xa_target_prev;
> >         struct list_head        xa_cursors;
> >         spinlock_t              xa_lock;
> >         xfs_lsn_t               xa_last_pushed_lsn;
> >         int                     xa_log_flush;
> >         struct list_head        xa_buf_list;
> >         wait_queue_head_t       xa_empty;
> > };
> > 
> > FWIW, why is it named "XArray"?  "X" stands for what?  It still
> > looks like a tree structure to me, but without a design doc I'm a
> > bit lost to how it differs to the radix tree (apart from the API)
> > and why it's considered an "array".
> 
> /me nominates 'xarr' for the prefix because pirates. :P

The X stands for 'eXpandable' or 'eXtending'.  I really don't want to
use more than a two-letter acronym for whatever the XArray ends up being
called.  One of the problems with the radix tree is that everything has
that 11-character 'radix_tree_' prefix; just replacing that with 'xa_'
makes a huge improvement to readability.

I'm open to renaming it, but I want a good name.  I think it needs to
be *something* array, so we're looking for a prefix used nowhere in the
kernel.  Running 'git grep' in turn for '\<[a-z]a_' really only allows
for ja, ya and za.  'ja_' and 'ya_' only have one user, while 'za_'
has none.  So ... anyone got a good retcon for JArray, YArray or ZArray?

It's considered an array because it behaves "as if" it's an infinite
array.  The fact that we chop it up into chunks of 64 entries, and then
arrange those chunks into a tree is not something the average user needs
to concern themselves with.  We have a lot of places in the kernel that
fuss around with "allocate N entries, whoops, we exceeded N, kmalloc some
more, oh no kmalloc failed, vmalloc it instead".  So the idea is to give
these places an interface that acts like an automatically-resizing array.

One of the ways the radix tree fails to act like an array is returning
EEXIST on an insert operation.  Most users don't want that.  Arrays don't
behave like that; you just overwrite whatever's there.  A few users do
want to know whether something was already there (eg the brd example
I showed), but those users generally want to know what was already
there, not just 'something was'.  So this is why we have xa_store()
and xa_cmpxchg() as our higher-level primitives.
Dave Chinner Nov. 21, 2017, 6:48 a.m. | #8
On Mon, Nov 20, 2017 at 08:32:40PM -0800, Matthew Wilcox wrote:
> On Mon, Nov 20, 2017 at 05:37:53PM -0800, Darrick J. Wong wrote:
> > On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> > > First thing I noticed was that "xa" as a prefix is already quite
> > > widely used in XFS - it's shorthand for "XFS AIL". Indeed, xa_lock
> > > already exists and is quite widely used, so having a generic
> > > interface using the same prefixes and lock names is going to be
> > > quite confusing in the XFS code. Especially considering there's
> > > fair bit of radix tree use in XFS (e.g. the internal inode and
> > > dquot caches).
> > > 
> > > FYI, from fs/xfs/xfs_trans_priv.h:
> > > 
> > > /*
> > >  * Private AIL structures.
> > >  *
> > >  * Eventually we need to drive the locking in here as well.
> > >  */
> > > struct xfs_ail {
> > >         struct xfs_mount        *xa_mount;
> > >         struct task_struct      *xa_task;
> > >         struct list_head        xa_ail;
> > >         xfs_lsn_t               xa_target;
> > >         xfs_lsn_t               xa_target_prev;
> > >         struct list_head        xa_cursors;
> > >         spinlock_t              xa_lock;
> > >         xfs_lsn_t               xa_last_pushed_lsn;
> > >         int                     xa_log_flush;
> > >         struct list_head        xa_buf_list;
> > >         wait_queue_head_t       xa_empty;
> > > };
> > > 
> > > FWIW, why is it named "XArray"?  "X" stands for what?  It still
> > > looks like a tree structure to me, but without a design doc I'm a
> > > bit lost to how it differs to the radix tree (apart from the API)
> > > and why it's considered an "array".
> > 
> > /me nominates 'xarr' for the prefix because pirates. :P
> 
> The X stands for 'eXpandable' or 'eXtending'.  I really don't want to
> use more than a two-letter acronym for whatever the XArray ends up being
> called.  One of the problems with the radix tree is that everything has
> that 11-character 'radix_tree_' prefix; just replacing that with 'xa_'
> makes a huge improvement to readability.

Yeah, understood. That's why
we have very little clear
prefix namespace left.... :/

[ somedays I write something that looks sorta like a haiku, and from
that point on everything just starts falling out of my brain that
way. I blame Eric for this today! :P ]

> I'm open to renaming it, but I want a good name.  I think it needs to
> be *something* array, so we're looking for a prefix used nowhere in the
> kernel.  Running 'git grep' in turn for '\<[a-z]a_' really only allows
> for ja, ya and za.  'ja_' and 'ya_' only have one user, while 'za_'
> has none.  So ... anyone got a good retcon for JArray, YArray or ZArray?

A Yellow Array.
Colour is irrelevant.
The bikeshed looks nice.

> It's considered an array because it behaves "as if" it's an infinite
> array.

Infinite Array.
Namespace prefix collision
this haiku can't solve.

> The fact that we chop it up into chunks of 64 entries, and then
> arrange those chunks into a tree is not something the average user needs
> to concern themselves with.

Jumbo Array. Many
pieces hidden behind walls.
Will anyone notice?

> We have a lot of places in the kernel that
> fuss around with "allocate N entries, whoops, we exceeded N, kmalloc some
> more, oh no kmalloc failed, vmalloc it instead".  So the idea is to give
> these places an interface that acts like an automatically-resizing array.

Zoetrope Array.
Labyrinth of illusion.
Structure never ends.

Cheers,

Dave.
Matthew Wilcox Nov. 21, 2017, 12:52 p.m. | #9
On Tue, Nov 21, 2017 at 05:48:15PM +1100, Dave Chinner wrote:
> On Mon, Nov 20, 2017 at 08:32:40PM -0800, Matthew Wilcox wrote:
> > On Mon, Nov 20, 2017 at 05:37:53PM -0800, Darrick J. Wong wrote:
> > > On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> > > > First thing I noticed was that "xa" as a prefix is already quite
> > > > widely used in XFS - it's shorthand for "XFS AIL". Indeed, xa_lock
> >
> > The X stands for 'eXpandable' or 'eXtending'.  I really don't want to
> > use more than a two-letter acronym for whatever the XArray ends up being
> > called.  One of the problems with the radix tree is that everything has
> > that 11-character 'radix_tree_' prefix; just replacing that with 'xa_'
> > makes a huge improvement to readability.
> 
> Yeah, understood. That's why
> we have very little clear
> prefix namespace left.... :/
> 
> [ somedays I write something that looks sorta like a haiku, and from
> that point on everything just starts falling out of my brain that
> way. I blame Eric for this today! :P ]

When the namespace is
tight we must consider the
competing users.

The earliest us'r
has a claim to a prefix
we are used to it.

Also a more wide-
spread user has a claim to
a shorter prefix.

Would you mind changing
your prefix to one only
one letter longer?

... ok, I give up ;-)

All your current usage of the xa_ prefix looks somewhat like this:

fs/xfs/xfs_trans_ail.c: spin_lock(&ailp->xa_lock);

with honourable mentions to:
fs/xfs/xfs_log.c:	spin_lock(&mp->m_ail->xa_lock);

Would you mind if I bolt a patch on to the front of the series called
something like "free up xa_ namespace" that renamed your xa_* to ail_*?
There are no uses of the 'ail_' prefix in the kernel today.

I don't think that
	spin_lock(&ailp->ail_lock);
loses any readability.

By the way, what does AIL stand for?  It'd be nice if it were spelled out
in at least one of the header files, maybe fs/xfs/xfs_trans_priv.h?

> Zoetrope Array.
> Labyrinth of illusion.
> Structure never ends.

Thank you for making me look up zoetrope ;-)
Matthew Wilcox Nov. 21, 2017, 5:23 p.m. | #10
On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> On Mon, Nov 20, 2017 at 01:51:00PM -0800, Matthew Wilcox wrote:
> > If you want an example of it in use, I'm pretty happy with this patch
> > that switches the brd driver entirely from the radix tree API to the
> > xarray API:
> > 
> > http://git.infradead.org/users/willy/linux-dax.git/commitdiff/dbf96ae943e43563cbbaa26e21b656b6fe8f4b0f
> 
> Looks pretty neat, but I'll reserve judgement for when I see the
> conversion of the XFS radix tree code....

Challenge accepted.  This was a good thing for me to do because I found
some opportunities to improve the XArray API.

Changes since yesterday:

 - Added XA_NO_TAG
 - Added 'max' parameters to xa_get_entries() and xa_get_tagged()
 - Changed the order of the arguments of xa_get_entries() and xa_get_tagged()
   to match the radix tree equivalents

You can see the patches on that (rebased) branch:

xfs: Convert mru cache to XArray
xfs: Convert xfs dquot to XArray
xfs: Convert pag_ici_root to XArray
xfs: Convert m_perag_tree to XArray

or did you want me to send them by email?
Darrick J. Wong Nov. 21, 2017, 6:53 p.m. | #11
On Tue, Nov 21, 2017 at 09:23:47AM -0800, Matthew Wilcox wrote:
> On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> > On Mon, Nov 20, 2017 at 01:51:00PM -0800, Matthew Wilcox wrote:
> > > If you want an example of it in use, I'm pretty happy with this patch
> > > that switches the brd driver entirely from the radix tree API to the
> > > xarray API:
> > > 
> > > http://git.infradead.org/users/willy/linux-dax.git/commitdiff/dbf96ae943e43563cbbaa26e21b656b6fe8f4b0f
> > 
> > Looks pretty neat, but I'll reserve judgement for when I see the
> > conversion of the XFS radix tree code....
> 
> Challenge accepted.  This was a good thing for me to do because I found
> some opportunities to improve the XArray API.
> 
> Changes since yesterday:
> 
>  - Added XA_NO_TAG
>  - Added 'max' parameters to xa_get_entries() and xa_get_tagged()
>  - Changed the order of the arguments of xa_get_entries() and xa_get_tagged()
>    to match the radix tree equivalents
> 
> You can see the patches on that (rebased) branch:
> 
> xfs: Convert mru cache to XArray
> xfs: Convert xfs dquot to XArray
> xfs: Convert pag_ici_root to XArray
> xfs: Convert m_perag_tree to XArray
> 
> or did you want me to send them by email?

Yes please.

--D

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Nov. 21, 2017, 10:28 p.m. | #12
On Tue, Nov 21, 2017 at 04:52:53AM -0800, Matthew Wilcox wrote:
> On Tue, Nov 21, 2017 at 05:48:15PM +1100, Dave Chinner wrote:
> > On Mon, Nov 20, 2017 at 08:32:40PM -0800, Matthew Wilcox wrote:
> > > On Mon, Nov 20, 2017 at 05:37:53PM -0800, Darrick J. Wong wrote:
> > > > On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> > > > > First thing I noticed was that "xa" as a prefix is already quite
> > > > > widely used in XFS - it's shorthand for "XFS AIL". Indeed, xa_lock
> > >
> > > The X stands for 'eXpandable' or 'eXtending'.  I really don't want to
> > > use more than a two-letter acronym for whatever the XArray ends up being
> > > called.  One of the problems with the radix tree is that everything has
> > > that 11-character 'radix_tree_' prefix; just replacing that with 'xa_'
> > > makes a huge improvement to readability.
> > 
> > Yeah, understood. That's why
> > we have very little clear
> > prefix namespace left.... :/
> > 
> > [ somedays I write something that looks sorta like a haiku, and from
> > that point on everything just starts falling out of my brain that
> > way. I blame Eric for this today! :P ]
> 
> When the namespace is
> tight we must consider the
> competing users.
> 
> The earliest us'r
> has a claim to a prefix
> we are used to it.
> 
> Also a more wide-
> spread user has a claim to
> a shorter prefix.
> 
> Would you mind changing
> your prefix to one only
> one letter longer?

We can do something
like that, though Darrick has the
final say in it.

> ... ok, I give up ;-)

Top marks for effort :)

> All your current usage of the xa_ prefix looks somewhat like this:
> 
> fs/xfs/xfs_trans_ail.c: spin_lock(&ailp->xa_lock);
> 
> with honourable mentions to:
> fs/xfs/xfs_log.c:	spin_lock(&mp->m_ail->xa_lock);
> 
> Would you mind if I bolt a patch on to the front of the series called
> something like "free up xa_ namespace" that renamed your xa_* to ail_*?
> There are no uses of the 'ail_' prefix in the kernel today.
> 
> I don't think that
> 	spin_lock(&ailp->ail_lock);
> loses any readability.

Not sure that's going to work - there's an "xa_ail" member for the
AIL list head. That would now read in places:

	if (list_empty(&ailp->ail_ail))

I'd be inclined to just drop the "xa_" prefix from the XFS
structure.  There is no information loss by removing the prefix in
the XFS code because the pointer name tells us what structure it is
pointing at.

> 
> By the way, what does AIL stand for?  It'd be nice if it were spelled out
> in at least one of the header files, maybe fs/xfs/xfs_trans_priv.h?

Active Item List. See the section "Tracking Changes" in
Documentation/filesystems/xfs-delayed-logging-design.txt for the
full rundown, but in short:

"The log item is already used to track the log items that have been
written to the log but not yet written to disk. Such log items are
considered "active" and as such are stored in the Active Item List
(AIL) which is a LSN-ordered double linked list. Items are inserted
into this list during log buffer IO completion, after which they are
unpinned and can be written to disk."

The lack of comments describing the AIL is historic - it's never
been documented in the code, nor has the whole relogging concept it
implements. I wrote the document above when introducing the CIL
(Commited Item List) because almost no-one actively working on XFS
understood how the whole journalling subsystem worked in any detail.

> > Zoetrope Array.
> > Labyrinth of illusion.
> > Structure never ends.
> 
> Thank you for making me look up zoetrope ;-)

My pleasure :)

Cheers,

Dave.
Darrick J. Wong Nov. 22, 2017, 12:05 a.m. | #13
On Wed, Nov 22, 2017 at 09:28:06AM +1100, Dave Chinner wrote:
> On Tue, Nov 21, 2017 at 04:52:53AM -0800, Matthew Wilcox wrote:
> > On Tue, Nov 21, 2017 at 05:48:15PM +1100, Dave Chinner wrote:
> > > On Mon, Nov 20, 2017 at 08:32:40PM -0800, Matthew Wilcox wrote:
> > > > On Mon, Nov 20, 2017 at 05:37:53PM -0800, Darrick J. Wong wrote:
> > > > > On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> > > > > > First thing I noticed was that "xa" as a prefix is already quite
> > > > > > widely used in XFS - it's shorthand for "XFS AIL". Indeed, xa_lock
> > > >
> > > > The X stands for 'eXpandable' or 'eXtending'.  I really don't want to
> > > > use more than a two-letter acronym for whatever the XArray ends up being
> > > > called.  One of the problems with the radix tree is that everything has
> > > > that 11-character 'radix_tree_' prefix; just replacing that with 'xa_'
> > > > makes a huge improvement to readability.
> > > 
> > > Yeah, understood. That's why
> > > we have very little clear
> > > prefix namespace left.... :/
> > > 
> > > [ somedays I write something that looks sorta like a haiku, and from
> > > that point on everything just starts falling out of my brain that
> > > way. I blame Eric for this today! :P ]
> > 
> > When the namespace is
> > tight we must consider the
> > competing users.
> > 
> > The earliest us'r
> > has a claim to a prefix
> > we are used to it.
> > 
> > Also a more wide-
> > spread user has a claim to
> > a shorter prefix.
> > 
> > Would you mind changing
> > your prefix to one only
> > one letter longer?
> 
> We can do something
> like that, though Darrick has the
> final say in it.

Keep this up and soon
I'll require patch changelogs
Written in Haiku. :P

(j/k)

Everyone in the US, have a happy Thanksgiving!

--D

> > ... ok, I give up ;-)
> 
> Top marks for effort :)
> 
> > All your current usage of the xa_ prefix looks somewhat like this:
> > 
> > fs/xfs/xfs_trans_ail.c: spin_lock(&ailp->xa_lock);
> > 
> > with honourable mentions to:
> > fs/xfs/xfs_log.c:	spin_lock(&mp->m_ail->xa_lock);
> > 
> > Would you mind if I bolt a patch on to the front of the series called
> > something like "free up xa_ namespace" that renamed your xa_* to ail_*?
> > There are no uses of the 'ail_' prefix in the kernel today.
> > 
> > I don't think that
> > 	spin_lock(&ailp->ail_lock);
> > loses any readability.
> 
> Not sure that's going to work - there's an "xa_ail" member for the
> AIL list head. That would now read in places:
> 
> 	if (list_empty(&ailp->ail_ail))
> 
> I'd be inclined to just drop the "xa_" prefix from the XFS
> structure.  There is no information loss by removing the prefix in
> the XFS code because the pointer name tells us what structure it is
> pointing at.
> 
> > 
> > By the way, what does AIL stand for?  It'd be nice if it were spelled out
> > in at least one of the header files, maybe fs/xfs/xfs_trans_priv.h?
> 
> Active Item List. See the section "Tracking Changes" in
> Documentation/filesystems/xfs-delayed-logging-design.txt for the
> full rundown, but in short:
> 
> "The log item is already used to track the log items that have been
> written to the log but not yet written to disk. Such log items are
> considered "active" and as such are stored in the Active Item List
> (AIL) which is a LSN-ordered double linked list. Items are inserted
> into this list during log buffer IO completion, after which they are
> unpinned and can be written to disk."
> 
> The lack of comments describing the AIL is historic - it's never
> been documented in the code, nor has the whole relogging concept it
> implements. I wrote the document above when introducing the CIL
> (Commited Item List) because almost no-one actively working on XFS
> understood how the whole journalling subsystem worked in any detail.
> 
> > > Zoetrope Array.
> > > Labyrinth of illusion.
> > > Structure never ends.
> > 
> > Thank you for making me look up zoetrope ;-)
> 
> My pleasure :)
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 98fe132..ef5d12a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -219,6 +219,27 @@  static inline struct page *dio_get_page(struct dio *dio,
 	return dio->pages[sdio->head];
 }
 
+/*
+ * Warn about a page cache invalidation failure during a direct io write.
+ */
+void dio_warn_stale_pagecache(struct file *filp)
+{
+	static DEFINE_RATELIMIT_STATE(_rs, 30 * HZ, DEFAULT_RATELIMIT_BURST);
+	char pathname[128];
+	struct inode *inode = file_inode(filp);
+	char *path;
+
+	errseq_set(&inode->i_mapping->wb_err, -EIO);
+	if (__ratelimit(&_rs)) {
+		path = file_path(filp, pathname, sizeof(pathname));
+		if (IS_ERR(path))
+			path = "(unknown)";
+		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
+		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
+			current->comm);
+	}
+}
+
 /**
  * dio_complete() - called when all DIO BIO I/O has been completed
  * @offset: the byte offset in the file of the completed operation
@@ -290,7 +311,8 @@  static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
 					offset >> PAGE_SHIFT,
 					(offset + ret - 1) >> PAGE_SHIFT);
-		WARN_ON_ONCE(err);
+		if (err)
+			dio_warn_stale_pagecache(dio->iocb->ki_filp);
 	}
 
 	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
diff --git a/fs/iomap.c b/fs/iomap.c
index 5011a96..028f329 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -753,7 +753,8 @@  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 		err = invalidate_inode_pages2_range(inode->i_mapping,
 				offset >> PAGE_SHIFT,
 				(offset + dio->size - 1) >> PAGE_SHIFT);
-		WARN_ON_ONCE(err);
+		if (err)
+			dio_warn_stale_pagecache(iocb->ki_filp);
 	}
 
 	inode_dio_end(file_inode(iocb->ki_filp));
@@ -1012,9 +1013,16 @@  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (ret)
 		goto out_free_dio;
 
+	/*
+	 * Try to invalidate cache pages for the range we're direct
+	 * writing.  If this invalidation fails, tough, the write will
+	 * still work, but racing two incompatible write paths is a
+	 * pretty crazy thing to do, so we don't support it 100%.
+	 */
 	ret = invalidate_inode_pages2_range(mapping,
 			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-	WARN_ON_ONCE(ret);
+	if (ret)
+		dio_warn_stale_pagecache(iocb->ki_filp);
 	ret = 0;
 
 	if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2690864..0e5f060 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2976,6 +2976,7 @@  enum {
 };
 
 void dio_end_io(struct bio *bio);
+void dio_warn_stale_pagecache(struct file *filp);
 
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 			     struct block_device *bdev, struct iov_iter *iter,