Patchwork [v3,2/5] fs: Add inode_update_time_writable

login
register
mail settings
Submitter Andrew Lutomirski
Date Aug. 16, 2013, 11:22 p.m.
Message ID <a27accc2d9460b7ef194a203f305a18dafe926e8.1376679411.git.luto@amacapital.net>
Download mbox | patch
Permalink /patch/267937/
State Superseded
Headers show

Comments

Andrew Lutomirski - Aug. 16, 2013, 11:22 p.m.
This is like file_update_time, except that it acts on a struct inode *
instead of a struct file *.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 fs/inode.c         | 72 ++++++++++++++++++++++++++++++++++++++++++------------
 include/linux/fs.h |  1 +
 2 files changed, 58 insertions(+), 15 deletions(-)
Dave Chinner - Aug. 20, 2013, 2:28 a.m.
On Fri, Aug 16, 2013 at 04:22:09PM -0700, Andy Lutomirski wrote:
> This is like file_update_time, except that it acts on a struct inode *
> instead of a struct file *.
> 
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> ---
>  fs/inode.c         | 72 ++++++++++++++++++++++++++++++++++++++++++------------
>  include/linux/fs.h |  1 +
>  2 files changed, 58 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index d6dfb09..bc90c12 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -1637,6 +1637,34 @@ int file_remove_suid(struct file *file)
>  }
>  EXPORT_SYMBOL(file_remove_suid);
>  
> +/*
> + * This does the work that's common to file_update_time and
> + * inode_update_time.
> + */
> +static int prepare_update_cmtime(struct inode *inode, struct timespec *now)
> +{
> +	int sync_it;
> +
> +	/* First try to exhaust all avenues to not sync */
> +	if (IS_NOCMTIME(inode))
> +		return 0;
> +
> +	*now = current_fs_time(inode->i_sb);
> +	if (!timespec_equal(&inode->i_mtime, now))
> +		sync_it = S_MTIME;
> +
> +	if (!timespec_equal(&inode->i_ctime, now))
> +		sync_it |= S_CTIME;
> +
> +	if (IS_I_VERSION(inode))
> +		sync_it |= S_VERSION;
> +
> +	if (!sync_it)
> +		return 0;
> +
> +	return sync_it;
> +}
> +
>  /**
>   *	file_update_time	-	update mtime and ctime time
>   *	@file: file accessed
> @@ -1654,23 +1682,9 @@ int file_update_time(struct file *file)
>  {
>  	struct inode *inode = file_inode(file);
>  	struct timespec now;
> -	int sync_it = 0;
> +	int sync_it = prepare_update_cmtime(inode, &now);
>  	int ret;
>  
> -	/* First try to exhaust all avenues to not sync */
> -	if (IS_NOCMTIME(inode))
> -		return 0;
> -
> -	now = current_fs_time(inode->i_sb);
> -	if (!timespec_equal(&inode->i_mtime, &now))
> -		sync_it = S_MTIME;
> -
> -	if (!timespec_equal(&inode->i_ctime, &now))
> -		sync_it |= S_CTIME;
> -
> -	if (IS_I_VERSION(inode))
> -		sync_it |= S_VERSION;
> -
>  	if (!sync_it)
>  		return 0;
>  
> @@ -1685,6 +1699,34 @@ int file_update_time(struct file *file)
>  }
>  EXPORT_SYMBOL(file_update_time);
>  
> +/**
> + *	inode_update_time_writable	-	update mtime and ctime time
> + *	@inode: inode accessed
> + *
> + *	This is like file_update_time, but it assumes the mnt is writable
> + *	and takes an inode parameter instead.  (We need to assume the mnt
> + *	was writable because inodes aren't associated with any particular
> + *	mnt.
> + */
> +
> +int inode_update_time_writable(struct inode *inode)
> +{
> +	struct timespec now;
> +	int sync_it = prepare_update_cmtime(inode, &now);
> +	int ret;
> +
> +	if (!sync_it)
> +		return 0;
> +
> +	/* sb_start_pagefault and update_time can both sleep. */
> +	sb_start_pagefault(inode->i_sb);
> +	ret = update_time(inode, &now, sync_it);
> +	sb_end_pagefault(inode->i_sb);

This gets called from the writeback path - you can't use
sb_start_pagefault/sb_end_pagefault in that path.

Cheers,

Dave.
Andrew Lutomirski - Aug. 20, 2013, 3:20 a.m.
On Mon, Aug 19, 2013 at 7:28 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Fri, Aug 16, 2013 at 04:22:09PM -0700, Andy Lutomirski wrote:
>> This is like file_update_time, except that it acts on a struct inode *
>> instead of a struct file *.
>>
>> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
>> ---
>>  fs/inode.c         | 72 ++++++++++++++++++++++++++++++++++++++++++------------
>>  include/linux/fs.h |  1 +
>>  2 files changed, 58 insertions(+), 15 deletions(-)
>>

[...]

>> +
>> +int inode_update_time_writable(struct inode *inode)
>> +{
>> +     struct timespec now;
>> +     int sync_it = prepare_update_cmtime(inode, &now);
>> +     int ret;
>> +
>> +     if (!sync_it)
>> +             return 0;
>> +
>> +     /* sb_start_pagefault and update_time can both sleep. */
>> +     sb_start_pagefault(inode->i_sb);
>> +     ret = update_time(inode, &now, sync_it);
>> +     sb_end_pagefault(inode->i_sb);
>
> This gets called from the writeback path - you can't use
> sb_start_pagefault/sb_end_pagefault in that path.

The race I'm worried about is:

 - mmap
 - write to the mapping
 - remount ro
 - flush_cmtime -> inode_update_time_writable

This may be impossible, in which case I'm okay, but it's nice to have
a sanity check.  I'll see if I can figure out how to do that.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner - Aug. 20, 2013, 3:33 a.m.
On Mon, Aug 19, 2013 at 08:20:12PM -0700, Andy Lutomirski wrote:
> On Mon, Aug 19, 2013 at 7:28 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Fri, Aug 16, 2013 at 04:22:09PM -0700, Andy Lutomirski wrote:
> >> This is like file_update_time, except that it acts on a struct inode *
> >> instead of a struct file *.
> >>
> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> >> ---
> >>  fs/inode.c         | 72 ++++++++++++++++++++++++++++++++++++++++++------------
> >>  include/linux/fs.h |  1 +
> >>  2 files changed, 58 insertions(+), 15 deletions(-)
> >>
> 
> [...]
> 
> >> +
> >> +int inode_update_time_writable(struct inode *inode)
> >> +{
> >> +     struct timespec now;
> >> +     int sync_it = prepare_update_cmtime(inode, &now);
> >> +     int ret;
> >> +
> >> +     if (!sync_it)
> >> +             return 0;
> >> +
> >> +     /* sb_start_pagefault and update_time can both sleep. */
> >> +     sb_start_pagefault(inode->i_sb);
> >> +     ret = update_time(inode, &now, sync_it);
> >> +     sb_end_pagefault(inode->i_sb);
> >
> > This gets called from the writeback path - you can't use
> > sb_start_pagefault/sb_end_pagefault in that path.
> 
> The race I'm worried about is:
> 
>  - mmap
>  - write to the mapping
>  - remount ro
>  - flush_cmtime -> inode_update_time_writable

sb_start_pagefault() is for filesystem freeze protection, not
remount-ro protection. If you freeze the filesystem, then we stop
writes and pagefaults by making sb_start_pagefault/sb_start_write
block, and then run writeback to clean all the pages.  If writeback
then blocks on sb_start_pagefault(), we've got a deadlock.

> This may be impossible, in which case I'm okay, but it's nice to have
> a sanity check.  I'll see if I can figure out how to do that.

The process of remount-ro should flush the dirty pages - the inode
and page has been marked dirty by page_mkwrite(), after all.

Cheers,

Dave.
Andrew Lutomirski - Aug. 20, 2013, 4:07 a.m.
On Mon, Aug 19, 2013 at 8:33 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Mon, Aug 19, 2013 at 08:20:12PM -0700, Andy Lutomirski wrote:
>> On Mon, Aug 19, 2013 at 7:28 PM, Dave Chinner <david@fromorbit.com> wrote:
>> > On Fri, Aug 16, 2013 at 04:22:09PM -0700, Andy Lutomirski wrote:
>> >> This is like file_update_time, except that it acts on a struct inode *
>> >> instead of a struct file *.
>> >>
>> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
>> >> ---
>> >>  fs/inode.c         | 72 ++++++++++++++++++++++++++++++++++++++++++------------
>> >>  include/linux/fs.h |  1 +
>> >>  2 files changed, 58 insertions(+), 15 deletions(-)
>> >>
>>
>> [...]
>>
>> >> +
>> >> +int inode_update_time_writable(struct inode *inode)
>> >> +{
>> >> +     struct timespec now;
>> >> +     int sync_it = prepare_update_cmtime(inode, &now);
>> >> +     int ret;
>> >> +
>> >> +     if (!sync_it)
>> >> +             return 0;
>> >> +
>> >> +     /* sb_start_pagefault and update_time can both sleep. */
>> >> +     sb_start_pagefault(inode->i_sb);
>> >> +     ret = update_time(inode, &now, sync_it);
>> >> +     sb_end_pagefault(inode->i_sb);
>> >
>> > This gets called from the writeback path - you can't use
>> > sb_start_pagefault/sb_end_pagefault in that path.
>>
>> The race I'm worried about is:
>>
>>  - mmap
>>  - write to the mapping
>>  - remount ro
>>  - flush_cmtime -> inode_update_time_writable
>
> sb_start_pagefault() is for filesystem freeze protection, not
> remount-ro protection. If you freeze the filesystem, then we stop
> writes and pagefaults by making sb_start_pagefault/sb_start_write
> block, and then run writeback to clean all the pages.  If writeback
> then blocks on sb_start_pagefault(), we've got a deadlock.
>
>> This may be impossible, in which case I'm okay, but it's nice to have
>> a sanity check.  I'll see if I can figure out how to do that.
>
> The process of remount-ro should flush the dirty pages - the inode
> and page has been marked dirty by page_mkwrite(), after all.

Hmm.  We can land in here from writeback, in which case the time
should be updated unconditionally.  We can also land in here from
msync(MS_ASYNC) or munmap.  munmap at least shouldn't block.

The nasty case is if a page is dirtied, then the frozen level is set
to SB_FREEZE_PAGEFAULT, and then userspace calls munmap or msync
*before* writepages gets called.  In this case, blocking until the fs
is unfrozen is probably impolite, and returning without updating the
time is questionable.

Removing the check entirely may add a new race, though: what if
.flush_cmtime has called mapping_test_clear_cmtime but hasn't gotten
to updating the time yet when freezing finishes?  This could be
prevented by changing generic_flush_cmtime to do __sb_start_write(sb,
SB_FREEZE_FS, false) and doing nothing if the fs is already frozen.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Aug. 20, 2013, 4:10 p.m.
On Mon 19-08-13 21:07:49, Andy Lutomirski wrote:
> On Mon, Aug 19, 2013 at 8:33 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Mon, Aug 19, 2013 at 08:20:12PM -0700, Andy Lutomirski wrote:
> >> On Mon, Aug 19, 2013 at 7:28 PM, Dave Chinner <david@fromorbit.com> wrote:
> >> > On Fri, Aug 16, 2013 at 04:22:09PM -0700, Andy Lutomirski wrote:
> >> >> This is like file_update_time, except that it acts on a struct inode *
> >> >> instead of a struct file *.
> >> >>
> >> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> >> >> ---
> >> >>  fs/inode.c         | 72 ++++++++++++++++++++++++++++++++++++++++++------------
> >> >>  include/linux/fs.h |  1 +
> >> >>  2 files changed, 58 insertions(+), 15 deletions(-)
> >> >>
> >>
> >> [...]
> >>
> >> >> +
> >> >> +int inode_update_time_writable(struct inode *inode)
> >> >> +{
> >> >> +     struct timespec now;
> >> >> +     int sync_it = prepare_update_cmtime(inode, &now);
> >> >> +     int ret;
> >> >> +
> >> >> +     if (!sync_it)
> >> >> +             return 0;
> >> >> +
> >> >> +     /* sb_start_pagefault and update_time can both sleep. */
> >> >> +     sb_start_pagefault(inode->i_sb);
> >> >> +     ret = update_time(inode, &now, sync_it);
> >> >> +     sb_end_pagefault(inode->i_sb);
> >> >
> >> > This gets called from the writeback path - you can't use
> >> > sb_start_pagefault/sb_end_pagefault in that path.
> >>
> >> The race I'm worried about is:
> >>
> >>  - mmap
> >>  - write to the mapping
> >>  - remount ro
> >>  - flush_cmtime -> inode_update_time_writable
> >
> > sb_start_pagefault() is for filesystem freeze protection, not
> > remount-ro protection. If you freeze the filesystem, then we stop
> > writes and pagefaults by making sb_start_pagefault/sb_start_write
> > block, and then run writeback to clean all the pages.  If writeback
> > then blocks on sb_start_pagefault(), we've got a deadlock.
> >
> >> This may be impossible, in which case I'm okay, but it's nice to have
> >> a sanity check.  I'll see if I can figure out how to do that.
> >
> > The process of remount-ro should flush the dirty pages - the inode
> > and page has been marked dirty by page_mkwrite(), after all.
> 
> Hmm.  We can land in here from writeback, in which case the time
> should be updated unconditionally.  We can also land in here from
> msync(MS_ASYNC) or munmap.  munmap at least shouldn't block.
> 
> The nasty case is if a page is dirtied, then the frozen level is set
> to SB_FREEZE_PAGEFAULT, and then userspace calls munmap or msync
> *before* writepages gets called.  In this case, blocking until the fs
> is unfrozen is probably impolite, and returning without updating the
> time is questionable.
> 
> Removing the check entirely may add a new race, though: what if
> .flush_cmtime has called mapping_test_clear_cmtime but hasn't gotten
> to updating the time yet when freezing finishes?  This could be
> prevented by changing generic_flush_cmtime to do __sb_start_write(sb,
> SB_FREEZE_FS, false) and doing nothing if the fs is already frozen.
  I think it should be a responsibility of the caller of .flush_cmtime (as
is the case of update_time()) to handle freeze protection if needed. As
Dave told you, writeback path mustn't actually take it. OTOH things like
munmap() or msync() need to get it because we must avoid changing the
filesystem when it is frozen and these calls can happen when fs is frozen.

								Honza

Patch

diff --git a/fs/inode.c b/fs/inode.c
index d6dfb09..bc90c12 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1637,6 +1637,34 @@  int file_remove_suid(struct file *file)
 }
 EXPORT_SYMBOL(file_remove_suid);
 
+/*
+ * This does the work that's common to file_update_time and
+ * inode_update_time.
+ */
+static int prepare_update_cmtime(struct inode *inode, struct timespec *now)
+{
+	int sync_it;
+
+	/* First try to exhaust all avenues to not sync */
+	if (IS_NOCMTIME(inode))
+		return 0;
+
+	*now = current_fs_time(inode->i_sb);
+	if (!timespec_equal(&inode->i_mtime, now))
+		sync_it = S_MTIME;
+
+	if (!timespec_equal(&inode->i_ctime, now))
+		sync_it |= S_CTIME;
+
+	if (IS_I_VERSION(inode))
+		sync_it |= S_VERSION;
+
+	if (!sync_it)
+		return 0;
+
+	return sync_it;
+}
+
 /**
  *	file_update_time	-	update mtime and ctime time
  *	@file: file accessed
@@ -1654,23 +1682,9 @@  int file_update_time(struct file *file)
 {
 	struct inode *inode = file_inode(file);
 	struct timespec now;
-	int sync_it = 0;
+	int sync_it = prepare_update_cmtime(inode, &now);
 	int ret;
 
-	/* First try to exhaust all avenues to not sync */
-	if (IS_NOCMTIME(inode))
-		return 0;
-
-	now = current_fs_time(inode->i_sb);
-	if (!timespec_equal(&inode->i_mtime, &now))
-		sync_it = S_MTIME;
-
-	if (!timespec_equal(&inode->i_ctime, &now))
-		sync_it |= S_CTIME;
-
-	if (IS_I_VERSION(inode))
-		sync_it |= S_VERSION;
-
 	if (!sync_it)
 		return 0;
 
@@ -1685,6 +1699,34 @@  int file_update_time(struct file *file)
 }
 EXPORT_SYMBOL(file_update_time);
 
+/**
+ *	inode_update_time_writable	-	update mtime and ctime time
+ *	@inode: inode accessed
+ *
+ *	This is like file_update_time, but it assumes the mnt is writable
+ *	and takes an inode parameter instead.  (We need to assume the mnt
+ *	was writable because inodes aren't associated with any particular
+ *	mnt.
+ */
+
+int inode_update_time_writable(struct inode *inode)
+{
+	struct timespec now;
+	int sync_it = prepare_update_cmtime(inode, &now);
+	int ret;
+
+	if (!sync_it)
+		return 0;
+
+	/* sb_start_pagefault and update_time can both sleep. */
+	sb_start_pagefault(inode->i_sb);
+	ret = update_time(inode, &now, sync_it);
+	sb_end_pagefault(inode->i_sb);
+
+	return ret;
+}
+EXPORT_SYMBOL(inode_update_time_writable);
+
 int inode_needs_sync(struct inode *inode)
 {
 	if (IS_SYNC(inode))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9818747..86cf0a4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2590,6 +2590,7 @@  extern int inode_newsize_ok(const struct inode *, loff_t offset);
 extern void setattr_copy(struct inode *inode, const struct iattr *attr);
 
 extern int file_update_time(struct file *file);
+extern int inode_update_time_writable(struct inode *inode);
 
 extern int generic_show_options(struct seq_file *m, struct dentry *root);
 extern void save_mount_options(struct super_block *sb, char *options);