Message ID | a27accc2d9460b7ef194a203f305a18dafe926e8.1376679411.git.luto@amacapital.net |
---|---|
State | Superseded, archived |
Headers | show |
On Fri, Aug 16, 2013 at 04:22:09PM -0700, Andy Lutomirski wrote: > This is like file_update_time, except that it acts on a struct inode * > instead of a struct file *. > > Signed-off-by: Andy Lutomirski <luto@amacapital.net> > --- > fs/inode.c | 72 ++++++++++++++++++++++++++++++++++++++++++------------ > include/linux/fs.h | 1 + > 2 files changed, 58 insertions(+), 15 deletions(-) > > diff --git a/fs/inode.c b/fs/inode.c > index d6dfb09..bc90c12 100644 > --- a/fs/inode.c > +++ b/fs/inode.c > @@ -1637,6 +1637,34 @@ int file_remove_suid(struct file *file) > } > EXPORT_SYMBOL(file_remove_suid); > > +/* > + * This does the work that's common to file_update_time and > + * inode_update_time. > + */ > +static int prepare_update_cmtime(struct inode *inode, struct timespec *now) > +{ > + int sync_it; > + > + /* First try to exhaust all avenues to not sync */ > + if (IS_NOCMTIME(inode)) > + return 0; > + > + *now = current_fs_time(inode->i_sb); > + if (!timespec_equal(&inode->i_mtime, now)) > + sync_it = S_MTIME; > + > + if (!timespec_equal(&inode->i_ctime, now)) > + sync_it |= S_CTIME; > + > + if (IS_I_VERSION(inode)) > + sync_it |= S_VERSION; > + > + if (!sync_it) > + return 0; > + > + return sync_it; > +} > + > /** > * file_update_time - update mtime and ctime time > * @file: file accessed > @@ -1654,23 +1682,9 @@ int file_update_time(struct file *file) > { > struct inode *inode = file_inode(file); > struct timespec now; > - int sync_it = 0; > + int sync_it = prepare_update_cmtime(inode, &now); > int ret; > > - /* First try to exhaust all avenues to not sync */ > - if (IS_NOCMTIME(inode)) > - return 0; > - > - now = current_fs_time(inode->i_sb); > - if (!timespec_equal(&inode->i_mtime, &now)) > - sync_it = S_MTIME; > - > - if (!timespec_equal(&inode->i_ctime, &now)) > - sync_it |= S_CTIME; > - > - if (IS_I_VERSION(inode)) > - sync_it |= S_VERSION; > - > if (!sync_it) > return 0; > > @@ -1685,6 +1699,34 @@ int file_update_time(struct file *file) > } > EXPORT_SYMBOL(file_update_time); > > +/** > + * inode_update_time_writable - update mtime and ctime time > + * @inode: inode accessed > + * > + * This is like file_update_time, but it assumes the mnt is writable > + * and takes an inode parameter instead. (We need to assume the mnt > + * was writable because inodes aren't associated with any particular > + * mnt. > + */ > + > +int inode_update_time_writable(struct inode *inode) > +{ > + struct timespec now; > + int sync_it = prepare_update_cmtime(inode, &now); > + int ret; > + > + if (!sync_it) > + return 0; > + > + /* sb_start_pagefault and update_time can both sleep. */ > + sb_start_pagefault(inode->i_sb); > + ret = update_time(inode, &now, sync_it); > + sb_end_pagefault(inode->i_sb); This gets called from the writeback path - you can't use sb_start_pagefault/sb_end_pagefault in that path. Cheers, Dave.
On Mon, Aug 19, 2013 at 7:28 PM, Dave Chinner <david@fromorbit.com> wrote: > On Fri, Aug 16, 2013 at 04:22:09PM -0700, Andy Lutomirski wrote: >> This is like file_update_time, except that it acts on a struct inode * >> instead of a struct file *. >> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net> >> --- >> fs/inode.c | 72 ++++++++++++++++++++++++++++++++++++++++++------------ >> include/linux/fs.h | 1 + >> 2 files changed, 58 insertions(+), 15 deletions(-) >> [...] >> + >> +int inode_update_time_writable(struct inode *inode) >> +{ >> + struct timespec now; >> + int sync_it = prepare_update_cmtime(inode, &now); >> + int ret; >> + >> + if (!sync_it) >> + return 0; >> + >> + /* sb_start_pagefault and update_time can both sleep. */ >> + sb_start_pagefault(inode->i_sb); >> + ret = update_time(inode, &now, sync_it); >> + sb_end_pagefault(inode->i_sb); > > This gets called from the writeback path - you can't use > sb_start_pagefault/sb_end_pagefault in that path. The race I'm worried about is: - mmap - write to the mapping - remount ro - flush_cmtime -> inode_update_time_writable This may be impossible, in which case I'm okay, but it's nice to have a sanity check. I'll see if I can figure out how to do that. --Andy -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Aug 19, 2013 at 08:20:12PM -0700, Andy Lutomirski wrote: > On Mon, Aug 19, 2013 at 7:28 PM, Dave Chinner <david@fromorbit.com> wrote: > > On Fri, Aug 16, 2013 at 04:22:09PM -0700, Andy Lutomirski wrote: > >> This is like file_update_time, except that it acts on a struct inode * > >> instead of a struct file *. > >> > >> Signed-off-by: Andy Lutomirski <luto@amacapital.net> > >> --- > >> fs/inode.c | 72 ++++++++++++++++++++++++++++++++++++++++++------------ > >> include/linux/fs.h | 1 + > >> 2 files changed, 58 insertions(+), 15 deletions(-) > >> > > [...] > > >> + > >> +int inode_update_time_writable(struct inode *inode) > >> +{ > >> + struct timespec now; > >> + int sync_it = prepare_update_cmtime(inode, &now); > >> + int ret; > >> + > >> + if (!sync_it) > >> + return 0; > >> + > >> + /* sb_start_pagefault and update_time can both sleep. */ > >> + sb_start_pagefault(inode->i_sb); > >> + ret = update_time(inode, &now, sync_it); > >> + sb_end_pagefault(inode->i_sb); > > > > This gets called from the writeback path - you can't use > > sb_start_pagefault/sb_end_pagefault in that path. > > The race I'm worried about is: > > - mmap > - write to the mapping > - remount ro > - flush_cmtime -> inode_update_time_writable sb_start_pagefault() is for filesystem freeze protection, not remount-ro protection. If you freeze the filesystem, then we stop writes and pagefaults by making sb_start_pagefault/sb_start_write block, and then run writeback to clean all the pages. If writeback then blocks on sb_start_pagefault(), we've got a deadlock. > This may be impossible, in which case I'm okay, but it's nice to have > a sanity check. I'll see if I can figure out how to do that. The process of remount-ro should flush the dirty pages - the inode and page has been marked dirty by page_mkwrite(), after all. Cheers, Dave.
On Mon, Aug 19, 2013 at 8:33 PM, Dave Chinner <david@fromorbit.com> wrote: > On Mon, Aug 19, 2013 at 08:20:12PM -0700, Andy Lutomirski wrote: >> On Mon, Aug 19, 2013 at 7:28 PM, Dave Chinner <david@fromorbit.com> wrote: >> > On Fri, Aug 16, 2013 at 04:22:09PM -0700, Andy Lutomirski wrote: >> >> This is like file_update_time, except that it acts on a struct inode * >> >> instead of a struct file *. >> >> >> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net> >> >> --- >> >> fs/inode.c | 72 ++++++++++++++++++++++++++++++++++++++++++------------ >> >> include/linux/fs.h | 1 + >> >> 2 files changed, 58 insertions(+), 15 deletions(-) >> >> >> >> [...] >> >> >> + >> >> +int inode_update_time_writable(struct inode *inode) >> >> +{ >> >> + struct timespec now; >> >> + int sync_it = prepare_update_cmtime(inode, &now); >> >> + int ret; >> >> + >> >> + if (!sync_it) >> >> + return 0; >> >> + >> >> + /* sb_start_pagefault and update_time can both sleep. */ >> >> + sb_start_pagefault(inode->i_sb); >> >> + ret = update_time(inode, &now, sync_it); >> >> + sb_end_pagefault(inode->i_sb); >> > >> > This gets called from the writeback path - you can't use >> > sb_start_pagefault/sb_end_pagefault in that path. >> >> The race I'm worried about is: >> >> - mmap >> - write to the mapping >> - remount ro >> - flush_cmtime -> inode_update_time_writable > > sb_start_pagefault() is for filesystem freeze protection, not > remount-ro protection. If you freeze the filesystem, then we stop > writes and pagefaults by making sb_start_pagefault/sb_start_write > block, and then run writeback to clean all the pages. If writeback > then blocks on sb_start_pagefault(), we've got a deadlock. > >> This may be impossible, in which case I'm okay, but it's nice to have >> a sanity check. I'll see if I can figure out how to do that. > > The process of remount-ro should flush the dirty pages - the inode > and page has been marked dirty by page_mkwrite(), after all. Hmm. We can land in here from writeback, in which case the time should be updated unconditionally. We can also land in here from msync(MS_ASYNC) or munmap. munmap at least shouldn't block. The nasty case is if a page is dirtied, then the frozen level is set to SB_FREEZE_PAGEFAULT, and then userspace calls munmap or msync *before* writepages gets called. In this case, blocking until the fs is unfrozen is probably impolite, and returning without updating the time is questionable. Removing the check entirely may add a new race, though: what if .flush_cmtime has called mapping_test_clear_cmtime but hasn't gotten to updating the time yet when freezing finishes? This could be prevented by changing generic_flush_cmtime to do __sb_start_write(sb, SB_FREEZE_FS, false) and doing nothing if the fs is already frozen. --Andy -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon 19-08-13 21:07:49, Andy Lutomirski wrote: > On Mon, Aug 19, 2013 at 8:33 PM, Dave Chinner <david@fromorbit.com> wrote: > > On Mon, Aug 19, 2013 at 08:20:12PM -0700, Andy Lutomirski wrote: > >> On Mon, Aug 19, 2013 at 7:28 PM, Dave Chinner <david@fromorbit.com> wrote: > >> > On Fri, Aug 16, 2013 at 04:22:09PM -0700, Andy Lutomirski wrote: > >> >> This is like file_update_time, except that it acts on a struct inode * > >> >> instead of a struct file *. > >> >> > >> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net> > >> >> --- > >> >> fs/inode.c | 72 ++++++++++++++++++++++++++++++++++++++++++------------ > >> >> include/linux/fs.h | 1 + > >> >> 2 files changed, 58 insertions(+), 15 deletions(-) > >> >> > >> > >> [...] > >> > >> >> + > >> >> +int inode_update_time_writable(struct inode *inode) > >> >> +{ > >> >> + struct timespec now; > >> >> + int sync_it = prepare_update_cmtime(inode, &now); > >> >> + int ret; > >> >> + > >> >> + if (!sync_it) > >> >> + return 0; > >> >> + > >> >> + /* sb_start_pagefault and update_time can both sleep. */ > >> >> + sb_start_pagefault(inode->i_sb); > >> >> + ret = update_time(inode, &now, sync_it); > >> >> + sb_end_pagefault(inode->i_sb); > >> > > >> > This gets called from the writeback path - you can't use > >> > sb_start_pagefault/sb_end_pagefault in that path. > >> > >> The race I'm worried about is: > >> > >> - mmap > >> - write to the mapping > >> - remount ro > >> - flush_cmtime -> inode_update_time_writable > > > > sb_start_pagefault() is for filesystem freeze protection, not > > remount-ro protection. If you freeze the filesystem, then we stop > > writes and pagefaults by making sb_start_pagefault/sb_start_write > > block, and then run writeback to clean all the pages. If writeback > > then blocks on sb_start_pagefault(), we've got a deadlock. > > > >> This may be impossible, in which case I'm okay, but it's nice to have > >> a sanity check. I'll see if I can figure out how to do that. > > > > The process of remount-ro should flush the dirty pages - the inode > > and page has been marked dirty by page_mkwrite(), after all. > > Hmm. We can land in here from writeback, in which case the time > should be updated unconditionally. We can also land in here from > msync(MS_ASYNC) or munmap. munmap at least shouldn't block. > > The nasty case is if a page is dirtied, then the frozen level is set > to SB_FREEZE_PAGEFAULT, and then userspace calls munmap or msync > *before* writepages gets called. In this case, blocking until the fs > is unfrozen is probably impolite, and returning without updating the > time is questionable. > > Removing the check entirely may add a new race, though: what if > .flush_cmtime has called mapping_test_clear_cmtime but hasn't gotten > to updating the time yet when freezing finishes? This could be > prevented by changing generic_flush_cmtime to do __sb_start_write(sb, > SB_FREEZE_FS, false) and doing nothing if the fs is already frozen. I think it should be a responsibility of the caller of .flush_cmtime (as is the case of update_time()) to handle freeze protection if needed. As Dave told you, writeback path mustn't actually take it. OTOH things like munmap() or msync() need to get it because we must avoid changing the filesystem when it is frozen and these calls can happen when fs is frozen. Honza
diff --git a/fs/inode.c b/fs/inode.c index d6dfb09..bc90c12 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1637,6 +1637,34 @@ int file_remove_suid(struct file *file) } EXPORT_SYMBOL(file_remove_suid); +/* + * This does the work that's common to file_update_time and + * inode_update_time. + */ +static int prepare_update_cmtime(struct inode *inode, struct timespec *now) +{ + int sync_it; + + /* First try to exhaust all avenues to not sync */ + if (IS_NOCMTIME(inode)) + return 0; + + *now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, now)) + sync_it = S_MTIME; + + if (!timespec_equal(&inode->i_ctime, now)) + sync_it |= S_CTIME; + + if (IS_I_VERSION(inode)) + sync_it |= S_VERSION; + + if (!sync_it) + return 0; + + return sync_it; +} + /** * file_update_time - update mtime and ctime time * @file: file accessed @@ -1654,23 +1682,9 @@ int file_update_time(struct file *file) { struct inode *inode = file_inode(file); struct timespec now; - int sync_it = 0; + int sync_it = prepare_update_cmtime(inode, &now); int ret; - /* First try to exhaust all avenues to not sync */ - if (IS_NOCMTIME(inode)) - return 0; - - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_mtime, &now)) - sync_it = S_MTIME; - - if (!timespec_equal(&inode->i_ctime, &now)) - sync_it |= S_CTIME; - - if (IS_I_VERSION(inode)) - sync_it |= S_VERSION; - if (!sync_it) return 0; @@ -1685,6 +1699,34 @@ int file_update_time(struct file *file) } EXPORT_SYMBOL(file_update_time); +/** + * inode_update_time_writable - update mtime and ctime time + * @inode: inode accessed + * + * This is like file_update_time, but it assumes the mnt is writable + * and takes an inode parameter instead. (We need to assume the mnt + * was writable because inodes aren't associated with any particular + * mnt. + */ + +int inode_update_time_writable(struct inode *inode) +{ + struct timespec now; + int sync_it = prepare_update_cmtime(inode, &now); + int ret; + + if (!sync_it) + return 0; + + /* sb_start_pagefault and update_time can both sleep. */ + sb_start_pagefault(inode->i_sb); + ret = update_time(inode, &now, sync_it); + sb_end_pagefault(inode->i_sb); + + return ret; +} +EXPORT_SYMBOL(inode_update_time_writable); + int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9818747..86cf0a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2590,6 +2590,7 @@ extern int inode_newsize_ok(const struct inode *, loff_t offset); extern void setattr_copy(struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); +extern int inode_update_time_writable(struct inode *inode); extern int generic_show_options(struct seq_file *m, struct dentry *root); extern void save_mount_options(struct super_block *sb, char *options);
This is like file_update_time, except that it acts on a struct inode * instead of a struct file *. Signed-off-by: Andy Lutomirski <luto@amacapital.net> --- fs/inode.c | 72 ++++++++++++++++++++++++++++++++++++++++++------------ include/linux/fs.h | 1 + 2 files changed, 58 insertions(+), 15 deletions(-)