diff mbox

[3/4] vfs: don't let the dirty time inodes get more than a day stale

Message ID 1416599964-21892-4-git-send-email-tytso@mit.edu
State Superseded, archived
Headers show

Commit Message

Theodore Ts'o Nov. 21, 2014, 7:59 p.m. UTC
Guarantee that the on-disk timestamps will be no more than 24 hours
stale.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/fs-writeback.c  | 1 +
 fs/inode.c         | 7 ++++++-
 include/linux/fs.h | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

Comments

Andreas Dilger Nov. 21, 2014, 8:19 p.m. UTC | #1
On Nov 21, 2014, at 1:59 PM, Theodore Ts'o <tytso@mit.edu> wrote:
> Guarantee that the on-disk timestamps will be no more than 24 hours
> stale.
> 
> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> ---
> fs/fs-writeback.c  | 1 +
> fs/inode.c         | 7 ++++++-
> include/linux/fs.h | 1 +
> 3 files changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index ce7de22..eb04277 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -1141,6 +1141,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
> 	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
> 		trace_writeback_dirty_inode_start(inode, flags);
> 
> +		inode->i_ts_dirty_day = 0;
> 		if (sb->s_op->dirty_inode)
> 			sb->s_op->dirty_inode(inode, flags);
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index 6e91aca..f0d6232 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -1511,6 +1511,7 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
>  */
> static int update_time(struct inode *inode, struct timespec *time, int flags)
> {
> +	int days_since_boot = jiffies / (HZ * 86400);
> 	int ret;
> 
> 	if (inode->i_op->update_time) {
> @@ -1527,12 +1528,16 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
> 		if (flags & S_MTIME)
> 			inode->i_mtime = *time;
> 	}
> -	if (inode->i_sb->s_flags & MS_LAZYTIME) {
> +	if ((inode->i_sb->s_flags & MS_LAZYTIME) &&
> +	    (!inode->i_ts_dirty_day ||
> +	     inode->i_ts_dirty_day == days_since_boot)) {
> 		spin_lock(&inode->i_lock);
> 		inode->i_state |= I_DIRTY_TIME;
> 		spin_unlock(&inode->i_lock);
> +		inode->i_ts_dirty_day = days_since_boot;

It isn't clear if this is correct.  It looks like the condition will
only be entered if i_ts_dirty_day == days_since_boot, but that is only
set inside the condition?  Shouldn't this be:

	inode->i_ts_dirty_day = ~0U;

	if ((inode->i_sb->s_flags & MS_LAZYTIME) &&
	    inode->i_ts_dirty_day != days_since_boot)) {
		spin_lock(&inode->i_lock);
		inode->i_state |= I_DIRTY_TIME;
		spin_unlock(&inode->i_lock);
		inode->i_ts_dirty_day = days_since_boot;
	}

and "days_since_boot" should be declared unsigned short so it wraps
in the same way as i_ts_dirty_day, maybe using typeof(i_ts_dirty_day)
so that there isn't any need to update this in the future if the type
changes.  Conceivably this could be an unsigned char, if that packed
into struct inode better, at the risk of losing a timestamp update on
an inode in cache for 256+ days and only modifying it 256 days later.

Cheers, Andreas

> 		return 0;
> 	}
> +	inode->i_ts_dirty_day = 0;
> 	if (inode->i_op->write_time)
> 		return inode->i_op->write_time(inode);
> 	mark_inode_dirty_sync(inode);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 489b2f2..e3574cd 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -575,6 +575,7 @@ struct inode {
> 	struct timespec		i_ctime;
> 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
> 	unsigned short          i_bytes;
> +	unsigned short		i_ts_dirty_day;
> 	unsigned int		i_blkbits;
> 	blkcnt_t		i_blocks;
> 
> -- 
> 2.1.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Cheers, Andreas
Theodore Ts'o Nov. 21, 2014, 9:36 p.m. UTC | #2
On Fri, Nov 21, 2014 at 02:19:07PM -0600, Andreas Dilger wrote:
> > -	if (inode->i_sb->s_flags & MS_LAZYTIME) {
> > +	if ((inode->i_sb->s_flags & MS_LAZYTIME) &&
> > +	    (!inode->i_ts_dirty_day ||
> > +	     inode->i_ts_dirty_day == days_since_boot)) {
> > 		spin_lock(&inode->i_lock);
> > 		inode->i_state |= I_DIRTY_TIME;
> > 		spin_unlock(&inode->i_lock);
> > +		inode->i_ts_dirty_day = days_since_boot;
> 
> It isn't clear if this is correct.  It looks like the condition will
> only be entered if i_ts_dirty_day == days_since_boot, but that is only
> set inside the condition?

If i_ts_dirty_day is zero, the timestamps don't have to written to
disk.  This is either because the inode has been written to disk, or
the system has been up for less than a day, such that when we last a
lazy mtime update (i.e., we skipped the call to mark_inode_dirty)
since jiffies / (HZ * 86400) was zero.

If it is non-zero, then the timestamps were updated but were not sent
to disk N days since the system was booted.  So long as it remains N
days since the system was booted, we can skip calling
mark_inode_dirty().  But once it becomes N+1 days since the system was
booted, then we will call mark_inode_dirty() and set i_ts_dirty_day to
zero.

I'll add a comment so it's a bit more obvious what we're doing here,
but I'm pretty sure we currently have is in fact correct.

> and "days_since_boot" should be declared unsigned short so it wraps
> in the same way as i_ts_dirty_day

Good point, thanks.  This will only be an issue after the system has
been up for almost 90 years, but we might as well get it right,

     	    	      	     	    	  - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger Nov. 21, 2014, 11:09 p.m. UTC | #3
On Nov 21, 2014, at 3:36 PM, Theodore Ts'o <tytso@mit.edu> wrote:
>> and "days_since_boot" should be declared unsigned short so it wraps
>> in the same way as i_ts_dirty_day
> 
> Good point, thanks.  This will only be an issue after the system has
> been up for almost 90 years, but we might as well get it right,

Speaking of stuff in the future, it looks like the patch to fix the
ext4 dates-beyond-epoch handling hasn't landed yet?

The last email I found related to that is in "[PATCH] fs: ext4: Sign-extend tv_sec after ORing in epoch bits", but the patch that I actually thought was ready to land was "[PATCH v8 1/2] ext4: Fix handling of extended tv_sec (bug 23732)" and the matching e2fsprogs patch:

http://lkml.org/lkml/2014/2/13/667
http://lkml.org/lkml/2014/2/13/670

Would be nice to get them landed, since we have less than 20 years
left, and at some point there will be embedded systems or VMs using
ext4 that may live long enough to hit the bug...

Cheers, Andreas
Dave Chinner Nov. 25, 2014, 1:53 a.m. UTC | #4
On Fri, Nov 21, 2014 at 02:59:23PM -0500, Theodore Ts'o wrote:
> Guarantee that the on-disk timestamps will be no more than 24 hours
> stale.
> 
> Signed-off-by: Theodore Ts'o <tytso@mit.edu>

If we put these inodes on the dirty inode list with at writeback
time of 24 hours, this is completely unnecessary.

Cheers,

Dave.
Theodore Ts'o Nov. 25, 2014, 4:45 a.m. UTC | #5
On Tue, Nov 25, 2014 at 12:53:32PM +1100, Dave Chinner wrote:
> On Fri, Nov 21, 2014 at 02:59:23PM -0500, Theodore Ts'o wrote:
> > Guarantee that the on-disk timestamps will be no more than 24 hours
> > stale.
> > 
> > Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> 
> If we put these inodes on the dirty inode list with at writeback
> time of 24 hours, this is completely unnecessary.

What do you mean by "a writeback time of 24 hours"?  Do you mean
creating a new field in the inode which specifies when the writeback
should happen?  I still worry about the dirty inode list getting
somewhat long large in the strictatime && lazytime case, and the inode
bloat nazi's coming after us for adding a new field to struct inode
structure.

Or do you mean trying to abuse the dirtied_when field in some way?

      	       	      	       	   - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Nov. 25, 2014, 5:31 p.m. UTC | #6
On Fri 21-11-14 14:59:23, Ted Tso wrote:
> Guarantee that the on-disk timestamps will be no more than 24 hours
> stale.
  Hum, how about reusing i_dirtied_when for this. Using that field even
makes a good sence to me...

								Honza

> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> ---
>  fs/fs-writeback.c  | 1 +
>  fs/inode.c         | 7 ++++++-
>  include/linux/fs.h | 1 +
>  3 files changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index ce7de22..eb04277 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -1141,6 +1141,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
>  		trace_writeback_dirty_inode_start(inode, flags);
>  
> +		inode->i_ts_dirty_day = 0;
>  		if (sb->s_op->dirty_inode)
>  			sb->s_op->dirty_inode(inode, flags);
>  
> diff --git a/fs/inode.c b/fs/inode.c
> index 6e91aca..f0d6232 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -1511,6 +1511,7 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
>   */
>  static int update_time(struct inode *inode, struct timespec *time, int flags)
>  {
> +	int days_since_boot = jiffies / (HZ * 86400);
>  	int ret;
>  
>  	if (inode->i_op->update_time) {
> @@ -1527,12 +1528,16 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
>  		if (flags & S_MTIME)
>  			inode->i_mtime = *time;
>  	}
> -	if (inode->i_sb->s_flags & MS_LAZYTIME) {
> +	if ((inode->i_sb->s_flags & MS_LAZYTIME) &&
> +	    (!inode->i_ts_dirty_day ||
> +	     inode->i_ts_dirty_day == days_since_boot)) {
>  		spin_lock(&inode->i_lock);
>  		inode->i_state |= I_DIRTY_TIME;
>  		spin_unlock(&inode->i_lock);
> +		inode->i_ts_dirty_day = days_since_boot;
>  		return 0;
>  	}
> +	inode->i_ts_dirty_day = 0;
>  	if (inode->i_op->write_time)
>  		return inode->i_op->write_time(inode);
>  	mark_inode_dirty_sync(inode);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 489b2f2..e3574cd 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -575,6 +575,7 @@ struct inode {
>  	struct timespec		i_ctime;
>  	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
>  	unsigned short          i_bytes;
> +	unsigned short		i_ts_dirty_day;
>  	unsigned int		i_blkbits;
>  	blkcnt_t		i_blocks;
>  
> -- 
> 2.1.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Nov. 25, 2014, 11:48 p.m. UTC | #7
On Mon, Nov 24, 2014 at 11:45:08PM -0500, Theodore Ts'o wrote:
> On Tue, Nov 25, 2014 at 12:53:32PM +1100, Dave Chinner wrote:
> > On Fri, Nov 21, 2014 at 02:59:23PM -0500, Theodore Ts'o wrote:
> > > Guarantee that the on-disk timestamps will be no more than 24 hours
> > > stale.
> > > 
> > > Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> > 
> > If we put these inodes on the dirty inode list with at writeback
> > time of 24 hours, this is completely unnecessary.
> 
> What do you mean by "a writeback time of 24 hours"?  Do you mean
> creating a new field in the inode which specifies when the writeback
> should happen? 

No.

> I still worry about the dirty inode list getting
> somewhat long large in the strictatime && lazytime case, and the inode
> bloat nazi's coming after us for adding a new field to struct inode
> structure.

Use another pure inode time dirty list, and move the inode to the
existing dirty list when it gets marked I_DIRTY.

> Or do you mean trying to abuse the dirtied_when field in some way?

No abuse necessary at all. Just a different inode_dirtied_after()
check is requires if the inode is on the time dirty list in
move_expired_inodes().

Cheers,

Dave.
Theodore Ts'o Nov. 26, 2014, 10:20 a.m. UTC | #8
On Wed, Nov 26, 2014 at 10:48:51AM +1100, Dave Chinner wrote:
> No abuse necessary at all. Just a different inode_dirtied_after()
> check is requires if the inode is on the time dirty list in
> move_expired_inodes().

I'm still not sure what you have in mind here.  When would this be
checked?  It sounds like you want to set a timeout such that when an
inode which had its timestamps updated lazily 24 hours earlier, the
inode would get written out.  Yes?  But that implies something is
going to have to scan the list of inodes on the dirty time list
periodically.  When are you proposing that this take place?

The various approaches that come to mind all seem more complex than
what I have in this patch 3 of 4, and I'm not sure it's worth the
complexity.

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Nov. 26, 2014, 10:39 p.m. UTC | #9
On Wed, Nov 26, 2014 at 05:20:17AM -0500, Theodore Ts'o wrote:
> On Wed, Nov 26, 2014 at 10:48:51AM +1100, Dave Chinner wrote:
> > No abuse necessary at all. Just a different inode_dirtied_after()
> > check is requires if the inode is on the time dirty list in
> > move_expired_inodes().
> 
> I'm still not sure what you have in mind here.  When would this be
> checked? 

Have you looked at where move_expired_inodes() gets called from?
It's called periodically from background writeback by queue_io(),
and sync uses the same infrastructure to expire all inodes on the
dirty list....

> It sounds like you want to set a timeout such that when an
> inode which had its timestamps updated lazily 24 hours earlier, the
> inode would get written out.  Yes?  But that implies something is
> going to have to scan the list of inodes on the dirty time list
> periodically.  When are you proposing that this take place?

The writeback code already does this for dirty inodes. it does it in
move_expired_inodes() to move the inodes with i_dirtied_when is
older than 30s. It's *trivial* to add a time dirty inode list and
scan that at the same time to pull off inodes that are older than
24hrs.

> The various approaches that come to mind all seem more complex than
> what I have in this patch 3 of 4, and I'm not sure it's worth the
> complexity.

the "once a day" stuff you've added is a horrible, nasty hack. I
wasn't going to say anything about it (i.e. if you can't say
anything nice...). The existing dirty inode writeback expiry code
does *everything* we need already, we just need to plumb in a new
list and add an expiry check of that list to move inodes to the b_io
list when they have been timestamp dirty for more than 24 hours...

Cheers,

Dave.
diff mbox

Patch

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ce7de22..eb04277 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1141,6 +1141,7 @@  void __mark_inode_dirty(struct inode *inode, int flags)
 	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 		trace_writeback_dirty_inode_start(inode, flags);
 
+		inode->i_ts_dirty_day = 0;
 		if (sb->s_op->dirty_inode)
 			sb->s_op->dirty_inode(inode, flags);
 
diff --git a/fs/inode.c b/fs/inode.c
index 6e91aca..f0d6232 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1511,6 +1511,7 @@  static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
  */
 static int update_time(struct inode *inode, struct timespec *time, int flags)
 {
+	int days_since_boot = jiffies / (HZ * 86400);
 	int ret;
 
 	if (inode->i_op->update_time) {
@@ -1527,12 +1528,16 @@  static int update_time(struct inode *inode, struct timespec *time, int flags)
 		if (flags & S_MTIME)
 			inode->i_mtime = *time;
 	}
-	if (inode->i_sb->s_flags & MS_LAZYTIME) {
+	if ((inode->i_sb->s_flags & MS_LAZYTIME) &&
+	    (!inode->i_ts_dirty_day ||
+	     inode->i_ts_dirty_day == days_since_boot)) {
 		spin_lock(&inode->i_lock);
 		inode->i_state |= I_DIRTY_TIME;
 		spin_unlock(&inode->i_lock);
+		inode->i_ts_dirty_day = days_since_boot;
 		return 0;
 	}
+	inode->i_ts_dirty_day = 0;
 	if (inode->i_op->write_time)
 		return inode->i_op->write_time(inode);
 	mark_inode_dirty_sync(inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 489b2f2..e3574cd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -575,6 +575,7 @@  struct inode {
 	struct timespec		i_ctime;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	unsigned short          i_bytes;
+	unsigned short		i_ts_dirty_day;
 	unsigned int		i_blkbits;
 	blkcnt_t		i_blocks;