Patchwork [-v2] ext4: avoid reusing recently deleted inodes in no journal mode

login
register
mail settings
Submitter Theodore Ts'o
Date July 26, 2013, 7:32 p.m.
Message ID <1374867164-21942-1-git-send-email-tytso@mit.edu>
Download mbox | patch
Permalink /patch/262263/
State Superseded
Headers show

Comments

Theodore Ts'o - July 26, 2013, 7:32 p.m.
In no journal mode, if an inode has recently been deleted, we
shouldn't reuse it right away.  Otherwise it's possible, after an
unclean shutdown, to hit a situation where a recently deleted inode
gets reused for some other purpose before the inode table block has
been written to disk.  However, if the directory entry has been
updated, then the directory entry will be pointing at the old inode
contents.

E2fsck will make sure the file system is consistent after the
unclean shutdown.  However, if the recently deleted inode is a
character mode device, or an inode with the immutable bit set, even
after the file system has been fixed up by e2fsck, it can be
possible for a *.pyc file to be pointing at a character mode
device, and when python tries to open the *.pyc file, Hilarity
Ensues.  We could change all of userspace to be very suspicious
about stat'ing files before opening them, and clearing the
immutable flag if necessary --- or we can just avoid reusing an
inode number if it has been recently deleted.

Google-Bug-Id: 10017573

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
Jan Kara - July 29, 2013, 1:32 p.m.
On Fri 26-07-13 15:32:43, Ted Tso wrote:
> In no journal mode, if an inode has recently been deleted, we
> shouldn't reuse it right away.  Otherwise it's possible, after an
> unclean shutdown, to hit a situation where a recently deleted inode
> gets reused for some other purpose before the inode table block has
> been written to disk.  However, if the directory entry has been
> updated, then the directory entry will be pointing at the old inode
> contents.
> 
> E2fsck will make sure the file system is consistent after the
> unclean shutdown.  However, if the recently deleted inode is a
> character mode device, or an inode with the immutable bit set, even
> after the file system has been fixed up by e2fsck, it can be
> possible for a *.pyc file to be pointing at a character mode
> device, and when python tries to open the *.pyc file, Hilarity
> Ensues.  We could change all of userspace to be very suspicious
> about stat'ing files before opening them, and clearing the
> immutable flag if necessary --- or we can just avoid reusing an
> inode number if it has been recently deleted.
  Hum, I don't like this very much since it's just a heuristic which is
going to work in 99% of cases but not all. But likely it's better than
nothing...

> Google-Bug-Id: 10017573
> 
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
>  fs/ext4/ialloc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 51 insertions(+)
> 
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 5b8e22e..7d5ac66 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -625,6 +625,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
>  }
>  
>  /*
> + * In no journal mode, if an inode has recently been deleted, we want
> + * to avoid reusing it until we're reasonably sure the inode table
> + * block has been written back to disk.
> + */
> +int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
> +{
> +	struct ext4_group_desc	*gdp;
> +	struct ext4_inode	*raw_inode;
> +	struct buffer_head	*bh;
> +	unsigned long		dtime, now;
> +	int	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> +	int	offset, ret = 0, recentcy = 30;
  I'd use dirty_expire_interval here so that we are at least tied to
flusher thread timeout...

> +
> +	gdp = ext4_get_group_desc(sb, group, NULL);
> +	if (unlikely(!gdp))
> +		return 0;
> +
> +	bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
> +		       (ino / inodes_per_block));
> +	if (unlikely(!bh) || !buffer_uptodate(bh))
> +		/*
> +		 * If the block is not in the buffer head, then it
                                                     ^^^^ cache

> +		 * must have been written out.
> +		 */
> +		goto out;
> +
> +	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
> +	raw_inode = (struct ext4_inode *) (bh->b_data + offset);
> +	dtime = le32_to_cpu(raw_inode->i_dtime);
> +	now = get_seconds();
> +	if (!buffer_dirty(bh))
> +		/*
> +		 * Five seconds should be enough time for a block to be
> +		 * committed to the platter once it is sent to the HDD
> +		 */
> +		recentcy = 5;
  This is completely ad-hoc and I cannot say anything about what value
would be appropriate here. Jens told me disk can hold on sectors for
*minutes* in their writeback caches when these blocks are unsuitably placed
and there's enough streaming IO going on. So the question is what value do
we want to base this on?

> +
> +	if (dtime && (dtime < now) && (now < dtime + recentcy))
> +		ret = 1;
> +out:
> +	brelse(bh);
> +	return ret;
> +}
> +
> +/*
>   * There are two policies for allocating an inode.  If the new inode is
>   * a directory, then a forward search is made for a block group with both
>   * free space and a low directory-to-inode ratio; if that fails, then of
> @@ -741,6 +786,11 @@ repeat_in_this_group:
>  				   "inode=%lu", ino + 1);
>  			continue;
>  		}
> +		if ((EXT4_SB(sb)->s_journal == NULL) &&
> +		    recently_deleted(sb, group, ino)) {
> +			ino++;
> +			goto next_inode;
> +		}
>  		if (!handle) {
>  			BUG_ON(nblocks <= 0);
>  			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
> @@ -764,6 +814,7 @@ repeat_in_this_group:
>  		ino++;		/* the inode bitmap is zero-based */
>  		if (!ret2)
>  			goto got; /* we grabbed the inode! */
> +	next_ino:
>  		if (ino < EXT4_INODES_PER_GROUP(sb))
>  			goto repeat_in_this_group;
>  	next_group:

								Honza
Zheng Liu - July 30, 2013, 2:33 a.m.
On Fri, Jul 26, 2013 at 03:32:43PM -0400, Theodore Ts'o wrote:
> In no journal mode, if an inode has recently been deleted, we
> shouldn't reuse it right away.  Otherwise it's possible, after an
> unclean shutdown, to hit a situation where a recently deleted inode
> gets reused for some other purpose before the inode table block has
> been written to disk.  However, if the directory entry has been
> updated, then the directory entry will be pointing at the old inode
> contents.
> 
> E2fsck will make sure the file system is consistent after the
> unclean shutdown.  However, if the recently deleted inode is a
> character mode device, or an inode with the immutable bit set, even
> after the file system has been fixed up by e2fsck, it can be
> possible for a *.pyc file to be pointing at a character mode
> device, and when python tries to open the *.pyc file, Hilarity
> Ensues.  We could change all of userspace to be very suspicious
> about stat'ing files before opening them, and clearing the
> immutable flag if necessary --- or we can just avoid reusing an
> inode number if it has been recently deleted.
> 
> Google-Bug-Id: 10017573
> 
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
>  fs/ext4/ialloc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 51 insertions(+)
> 
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 5b8e22e..7d5ac66 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -625,6 +625,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
>  }
>  
>  /*
> + * In no journal mode, if an inode has recently been deleted, we want
> + * to avoid reusing it until we're reasonably sure the inode table
> + * block has been written back to disk.
> + */
> +int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
> +{
> +	struct ext4_group_desc	*gdp;
> +	struct ext4_inode	*raw_inode;
> +	struct buffer_head	*bh;
> +	unsigned long		dtime, now;
> +	int	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> +	int	offset, ret = 0, recentcy = 30;
> +
> +	gdp = ext4_get_group_desc(sb, group, NULL);
> +	if (unlikely(!gdp))
> +		return 0;
> +
> +	bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
> +		       (ino / inodes_per_block));
> +	if (unlikely(!bh) || !buffer_uptodate(bh))
> +		/*
> +		 * If the block is not in the buffer head, then it
> +		 * must have been written out.
> +		 */
> +		goto out;
> +
> +	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
> +	raw_inode = (struct ext4_inode *) (bh->b_data + offset);
> +	dtime = le32_to_cpu(raw_inode->i_dtime);
> +	now = get_seconds();
> +	if (!buffer_dirty(bh))
> +		/*
> +		 * Five seconds should be enough time for a block to be
> +		 * committed to the platter once it is sent to the HDD
> +		 */
> +		recentcy = 5;
> +
> +	if (dtime && (dtime < now) && (now < dtime + recentcy))
> +		ret = 1;
> +out:
> +	brelse(bh);
> +	return ret;
> +}
> +
> +/*
>   * There are two policies for allocating an inode.  If the new inode is
>   * a directory, then a forward search is made for a block group with both
>   * free space and a low directory-to-inode ratio; if that fails, then of
> @@ -741,6 +786,11 @@ repeat_in_this_group:
>  				   "inode=%lu", ino + 1);
>  			continue;
>  		}
> +		if ((EXT4_SB(sb)->s_journal == NULL) &&
> +		    recently_deleted(sb, group, ino)) {
> +			ino++;
> +			goto next_inode;
                             ^^^^^^^^^^^
                             next_ino;

                                                - Zheng
> +		}
>  		if (!handle) {
>  			BUG_ON(nblocks <= 0);
>  			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
> @@ -764,6 +814,7 @@ repeat_in_this_group:
>  		ino++;		/* the inode bitmap is zero-based */
>  		if (!ret2)
>  			goto got; /* we grabbed the inode! */
> +	next_ino:
>  		if (ino < EXT4_INODES_PER_GROUP(sb))
>  			goto repeat_in_this_group;
>  	next_group:
> -- 
> 1.7.12.rc0.22.gcdd159b
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - July 31, 2013, 10:12 p.m.
On Mon, Jul 29, 2013 at 03:32:31PM +0200, Jan Kara wrote:
>   I'd use dirty_expire_interval here so that we are at least tied to
> flusher thread timeout...

Makes sense, done.

> > +		/*
> > +		 * Five seconds should be enough time for a block to be
> > +		 * committed to the platter once it is sent to the HDD
> > +		 */
> > +		recentcy = 5;
>   This is completely ad-hoc and I cannot say anything about what value
> would be appropriate here. Jens told me disk can hold on sectors for
> *minutes* in their writeback caches when these blocks are unsuitably placed
> and there's enough streaming IO going on. So the question is what value do
> we want to base this on?

Yes, it is completely ad-hoc.  How long a disk will hold on to sectors
in their writeback caches really depends on its elevator algorithms;
so it is indeed completely a hueristic.  I will say though that the
workloads that allow a sector to be pinned for even seconds (let alone
minutes) are very artifical workloads and it's very unclear whether
it's realistic that they would exist on most normal productoin
servers.  The use cases for no journal mode is for places where
performance is critical, so we really don't want to send a CACHE FLUSH
command, which is really the only way you can be sure.

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Morton - Aug. 8, 2013, 12:03 a.m.
On Fri, 26 Jul 2013 15:32:43 -0400 "Theodore Ts'o" <tytso@mit.edu> wrote:

> In no journal mode, if an inode has recently been deleted, we
> shouldn't reuse it right away.  Otherwise it's possible, after an
> unclean shutdown, to hit a situation where a recently deleted inode
> gets reused for some other purpose before the inode table block has
> been written to disk.  However, if the directory entry has been
> updated, then the directory entry will be pointing at the old inode
> contents.

Sounds a bit hacky :(

> E2fsck will make sure the file system is consistent after the
> unclean shutdown.  However, if the recently deleted inode is a
> character mode device, or an inode with the immutable bit set, even
> after the file system has been fixed up by e2fsck, it can be
> possible for a *.pyc file to be pointing at a character mode
> device, and when python tries to open the *.pyc file, Hilarity
> Ensues.  We could change all of userspace to be very suspicious
> about stat'ing files before opening them, and clearing the
> immutable flag if necessary --- or we can just avoid reusing an
> inode number if it has been recently deleted.
> 
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -625,6 +625,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
>  }
>  
>  /*
> + * In no journal mode, if an inode has recently been deleted, we want
> + * to avoid reusing it until we're reasonably sure the inode table
> + * block has been written back to disk.
> + */
> +int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
> +{
> +	struct ext4_group_desc	*gdp;
> +	struct ext4_inode	*raw_inode;
> +	struct buffer_head	*bh;
> +	unsigned long		dtime, now;
> +	int	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> +	int	offset, ret = 0, recentcy = 30;

The version in linux-next is different from this.  it uses

+       int     offset, ret = 0, recentcy = dirty_expire_interval;

which breaks the build because dirty_expire_interval isn't exported to
modules.  Good luck fixing this without adding a bisection hole :(

Also, it's spelled "recency"!

What's the rationale for using this anyway?  Seems a bit arbitrary. 
Wouldn't using the ext4 commit interval be more appropriate?


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 5b8e22e..7d5ac66 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -625,6 +625,51 @@  static int find_group_other(struct super_block *sb, struct inode *parent,
 }
 
 /*
+ * In no journal mode, if an inode has recently been deleted, we want
+ * to avoid reusing it until we're reasonably sure the inode table
+ * block has been written back to disk.
+ */
+int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
+{
+	struct ext4_group_desc	*gdp;
+	struct ext4_inode	*raw_inode;
+	struct buffer_head	*bh;
+	unsigned long		dtime, now;
+	int	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	int	offset, ret = 0, recentcy = 30;
+
+	gdp = ext4_get_group_desc(sb, group, NULL);
+	if (unlikely(!gdp))
+		return 0;
+
+	bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
+		       (ino / inodes_per_block));
+	if (unlikely(!bh) || !buffer_uptodate(bh))
+		/*
+		 * If the block is not in the buffer head, then it
+		 * must have been written out.
+		 */
+		goto out;
+
+	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
+	raw_inode = (struct ext4_inode *) (bh->b_data + offset);
+	dtime = le32_to_cpu(raw_inode->i_dtime);
+	now = get_seconds();
+	if (!buffer_dirty(bh))
+		/*
+		 * Five seconds should be enough time for a block to be
+		 * committed to the platter once it is sent to the HDD
+		 */
+		recentcy = 5;
+
+	if (dtime && (dtime < now) && (now < dtime + recentcy))
+		ret = 1;
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
  * There are two policies for allocating an inode.  If the new inode is
  * a directory, then a forward search is made for a block group with both
  * free space and a low directory-to-inode ratio; if that fails, then of
@@ -741,6 +786,11 @@  repeat_in_this_group:
 				   "inode=%lu", ino + 1);
 			continue;
 		}
+		if ((EXT4_SB(sb)->s_journal == NULL) &&
+		    recently_deleted(sb, group, ino)) {
+			ino++;
+			goto next_inode;
+		}
 		if (!handle) {
 			BUG_ON(nblocks <= 0);
 			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
@@ -764,6 +814,7 @@  repeat_in_this_group:
 		ino++;		/* the inode bitmap is zero-based */
 		if (!ret2)
 			goto got; /* we grabbed the inode! */
+	next_ino:
 		if (ino < EXT4_INODES_PER_GROUP(sb))
 			goto repeat_in_this_group;
 	next_group: