Patchwork [RFC,3/3] ext4: use the O_HOT and O_COLD open flags to influence inode allocation

login
register
mail settings
Submitter Theodore Ts'o
Date April 19, 2012, 7:20 p.m.
Message ID <1334863211-19504-4-git-send-email-tytso@mit.edu>
Download mbox | patch
Permalink /patch/153855/
State Not Applicable
Headers show

Comments

Theodore Ts'o - April 19, 2012, 7:20 p.m.
Wire up the use of the O_HOT and O_COLD open flags so that when an
inode is being created, it can influence which part of the disk gets
used on rotational storage devices.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |    8 +++++++-
 fs/ext4/ialloc.c  |   33 +++++++++++++++++++++++++++------
 fs/ext4/migrate.c |    2 +-
 fs/ext4/namei.c   |   15 +++++++++++----
 4 files changed, 46 insertions(+), 12 deletions(-)
Eric Sandeen - April 19, 2012, 7:45 p.m.
On 4/19/12 2:20 PM, Theodore Ts'o wrote:
> Wire up the use of the O_HOT and O_COLD open flags so that when an
> inode is being created, it can influence which part of the disk gets
> used on rotational storage devices.

I'm curious to know how this will work for example on a linear device
made up of rotational devices (possibly a concat of raids, etc).

At least for dm, it will be still marked as rotational,
but the relative speed of regions of the linear device can't be inferred
from the offset within the device.

Do we really have enough information about the storage under us to
know what parts are "fast" and what parts are "slow?"

-Eric

> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
>  fs/ext4/ext4.h    |    8 +++++++-
>  fs/ext4/ialloc.c  |   33 +++++++++++++++++++++++++++------
>  fs/ext4/migrate.c |    2 +-
>  fs/ext4/namei.c   |   15 +++++++++++----
>  4 files changed, 46 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 0e01e90..6539c9a 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1553,6 +1553,12 @@ struct ext4_dir_entry_2 {
>  #define EXT4_MAX_REC_LEN		((1<<16)-1)
>  
>  /*
> + * Flags for ext4_new_inode()
> + */
> +#define EXT4_NEWI_HOT	0x0001
> +#define EXT4_NEWI_COLD	0x0002
> +
> +/*
>   * If we ever get support for fs block sizes > page_size, we'll need
>   * to remove the #if statements in the next two functions...
>   */
> @@ -1850,7 +1856,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct
>  /* ialloc.c */
>  extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t,
>  				    const struct qstr *qstr, __u32 goal,
> -				    uid_t *owner);
> +				    uid_t *owner, int flags);
>  extern void ext4_free_inode(handle_t *, struct inode *);
>  extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
>  extern unsigned long ext4_count_free_inodes(struct super_block *);
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 409c2ee..3dcc8c8 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -363,7 +363,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
>  
>  static int find_group_orlov(struct super_block *sb, struct inode *parent,
>  			    ext4_group_t *group, umode_t mode,
> -			    const struct qstr *qstr)
> +			    const struct qstr *qstr, int flags)
>  {
>  	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -508,13 +508,20 @@ fallback_retry:
>  }
>  
>  static int find_group_other(struct super_block *sb, struct inode *parent,
> -			    ext4_group_t *group, umode_t mode)
> +			    ext4_group_t *group, umode_t mode, int flags)
>  {
>  	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
>  	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
>  	struct ext4_group_desc *desc;
>  	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
>  
> +	if ((flags & EXT4_NEWI_HOT) && (ngroups > 3) &&
> +	    (parent_group > ngroups / 3))
> +		parent_group = 0;
> +	if ((flags & EXT4_NEWI_COLD) && (ngroups > 3) &&
> +	    (parent_group < (2 * (ngroups / 3))))
> +		parent_group = 2 * (ngroups / 3);
> +
>  	/*
>  	 * Try to place the inode is the same flex group as its
>  	 * parent.  If we can't find space, use the Orlov algorithm to
> @@ -550,7 +557,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
>  		*group = parent_group + flex_size;
>  		if (*group > ngroups)
>  			*group = 0;
> -		return find_group_orlov(sb, parent, group, mode, NULL);
> +		return find_group_orlov(sb, parent, group, mode, NULL, flags);
>  	}
>  
>  	/*
> @@ -614,7 +621,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
>   * group to find a free inode.
>   */
>  struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
> -			     const struct qstr *qstr, __u32 goal, uid_t *owner)
> +			     const struct qstr *qstr, __u32 goal, uid_t *owner,
> +			     int flags)
>  {
>  	struct super_block *sb;
>  	struct buffer_head *inode_bitmap_bh = NULL;
> @@ -643,6 +651,19 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
>  	ei = EXT4_I(inode);
>  	sbi = EXT4_SB(sb);
>  
> +	if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
> +		flags &= ~(EXT4_NEWI_HOT | EXT4_NEWI_COLD);
> +
> +	/* 
> +	 * We will only allow the HOT flag if the user passes the
> +	 * reserved uid/gid check, or if she has CAP_SYS_RESOURCE
> +	 */
> +	if ((flags & EXT4_NEWI_HOT) && 
> +	    !(sbi->s_resuid == current_fsuid() ||
> +	      ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
> +	      capable(CAP_SYS_RESOURCE)))
> +		flags &= ~EXT4_NEWI_HOT;
> +
>  	if (!goal)
>  		goal = sbi->s_inode_goal;
>  
> @@ -654,9 +675,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
>  	}
>  
>  	if (S_ISDIR(mode))
> -		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
> +		ret2 = find_group_orlov(sb, dir, &group, mode, qstr, flags);
>  	else
> -		ret2 = find_group_other(sb, dir, &group, mode);
> +		ret2 = find_group_other(sb, dir, &group, mode, flags);
>  
>  got_group:
>  	EXT4_I(dir)->i_last_alloc_group = group;
> diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
> index f39f80f..2b3d65c 100644
> --- a/fs/ext4/migrate.c
> +++ b/fs/ext4/migrate.c
> @@ -469,7 +469,7 @@ int ext4_ext_migrate(struct inode *inode)
>  	owner[0] = inode->i_uid;
>  	owner[1] = inode->i_gid;
>  	tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
> -				   S_IFREG, NULL, goal, owner);
> +				   S_IFREG, NULL, goal, owner, 0);
>  	if (IS_ERR(tmp_inode)) {
>  		retval = PTR_ERR(tmp_inode);
>  		ext4_journal_stop(handle);
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 6f48ff8..222a419 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -1742,6 +1742,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
>  	handle_t *handle;
>  	struct inode *inode;
>  	int err, retries = 0;
> +	int flags = 0;
>  
>  	dquot_initialize(dir);
>  
> @@ -1755,7 +1756,13 @@ retry:
>  	if (IS_DIRSYNC(dir))
>  		ext4_handle_sync(handle);
>  
> -	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
> +	if (op && op->open_flag & O_HOT)
> +		flags |= EXT4_NEWI_HOT;
> +	if (op && op->open_flag & O_COLD)
> +		flags |= EXT4_NEWI_COLD;
> +
> +	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0,
> +			       NULL, flags);
>  	err = PTR_ERR(inode);
>  	if (!IS_ERR(inode)) {
>  		inode->i_op = &ext4_file_inode_operations;
> @@ -1791,7 +1798,7 @@ retry:
>  	if (IS_DIRSYNC(dir))
>  		ext4_handle_sync(handle);
>  
> -	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
> +	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL, 0);
>  	err = PTR_ERR(inode);
>  	if (!IS_ERR(inode)) {
>  		init_special_inode(inode, inode->i_mode, rdev);
> @@ -1831,7 +1838,7 @@ retry:
>  		ext4_handle_sync(handle);
>  
>  	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
> -			       &dentry->d_name, 0, NULL);
> +			       &dentry->d_name, 0, NULL, 0);
>  	err = PTR_ERR(inode);
>  	if (IS_ERR(inode))
>  		goto out_stop;
> @@ -2278,7 +2285,7 @@ retry:
>  		ext4_handle_sync(handle);
>  
>  	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
> -			       &dentry->d_name, 0, NULL);
> +			       &dentry->d_name, 0, NULL, 0);
>  	err = PTR_ERR(inode);
>  	if (IS_ERR(inode))
>  		goto out_stop;

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - April 19, 2012, 7:59 p.m.
On Thu, Apr 19, 2012 at 02:45:28PM -0500, Eric Sandeen wrote:
> 
> I'm curious to know how this will work for example on a linear device
> made up of rotational devices (possibly a concat of raids, etc).
> 
> At least for dm, it will be still marked as rotational,
> but the relative speed of regions of the linear device can't be inferred
> from the offset within the device.

Hmm, good point.  We need a way to determine whether this is some kind
of glued-together dm thing versus a plain-old HDD.

> Do we really have enough information about the storage under us to
> know what parts are "fast" and what parts are "slow?"

Well, plain and simple HDD's are still quite common; not everyone
drops in an intermediate dm layer.  I view dm as being similar to
enterprise storage arrays where we will need to pass down an explicit
hint with block ranges down to the storage device.  However, it's
going to be a long time before we get that part of the interface
plumbed in.

In the meantime, it would be nice if we had something that worked in
the common case of plain old stupid HDD's --- we just need a way of
determining that's what we are dealing with.

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger - April 19, 2012, 10:55 p.m.
On 2012-04-19, at 1:59 PM, Ted Ts'o wrote:
> On Thu, Apr 19, 2012 at 02:45:28PM -0500, Eric Sandeen wrote:
>> 
>> I'm curious to know how this will work for example on a linear device
>> made up of rotational devices (possibly a concat of raids, etc).
>> 
>> At least for dm, it will be still marked as rotational,
>> but the relative speed of regions of the linear device can't be inferred from the offset within the device.
> 
> Hmm, good point.  We need a way to determine whether this is some kind
> of glued-together dm thing versus a plain-old HDD.

I would posit that in a majority of cases that low-address blocks
are much more likely to be "fast" than high-address blocks.  This
is true for RAID-0,1,5,6, most LVs built atop those devices (since
they are allocated from low-to-high offset order).

It is true that some less common configurations (the above dm-concat)
may not follow this rule, but in that case the filesystem is not
worse off compared to not having this information at all.

>> Do we really have enough information about the storage under us to
>> know what parts are "fast" and what parts are "slow?"
> 
> Well, plain and simple HDD's are still quite common; not everyone
> drops in an intermediate dm layer.  I view dm as being similar to
> enterprise storage arrays where we will need to pass down an explicit
> hint with block ranges down to the storage device.  However, it's
> going to be a long time before we get that part of the interface
> plumbed in.
> 
> In the meantime, it would be nice if we had something that worked in
> the common case of plain old stupid HDD's --- we just need a way of
> determining that's what we are dealing with.

Also, if the admin knows (or can control) what these hints mean, then
they can configure the storage explicitly to match the usage.  I've
long been a proponent of configuring LVs with hybrid SSD+HDD storage,
so that ext4 can allocate inodes + directories on the SSD part of each
flex_bg, and files on the RAID-6 part of the flex_bg.  This kind of
API would allow files to be hinted similarly.

While having flexible kernel APIs that allowed the upper layers to
understand the underlying layout would be great, I also don't imagine
that this will arrive any time soon.  It will also take userspace and
application support to be able to leverage that, and we have to start
somewhere.

Cheers, Andreas
--
Andreas Dilger                       Whamcloud, Inc.
Principal Lustre Engineer            http://www.whamcloud.com/




--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner - April 19, 2012, 11:27 p.m.
On Thu, Apr 19, 2012 at 03:20:11PM -0400, Theodore Ts'o wrote:
> Wire up the use of the O_HOT and O_COLD open flags so that when an
> inode is being created, it can influence which part of the disk gets
> used on rotational storage devices.
.....
> @@ -508,13 +508,20 @@ fallback_retry:
>  }
>  
>  static int find_group_other(struct super_block *sb, struct inode *parent,
> -			    ext4_group_t *group, umode_t mode)
> +			    ext4_group_t *group, umode_t mode, int flags)
>  {
>  	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
>  	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
>  	struct ext4_group_desc *desc;
>  	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
>  
> +	if ((flags & EXT4_NEWI_HOT) && (ngroups > 3) &&
> +	    (parent_group > ngroups / 3))
> +		parent_group = 0;
> +	if ((flags & EXT4_NEWI_COLD) && (ngroups > 3) &&
> +	    (parent_group < (2 * (ngroups / 3))))
> +		parent_group = 2 * (ngroups / 3);
> +

So you're assuming that locating the inodes somewhere "hot" is going
to improve performance. So say an application has a "hot" file (say
an index file) but still has a lot of other files it creates and
reads, and they are all in the same directory.

If the index file is created "hot", then it is going to be placed a
long way away from all the other files that application is using,
and every time you access the hot file you now seek away to a
different location on disk. The net result: the application goes
slower because average seek times have increased.

Essentially, an application is going to have to claim all files it
is working on at any point in time are either hot, normal or cold,
otherwise it is going to seek between hot, normal and cold regions
all the time. That's going to increase average seek times compared
to having all the files in the same general location, hot, cold or
otherwise.

Note: I'm not saying that O_HOT/O_COLD is a bad idea, just that it's
going to be hard to implement in a way that behaves consistently in a
way that users would expect - i.e. improves performance.  IMO,
unless you have tiered storage and knowledge of the underlying block
device characteristics, then HOT/COLD are going to be very difficult
to implement sanely....

Cheers,

Dave.
Theodore Ts'o - April 20, 2012, 2:26 a.m.
On Fri, Apr 20, 2012 at 09:27:57AM +1000, Dave Chinner wrote:
> So you're assuming that locating the inodes somewhere "hot" is going
> to improve performance. So say an application has a "hot" file (say
> an index file) but still has a lot of other files it creates and
> reads, and they are all in the same directory.
> 
> If the index file is created "hot", then it is going to be placed a
> long way away from all the other files that application is using,
> and every time you access the hot file you now seek away to a
> different location on disk. The net result: the application goes
> slower because average seek times have increased.

Well, let's assume the application is using all or most of the disk,
so the objects it is fetching from the 2T disk are randomly
distributed throughout the disk.  Short seeks are faster, yes, but the
seek time as a function of the seek distance is decidedly non-linear,
with a sharp "knee" in the curve at around 10-15% of a full-stroke
seek.  (Ref:
http://static.usenix.org/event/fast05/tech/schlosser/schlosser.pdf)

So most of the time, as you seek back and forth fetching data objects,
you will be incurring 75-85% of the cost of a
worst-case seek anyway.  So seeking *is* going to be a fact of life
that we can't run away from.

Given that, the question then is whether we are better off (a) putting
the index files in the exact middle of the disk, trying to minimize
seeks, (b) scattering the index files all over the disk randomly, or
(c) concentrating the index files near the beginning of the disk?
Given the non-linear seek times, it seems to suggest that (c) would
probably be the best case for this use case.

Note that when we short-stroke, it's not just a matter of minimizing
seek distances; if it were, then it wouldn't matter if we used the
first third of the disk closest to the outer edge, or the last third
of the disk closer to the inner part of the disk.

Granted this may be a relatively small effect compared to the huge
wins of placing your data according to its usage frequency on tiered
storage.  But the effect should still be there.

Cheers,

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner - April 21, 2012, 12:57 a.m.
On Thu, Apr 19, 2012 at 10:26:06PM -0400, Ted Ts'o wrote:
> On Fri, Apr 20, 2012 at 09:27:57AM +1000, Dave Chinner wrote:
> > So you're assuming that locating the inodes somewhere "hot" is going
> > to improve performance. So say an application has a "hot" file (say
> > an index file) but still has a lot of other files it creates and
> > reads, and they are all in the same directory.
> > 
> > If the index file is created "hot", then it is going to be placed a
> > long way away from all the other files that application is using,
> > and every time you access the hot file you now seek away to a
> > different location on disk. The net result: the application goes
> > slower because average seek times have increased.
> 
> Well, let's assume the application is using all or most of the disk,
> so the objects it is fetching from the 2T disk are randomly
> distributed throughout the disk.

Which is so far from most people's reality that it is not worth
considering. 

> Short seeks are faster, yes, but the
> seek time as a function of the seek distance is decidedly non-linear,
> with a sharp "knee" in the curve at around 10-15% of a full-stroke
> seek.  (Ref:
> http://static.usenix.org/event/fast05/tech/schlosser/schlosser.pdf)
> 
> So most of the time, as you seek back and forth fetching data objects,
> most of the time you will be incurring 75-85% of the cost of a
> worst-case seek anyway.  So seeking *is* going to be a fact of life
> that we can't run away from.
> 
> Given that, the question then is whether we are better off (a) putting
> the index files in the exact middle of the disk, trying to minimize
> seeks, (b) scattering the index files all over the disk randomly, or
> (c) concentrating the index files near the beginning of the disk?
> Given the non-linear seek times, it seems to suggest that (c) would
> probably be the best case for this use case.

I disagree - based on that paper, you're better off putting all the
related application data in the same place, and hoping it all fits
in that 10-15% minimal seek time region....

Besides, you missed my point - that it is trivial to come up with
examples of what application writers think are their hot/cold/normal
data whose optimal layout bears no resemblance to your proposed
hot/cold/normal inode layout.  That's the fundamental problem here,
there is no obvious definition of HOT/COLD, and that the best
implementation depends on how the application uses those flags
combined with the characteristics of the underlying storage. IOW,
however you optimise it for a single spindle, a large percentage of
the time it is going to be detrimental to performance, not improve
it....

Cheers,

Dave.

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0e01e90..6539c9a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1553,6 +1553,12 @@  struct ext4_dir_entry_2 {
 #define EXT4_MAX_REC_LEN		((1<<16)-1)
 
 /*
+ * Flags for ext4_new_inode()
+ */
+#define EXT4_NEWI_HOT	0x0001
+#define EXT4_NEWI_COLD	0x0002
+
+/*
  * If we ever get support for fs block sizes > page_size, we'll need
  * to remove the #if statements in the next two functions...
  */
@@ -1850,7 +1856,7 @@  extern int ext4fs_dirhash(const char *name, int len, struct
 /* ialloc.c */
 extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t,
 				    const struct qstr *qstr, __u32 goal,
-				    uid_t *owner);
+				    uid_t *owner, int flags);
 extern void ext4_free_inode(handle_t *, struct inode *);
 extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 409c2ee..3dcc8c8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -363,7 +363,7 @@  static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 
 static int find_group_orlov(struct super_block *sb, struct inode *parent,
 			    ext4_group_t *group, umode_t mode,
-			    const struct qstr *qstr)
+			    const struct qstr *qstr, int flags)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -508,13 +508,20 @@  fallback_retry:
 }
 
 static int find_group_other(struct super_block *sb, struct inode *parent,
-			    ext4_group_t *group, umode_t mode)
+			    ext4_group_t *group, umode_t mode, int flags)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
 	struct ext4_group_desc *desc;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
 
+	if ((flags & EXT4_NEWI_HOT) && (ngroups > 3) &&
+	    (parent_group > ngroups / 3))
+		parent_group = 0;
+	if ((flags & EXT4_NEWI_COLD) && (ngroups > 3) &&
+	    (parent_group < (2 * (ngroups / 3))))
+		parent_group = 2 * (ngroups / 3);
+
 	/*
 	 * Try to place the inode is the same flex group as its
 	 * parent.  If we can't find space, use the Orlov algorithm to
@@ -550,7 +557,7 @@  static int find_group_other(struct super_block *sb, struct inode *parent,
 		*group = parent_group + flex_size;
 		if (*group > ngroups)
 			*group = 0;
-		return find_group_orlov(sb, parent, group, mode, NULL);
+		return find_group_orlov(sb, parent, group, mode, NULL, flags);
 	}
 
 	/*
@@ -614,7 +621,8 @@  static int find_group_other(struct super_block *sb, struct inode *parent,
  * group to find a free inode.
  */
 struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
-			     const struct qstr *qstr, __u32 goal, uid_t *owner)
+			     const struct qstr *qstr, __u32 goal, uid_t *owner,
+			     int flags)
 {
 	struct super_block *sb;
 	struct buffer_head *inode_bitmap_bh = NULL;
@@ -643,6 +651,19 @@  struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
 	ei = EXT4_I(inode);
 	sbi = EXT4_SB(sb);
 
+	if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
+		flags &= ~(EXT4_NEWI_HOT | EXT4_NEWI_COLD);
+
+	/* 
+	 * We will only allow the HOT flag if the user passes the
+	 * reserved uid/gid check, or if she has CAP_SYS_RESOURCE
+	 */
+	if ((flags & EXT4_NEWI_HOT) && 
+	    !(sbi->s_resuid == current_fsuid() ||
+	      ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
+	      capable(CAP_SYS_RESOURCE)))
+		flags &= ~EXT4_NEWI_HOT;
+
 	if (!goal)
 		goal = sbi->s_inode_goal;
 
@@ -654,9 +675,9 @@  struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
 	}
 
 	if (S_ISDIR(mode))
-		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
+		ret2 = find_group_orlov(sb, dir, &group, mode, qstr, flags);
 	else
-		ret2 = find_group_other(sb, dir, &group, mode);
+		ret2 = find_group_other(sb, dir, &group, mode, flags);
 
 got_group:
 	EXT4_I(dir)->i_last_alloc_group = group;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f39f80f..2b3d65c 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -469,7 +469,7 @@  int ext4_ext_migrate(struct inode *inode)
 	owner[0] = inode->i_uid;
 	owner[1] = inode->i_gid;
 	tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
-				   S_IFREG, NULL, goal, owner);
+				   S_IFREG, NULL, goal, owner, 0);
 	if (IS_ERR(tmp_inode)) {
 		retval = PTR_ERR(tmp_inode);
 		ext4_journal_stop(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6f48ff8..222a419 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1742,6 +1742,7 @@  static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	handle_t *handle;
 	struct inode *inode;
 	int err, retries = 0;
+	int flags = 0;
 
 	dquot_initialize(dir);
 
@@ -1755,7 +1756,13 @@  retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+	if (op && op->open_flag & O_HOT)
+		flags |= EXT4_NEWI_HOT;
+	if (op && op->open_flag & O_COLD)
+		flags |= EXT4_NEWI_COLD;
+
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0,
+			       NULL, flags);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ext4_file_inode_operations;
@@ -1791,7 +1798,7 @@  retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL, 0);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
@@ -1831,7 +1838,7 @@  retry:
 		ext4_handle_sync(handle);
 
 	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
-			       &dentry->d_name, 0, NULL);
+			       &dentry->d_name, 0, NULL, 0);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -2278,7 +2285,7 @@  retry:
 		ext4_handle_sync(handle);
 
 	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
-			       &dentry->d_name, 0, NULL);
+			       &dentry->d_name, 0, NULL, 0);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;