Message ID | 1334863211-19504-4-git-send-email-tytso@mit.edu |
---|---|
State | Not Applicable, archived |
Headers | show |
On 4/19/12 2:20 PM, Theodore Ts'o wrote: > Wire up the use of the O_HOT and O_COLD open flags so that when an > inode is being created, it can influence which part of the disk gets > used on rotational storage devices. I'm curious to know how this will work for example on a linear device make up of rotational devices (possibly a concat of raids, etc). At least for dm, it will be still marked as rotational, but the relative speed of regions of the linear device can't be inferred from the offset within the device. Do we really have enough information about the storage under us to know what parts are "fast" and what parts are "slow?" -Eric > Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> > --- > fs/ext4/ext4.h | 8 +++++++- > fs/ext4/ialloc.c | 33 +++++++++++++++++++++++++++------ > fs/ext4/migrate.c | 2 +- > fs/ext4/namei.c | 15 +++++++++++---- > 4 files changed, 46 insertions(+), 12 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 0e01e90..6539c9a 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1553,6 +1553,12 @@ struct ext4_dir_entry_2 { > #define EXT4_MAX_REC_LEN ((1<<16)-1) > > /* > + * Flags for ext4_new_inode() > + */ > +#define EXT4_NEWI_HOT 0x0001 > +#define EXT4_NEWI_COLD 0x0002 > + > +/* > * If we ever get support for fs block sizes > page_size, we'll need > * to remove the #if statements in the next two functions... 
> */ > @@ -1850,7 +1856,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct > /* ialloc.c */ > extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t, > const struct qstr *qstr, __u32 goal, > - uid_t *owner); > + uid_t *owner, int flags); > extern void ext4_free_inode(handle_t *, struct inode *); > extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); > extern unsigned long ext4_count_free_inodes(struct super_block *); > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c > index 409c2ee..3dcc8c8 100644 > --- a/fs/ext4/ialloc.c > +++ b/fs/ext4/ialloc.c > @@ -363,7 +363,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, > > static int find_group_orlov(struct super_block *sb, struct inode *parent, > ext4_group_t *group, umode_t mode, > - const struct qstr *qstr) > + const struct qstr *qstr, int flags) > { > ext4_group_t parent_group = EXT4_I(parent)->i_block_group; > struct ext4_sb_info *sbi = EXT4_SB(sb); > @@ -508,13 +508,20 @@ fallback_retry: > } > > static int find_group_other(struct super_block *sb, struct inode *parent, > - ext4_group_t *group, umode_t mode) > + ext4_group_t *group, umode_t mode, int flags) > { > ext4_group_t parent_group = EXT4_I(parent)->i_block_group; > ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); > struct ext4_group_desc *desc; > int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); > > + if ((flags & EXT4_NEWI_HOT) && (ngroups > 3) && > + (parent_group > ngroups / 3)) > + parent_group = 0; > + if ((flags & EXT4_NEWI_COLD) && (ngroups > 3) && > + (parent_group < (2 * (ngroups / 3)))) > + parent_group = 2 * (ngroups / 3); > + > /* > * Try to place the inode is the same flex group as its > * parent. 
If we can't find space, use the Orlov algorithm to > @@ -550,7 +557,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, > *group = parent_group + flex_size; > if (*group > ngroups) > *group = 0; > - return find_group_orlov(sb, parent, group, mode, NULL); > + return find_group_orlov(sb, parent, group, mode, NULL, flags); > } > > /* > @@ -614,7 +621,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, > * group to find a free inode. > */ > struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, > - const struct qstr *qstr, __u32 goal, uid_t *owner) > + const struct qstr *qstr, __u32 goal, uid_t *owner, > + int flags) > { > struct super_block *sb; > struct buffer_head *inode_bitmap_bh = NULL; > @@ -643,6 +651,19 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, > ei = EXT4_I(inode); > sbi = EXT4_SB(sb); > > + if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) > + flags &= ~(EXT4_NEWI_HOT | EXT4_NEWI_COLD); > + > + /* > + * We will only allow the HOT flag if the user passes the > + * reserved uid/gid check, or if she has CAP_SYS_RESOURCE > + */ > + if ((flags & EXT4_NEWI_HOT) && > + !(sbi->s_resuid == current_fsuid() || > + ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || > + capable(CAP_SYS_RESOURCE))) > + flags &= ~EXT4_NEWI_HOT; > + > if (!goal) > goal = sbi->s_inode_goal; > > @@ -654,9 +675,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, > } > > if (S_ISDIR(mode)) > - ret2 = find_group_orlov(sb, dir, &group, mode, qstr); > + ret2 = find_group_orlov(sb, dir, &group, mode, qstr, flags); > else > - ret2 = find_group_other(sb, dir, &group, mode); > + ret2 = find_group_other(sb, dir, &group, mode, flags); > > got_group: > EXT4_I(dir)->i_last_alloc_group = group; > diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c > index f39f80f..2b3d65c 100644 > --- a/fs/ext4/migrate.c > +++ b/fs/ext4/migrate.c > @@ -469,7 +469,7 @@ 
int ext4_ext_migrate(struct inode *inode) > owner[0] = inode->i_uid; > owner[1] = inode->i_gid; > tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, > - S_IFREG, NULL, goal, owner); > + S_IFREG, NULL, goal, owner, 0); > if (IS_ERR(tmp_inode)) { > retval = PTR_ERR(tmp_inode); > ext4_journal_stop(handle); > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c > index 6f48ff8..222a419 100644 > --- a/fs/ext4/namei.c > +++ b/fs/ext4/namei.c > @@ -1742,6 +1742,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, > handle_t *handle; > struct inode *inode; > int err, retries = 0; > + int flags = 0; > > dquot_initialize(dir); > > @@ -1755,7 +1756,13 @@ retry: > if (IS_DIRSYNC(dir)) > ext4_handle_sync(handle); > > - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); > + if (op && op->open_flag & O_HOT) > + flags |= EXT4_NEWI_HOT; > + if (op && op->open_flag & O_COLD) > + flags |= EXT4_NEWI_COLD; > + > + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, > + NULL, flags); > err = PTR_ERR(inode); > if (!IS_ERR(inode)) { > inode->i_op = &ext4_file_inode_operations; > @@ -1791,7 +1798,7 @@ retry: > if (IS_DIRSYNC(dir)) > ext4_handle_sync(handle); > > - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); > + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL, 0); > err = PTR_ERR(inode); > if (!IS_ERR(inode)) { > init_special_inode(inode, inode->i_mode, rdev); > @@ -1831,7 +1838,7 @@ retry: > ext4_handle_sync(handle); > > inode = ext4_new_inode(handle, dir, S_IFDIR | mode, > - &dentry->d_name, 0, NULL); > + &dentry->d_name, 0, NULL, 0); > err = PTR_ERR(inode); > if (IS_ERR(inode)) > goto out_stop; > @@ -2278,7 +2285,7 @@ retry: > ext4_handle_sync(handle); > > inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, > - &dentry->d_name, 0, NULL); > + &dentry->d_name, 0, NULL, 0); > err = PTR_ERR(inode); > if (IS_ERR(inode)) > goto out_stop; -- To unsubscribe from this list: send the 
line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Apr 19, 2012 at 02:45:28PM -0500, Eric Sandeen wrote: > > I'm curious to know how this will work for example on a linear device > make up of rotational devices (possibly a concat of raids, etc). > > At least for dm, it will be still marked as rotational, > but the relative speed of regions of the linear device can't be inferred > from the offset within the device. Hmm, good point. We need a way to determine whether this is some kind of glued-together dm thing versus a plain-old HDD. > Do we really have enough information about the storage under us to > know what parts are "fast" and what parts are "slow?" Well, plain and simple HDD's are still quite common; not everyone drops in an intermediate dm layer. I view dm as being similar to enterprise storage arrays where we will need to pass down an explicit hint with block ranges down to the storage device. However, it's going to be a long time before we get that part of the interface plumbed in. In the meantime, it would be nice if we had something that worked in the common case of plain old stupid HDD's --- we just need a way of determining that's what we are dealing with. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2012-04-19, at 1:59 PM, Ted Ts'o wrote: > On Thu, Apr 19, 2012 at 02:45:28PM -0500, Eric Sandeen wrote: >> >> I'm curious to know how this will work for example on a linear device >> make up of rotational devices (possibly a concat of raids, etc). >> >> At least for dm, it will be still marked as rotational, >> but the relative speed of regions of the linear device can't be inferred from the offset within the device. > > Hmm, good point. We need a way to determine whether this is some kind > of glued-together dm thing versus a plain-old HDD. I would posit that in a majority of cases that low-address blocks are much more likely to be "fast" than high-address blocks. This is true for RAID-0,1,5,6, most LVs built atop those devices (since they are allocated from low-to-high offset order). It is true that some less common configurations (the above dm-concat) may not follow this rule, but in that case the filesystem is not worse off compared to not having this information at all. >> Do we really have enough information about the storage under us to >> know what parts are "fast" and what parts are "slow?" > > Well, plain and simple HDD's are still quite common; not everyone > drops in an intermediate dm layer. I view dm as being similar to > enterprise storage arrays where we will need to pass down an explicit > hint with block ranges down to the storage device. However, it's > going to be a long time before we get that part of the interface > plumbed in. > > In the meantime, it would be nice if we had something that worked in > the common case of plain old stupid HDD's --- we just need a way of > determining that's what we are dealing with. Also, if the admin knows (or can control) what these hints mean, then they can configure the storage explicitly to match the usage. I've long been a proponent of configuring LVs with hybrid SSD+HDD storage, so that ext4 can allocate inodes + directories on the SSD part of each flex_bg, and files on the RAID-6 part of the flex_bg. 
This kind of API would allow files to be hinted similarly. While having flexible kernel APIs that allowed the upper layers to understand the underlying layout would be great, I also don't imagine that this will arrive any time soon. It will also take userspace and application support to be able to leverage that, and we have to start somewhere. Cheers, Andreas -- Andreas Dilger Whamcloud, Inc. Principal Lustre Engineer http://www.whamcloud.com/ -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Apr 19, 2012 at 03:20:11PM -0400, Theodore Ts'o wrote: > Wire up the use of the O_HOT and O_COLD open flags so that when an > inode is being created, it can influence which part of the disk gets > used on rotational storage devices. ..... > @@ -508,13 +508,20 @@ fallback_retry: > } > > static int find_group_other(struct super_block *sb, struct inode *parent, > - ext4_group_t *group, umode_t mode) > + ext4_group_t *group, umode_t mode, int flags) > { > ext4_group_t parent_group = EXT4_I(parent)->i_block_group; > ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); > struct ext4_group_desc *desc; > int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); > > + if ((flags & EXT4_NEWI_HOT) && (ngroups > 3) && > + (parent_group > ngroups / 3)) > + parent_group = 0; > + if ((flags & EXT4_NEWI_COLD) && (ngroups > 3) && > + (parent_group < (2 * (ngroups / 3)))) > + parent_group = 2 * (ngroups / 3); > + So you're assuming that locating the inodes somewhere "hot" is going to improve performance. So say an application has a "hot" file (say an index file) but still has a lot of other files it creates and reads, and they are all in the same directory. If the index file is created "hot", then it is going to be placed a long way away from all the other files that application is using, and every time you access the hot file you now seek away to a different location on disk. The net result: the application goes slower because average seek times have increased. Essentially, an application is going to have to claim all files it is working on at any point in time are either hot, normal or cold, otherwise it is going to seek between hot, normal and cold regions all the time. That's going to increase average seek times compared to having all the files in the same general location, hot, cold or otherwise. Note: I'm not saying that O_HOT/O_COLD is a bad idea, just that it's going to be hard to implement in a way that behaves consistently in a way that users would expect - i.e.
improves performance. IMO, unless you have tiered storage and knowledge of the underlying block device characteristics, then HOT/COLD are going to be very difficult to implement sanely.... Cheers, Dave.
On Fri, Apr 20, 2012 at 09:27:57AM +1000, Dave Chinner wrote: > So you're assuming that locating the inodes somewhere "hot" is going > to improve performance. So say an application has a "hot" file (say > an index file) but still has a lot of other files it creates and > reads, and they are all in the same directory. > > If the index file is created "hot", then it is going to be placed a > long way away from all the other files that applciation is using, > and every time you access the hot file you now seek away to a > different location on disk. The net result: the application goes > slower because average seek times have increased. Well, let's assume the application is using all or most of the disk, so the objects it is fetching from the 2T disk are randomly distributed throughout the disk. Short seeks are faster, yes, but the seek time as a function of the seek distance is decidedly non-linear, with a sharp "knee" in the curve at around 10-15% of a full-stroke seek. (Ref: http://static.usenix.org/event/fast05/tech/schlosser/schlosser.pdf) So most of the time, as you seek back and forth fetching data objects, most of the time you will be incurring 75-85% of the cost of a worst-case seek anyway. So seeking *is* going to be a fact of life that we can't run away from that. Given that, the question then is whether we are better off (a) putting the index files in the exact middle of the disk, trying to minimize seeks, (b) scattering the index files all over the disk randomly, or (c) concentrating the index files near the beginning of the disk? Given the non-linear seek times, it seems to suggest that (c) would probably be the best case for this use case. Note that when we short-stroke, it's not just a matter of minimizing seek distances; if it were, then it wouldn't matter if we used the first third of the disk closest to the outer edge, or the last third of the disk closer to the inner part of the disk. 
Granted this may be a relatively small effect compared to the huge wins of placing your data according to its usage frequency on tiered storage. But the effect should still be there. Cheers, - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Apr 19, 2012 at 10:26:06PM -0400, Ted Ts'o wrote: > On Fri, Apr 20, 2012 at 09:27:57AM +1000, Dave Chinner wrote: > > So you're assuming that locating the inodes somewhere "hot" is going > > to improve performance. So say an application has a "hot" file (say > > an index file) but still has a lot of other files it creates and > > reads, and they are all in the same directory. > > > > If the index file is created "hot", then it is going to be placed a > > long way away from all the other files that applciation is using, > > and every time you access the hot file you now seek away to a > > different location on disk. The net result: the application goes > > slower because average seek times have increased. > > Well, let's assume the application is using all or most of the disk, > so the objects it is fetching from the 2T disk are randomly > distributed throughout the disk. Which is so far from most people's reality that it is not worth considering. > Short seeks are faster, yes, but the > seek time as a function of the seek distance is decidedly non-linear, > with a sharp "knee" in the curve at around 10-15% of a full-stroke > seek. (Ref: > http://static.usenix.org/event/fast05/tech/schlosser/schlosser.pdf) > > So most of the time, as you seek back and forth fetching data objects, > most of the time you will be incurring 75-85% of the cost of a > worst-case seek anyway. So seeking *is* going to be a fact of life > that we can't run away from that. > > Given that, the question then is whether we are better off (a) putting > the index files in the exact middle of the disk, trying to minimize > seeks, (b) scattering the index files all over the disk randomly, or > (c) concentrating the index files near the beginning of the disk? > Given the non-linear seek times, it seems to suggest that (c) would > probably be the best case for this use case. 
I disagree - based on that paper, you're better off putting all the related application data in the same place, and hoping it all fits in that 10-15% minimal seek time region.... Besides, you missed my point - that it is trivial to come up with examples of what application writers think are their hot/cold/normal data whose optimal layout bears no resemblance to your proposed hot/cold/normal inode layout. That's the fundamental problem here, there is no obvious definition of HOT/COLD, and that the best implementation depends on how the application uses those flags combined with the characteristics of the underlying storage. IOws, however you optimise it for a single spindle, a large percentage of the time it is going to be detrimental to performance, not improve it.... Cheers, Dave.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0e01e90..6539c9a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1553,6 +1553,12 @@ struct ext4_dir_entry_2 { #define EXT4_MAX_REC_LEN ((1<<16)-1) /* + * Flags for ext4_new_inode() + */ +#define EXT4_NEWI_HOT 0x0001 +#define EXT4_NEWI_COLD 0x0002 + +/* * If we ever get support for fs block sizes > page_size, we'll need * to remove the #if statements in the next two functions... */ @@ -1850,7 +1856,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct /* ialloc.c */ extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, - uid_t *owner); + uid_t *owner, int flags); extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 409c2ee..3dcc8c8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -363,7 +363,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t *group, umode_t mode, - const struct qstr *qstr) + const struct qstr *qstr, int flags) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -508,13 +508,20 @@ fallback_retry: } static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group, umode_t mode) + ext4_group_t *group, umode_t mode, int flags) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + if ((flags & EXT4_NEWI_HOT) && (ngroups > 3) && + (parent_group > ngroups / 3)) + parent_group = 0; + if ((flags & EXT4_NEWI_COLD) && (ngroups > 3) && + (parent_group < (2 * (ngroups / 3)))) + parent_group = 2 * (ngroups 
/ 3); + /* * Try to place the inode is the same flex group as its * parent. If we can't find space, use the Orlov algorithm to @@ -550,7 +557,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group + flex_size; if (*group > ngroups) *group = 0; - return find_group_orlov(sb, parent, group, mode, NULL); + return find_group_orlov(sb, parent, group, mode, NULL, flags); } /* @@ -614,7 +621,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * group to find a free inode. */ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, - const struct qstr *qstr, __u32 goal, uid_t *owner) + const struct qstr *qstr, __u32 goal, uid_t *owner, + int flags) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -643,6 +651,19 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, ei = EXT4_I(inode); sbi = EXT4_SB(sb); + if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) + flags &= ~(EXT4_NEWI_HOT | EXT4_NEWI_COLD); + + /* + * We will only allow the HOT flag if the user passes the + * reserved uid/gid check, or if she has CAP_SYS_RESOURCE + */ + if ((flags & EXT4_NEWI_HOT) && + !(sbi->s_resuid == current_fsuid() || + ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || + capable(CAP_SYS_RESOURCE))) + flags &= ~EXT4_NEWI_HOT; + if (!goal) goal = sbi->s_inode_goal; @@ -654,9 +675,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, } if (S_ISDIR(mode)) - ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + ret2 = find_group_orlov(sb, dir, &group, mode, qstr, flags); else - ret2 = find_group_other(sb, dir, &group, mode); + ret2 = find_group_other(sb, dir, &group, mode, flags); got_group: EXT4_I(dir)->i_last_alloc_group = group; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index f39f80f..2b3d65c 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -469,7 +469,7 @@ int ext4_ext_migrate(struct inode 
*inode) owner[0] = inode->i_uid; owner[1] = inode->i_gid; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, NULL, goal, owner); + S_IFREG, NULL, goal, owner, 0); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6f48ff8..222a419 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1742,6 +1742,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, handle_t *handle; struct inode *inode; int err, retries = 0; + int flags = 0; dquot_initialize(dir); @@ -1755,7 +1756,13 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); + if (op && op->open_flag & O_HOT) + flags |= EXT4_NEWI_HOT; + if (op && op->open_flag & O_COLD) + flags |= EXT4_NEWI_COLD; + + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, + NULL, flags); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; @@ -1791,7 +1798,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL, 0); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1831,7 +1838,7 @@ retry: ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFDIR | mode, - &dentry->d_name, 0, NULL); + &dentry->d_name, 0, NULL, 0); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2278,7 +2285,7 @@ retry: ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, - &dentry->d_name, 0, NULL); + &dentry->d_name, 0, NULL, 0); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop;
Wire up the use of the O_HOT and O_COLD open flags so that when an inode is being created, it can influence which part of the disk gets used on rotational storage devices. Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ext4.h | 8 +++++++- fs/ext4/ialloc.c | 33 +++++++++++++++++++++++++++------ fs/ext4/migrate.c | 2 +- fs/ext4/namei.c | 15 +++++++++++---- 4 files changed, 46 insertions(+), 12 deletions(-)