@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o
+ ext4_jbd2.o migrate.o mballoc.o defrag.o
ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
new file mode 100644
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2008, NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@yk.jp.nec.com>
+ * Akira Fujita <a-fujita@rs.jp.nec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* Online defragmentation for EXT4 */
+
+#include <linux/quotaops.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "group.h"
+
+/**
+ * ext4_defrag_next_extent - Search for the next extent and set it to "extent"
+ *
+ * @inode: inode which is searched
+ * @path: this will obtain data for the next extent
+ * @extent: pointer to the next extent we have just gotten
+ *
+ * This function returns 0 or 1(last entry) if succeed, otherwise
+ * returns -EIO.
+ */
+static int
+ext4_defrag_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent)
+{
+ int ppos, leaf_ppos = path->p_depth;
+
+ ppos = leaf_ppos;
+ if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
+ /* leaf block */
+ *extent = ++path[ppos].p_ext;
+ return 0;
+ }
+
+ while (--ppos >= 0) {
+ if (EXT_LAST_INDEX(path[ppos].p_hdr) >
+ path[ppos].p_idx) {
+ int cur_ppos = ppos;
+
+ /* index block */
+ path[ppos].p_idx++;
+ path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+ if (path[ppos+1].p_bh)
+ brelse(path[ppos+1].p_bh);
+ path[ppos+1].p_bh =
+ sb_bread(inode->i_sb, path[ppos].p_block);
+ if (!path[ppos+1].p_bh)
+ goto err;
+ path[ppos+1].p_hdr =
+ ext_block_hdr(path[ppos+1].p_bh);
+
+ /* Halfway index block */
+ while (++cur_ppos < leaf_ppos) {
+ path[cur_ppos].p_idx =
+ EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
+ path[cur_ppos].p_block =
+ idx_pblock(path[cur_ppos].p_idx);
+ if (path[cur_ppos+1].p_bh)
+ brelse(path[cur_ppos+1].p_bh);
+ path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
+ path[cur_ppos].p_block);
+ if (!path[cur_ppos+1].p_bh)
+ goto err;
+ path[cur_ppos+1].p_hdr =
+ ext_block_hdr(path[cur_ppos+1].p_bh);
+ }
+
+ /* leaf block */
+ path[leaf_ppos].p_ext = *extent =
+ EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+ return 0;
+ }
+ }
+ /* We found the last extent */
+ return 1;
+err:
+ if (path)
+ ext4_ext_drop_refs(path);
+ return -EIO;
+}
+
+/**
+ * ext4_defrag_partial - Defrag a file per page
+ *
+ * @tmp_inode: temporary inode
+ * @filp: pointer to file
+ * @org_page_offset: page index on original file
+ * @dest_blk_offset: block index on temporary file
+ * @data_offset_in_page: block index where data swapping starts
+ * @block_len_in_page: the number of blocks to be swapped
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
+ pgoff_t org_page_offset, ext4_lblk_t dest_blk_offset,
+ int data_offset_in_page, int block_len_in_page)
+{
+ return 0;
+}
+
+/**
+ * ext4_defrag_new_extent_tree - Get contiguous blocks and build an extent tree
+ *
+ * @org_inode: original inode
+ * @tmp_inode: temporary inode
+ * @org_path: indicating the original inode's extent
+ * @req_start: starting offset to allocate in blocks
+ * @req_blocks: the number of blocks to allocate
+ * @iblock: file related offset
+ *
+ * This function returns the value as below:
+ * 0 (succeed)
+ * 1 (not improved)
+ * negative value (error case)
+ */
+static int
+ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
+ struct ext4_ext_path *org_path, ext4_lblk_t req_start,
+ ext4_lblk_t req_blocks, ext4_lblk_t iblock)
+{
+ return 0;
+}
+
+/**
+ * ext4_defrag_check - Check the environment whether a defrag can be done
+ *
+ * @org_inode: original inode
+ * @defrag_size: size of defrag in blocks
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_defrag_check(struct inode *org_inode, ext4_lblk_t defrag_size)
+{
+
+ /* Ext4 online defrag supports only extent based file */
+ if (!(EXT4_I(org_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ printk(KERN_ERR "ext4 defrag: ino[%lu] is not extents "
+ "based file\n", org_inode->i_ino);
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Will go away.
+ * Now ext4 online defrag does not support flex_bg feature.
+ */
+ if (EXT4_HAS_INCOMPAT_FEATURE(org_inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ printk(KERN_ERR "ext4 defrag: online defrag does not support "
+ "flex_bg feature.\n");
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+/**
+ * ext4_defrag_init_tmp_inode - Create a temporary inode
+ *
+ * @org_inode: original inode
+ *
+ * This function returns pointer to the struct inode if succeed,
+ * otherwise returns error value.
+ */
+static struct inode *
+ext4_defrag_init_tmp_inode(struct inode *org_inode)
+{
+ handle_t *handle;
+ struct inode *tmp_inode;
+
+ handle = ext4_journal_start(org_inode,
+ EXT4_DATA_TRANS_BLOCKS(org_inode->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 4 +
+ 2 * EXT4_QUOTA_INIT_BLOCKS(org_inode->i_sb));
+ if (IS_ERR(handle))
+ /* Return error code */
+ return (struct inode *)handle;
+
+ tmp_inode = ext4_new_inode(handle,
+ org_inode->i_sb->s_root->d_inode, S_IFREG);
+ if (IS_ERR(tmp_inode))
+ goto out;
+
+ i_size_write(tmp_inode, i_size_read(org_inode));
+ tmp_inode->i_nlink = 0;
+ ext4_ext_tree_init(handle, tmp_inode);
+ ext4_orphan_add(handle, tmp_inode);
+
+out:
+ ext4_journal_stop(handle);
+
+ return tmp_inode;
+}
+
+/**
+ * ext4_defrag - Defrag the specified range of a file
+ *
+ * If no-option is specified, ext4_defrag() proceeds the following order.
+ * 1.ext4_defrag() calculates the block number where defrag terminates
+ * by the start block number(defrag_start) and the size of defraged data
+ * (defrag_size) specified as arguments.
+ * If the defrag_start points a hole, the extent's start offset pointed by
+ * ext_cur(current extent), holecheck_path, org_path are set after
+ * hole behind.
+ * 2.Continue step 3 to step 5, until the holecheck_path points to last_extent
+ * or the ext_cur exceeds the block_end which is last logical block number.
+ * 3.To get a length of continues area, call ext4_defrag_next_extent()
+ * specified with the ext_cur(initial value is holecheck_path) re-cursive,
+ * until find un-continuous extent, the start logical block number exceeds
+ * the block_end or the extent points to the last extent.
+ * 4.After determining the length of continuous block,
+ * allocates continuous blocks to a temporary inode
+ * by ext4_defrag_new_extent_tree().
+ * 5.Exchange the original inode data with temporary inode data
+ * from org_page_offset to seq_end_page.
+ * The start indexes of data are specified as arguments.
+ * That of the original inode is org_page_offset,
+ * and that of the temporary inode is dest_block_offset
+ * (To easily handle blocksize != pagesize case, the offset for the
+ * temporary inode is block unit).
+ * 6.Update holecheck_path and org_path to points a next proceeding extent,
+ * and release the temporary inode holding the original fragmented data.
+ * Then, returns to step 2.
+ * 7.Release holecheck_path, org_path and temporary inode,
+ * and returns the defrag_size which is the size of defraged data.
+ * The defrag_size is used for the command to calculate the file offset
+ * where a next defrag processing start.
+ * (Since the defrag command calls defrag_ioctl() by 64MB unit,
+ * a file bigger than 64MB calls defrag_ioctl many times.)
+ *
+ * @filp: pointer to file
+ * @block_start: starting offset to defrag in blocks
+ * @defrag_size: size of defrag in blocks
+ *
+ * This function returns the number of blocks if succeed, otherwise
+ * returns error value.
+ */
+int
+ext4_defrag(struct file *filp, ext4_lblk_t block_start,
+ ext4_lblk_t defrag_size)
+{
+ struct inode *org_inode = filp->f_dentry->d_inode, *tmp_inode = NULL;
+ struct ext4_ext_path *org_path = NULL, *holecheck_path = NULL;
+ struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+ ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+ ext4_lblk_t dest_block_offset;
+ ext4_lblk_t rest_blocks;
+ pgoff_t org_page_offset, seq_end_page;
+ int ret, depth, seq_extents, last_extent = 0;
+ int blocks_per_page;
+ int data_offset_in_page;
+ int block_len_in_page;
+
+ /* Check the filesystem environment whether defrag can be done */
+ ret = ext4_defrag_check(org_inode, defrag_size);
+ if (ret < 0)
+ return ret;
+
+ blocks_per_page = PAGE_CACHE_SIZE >> org_inode->i_blkbits;
+ file_end = (org_inode->i_size - 1) >> org_inode->i_blkbits;
+ block_end = block_start + defrag_size - 1;
+ if (file_end < block_end)
+ defrag_size -= block_end - file_end;
+
+ mutex_lock(&org_inode->i_mutex);
+ down_write(&EXT4_I(org_inode)->i_data_sem);
+
+ org_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+ if (IS_ERR(org_path)) {
+ ret = PTR_ERR(org_path);
+ org_path = NULL;
+ goto out;
+ }
+
+ /* Get path structure to check the hole */
+ holecheck_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+ if (IS_ERR(holecheck_path)) {
+ ret = PTR_ERR(holecheck_path);
+ holecheck_path = NULL;
+ goto out;
+ }
+
+ depth = ext_depth(org_inode);
+ ext_cur = holecheck_path[depth].p_ext;
+ if (ext_cur == NULL)
+ goto out;
+
+ /*
+ * Get proper extent whose ee_block is beyond block_start
+ * if block_start was within the hole.
+ */
+ if (le32_to_cpu(ext_cur->ee_block) +
+ le16_to_cpu(ext_cur->ee_len) - 1 < block_start) {
+ last_extent = ext4_defrag_next_extent(org_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+ ret = last_extent;
+ goto out;
+ }
+ last_extent = ext4_defrag_next_extent(org_inode, org_path,
+ &ext_dummy);
+ if (last_extent < 0) {
+ ret = last_extent;
+ goto out;
+ }
+ }
+ seq_extents = 1;
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+
+ /* No blocks within the specified range. */
+ if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+ printk(KERN_INFO "ext4 defrag: The specified range of file"
+ " may be the hole\n");
+ goto out;
+ }
+
+ /* Adjust start blocks */
+ add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+ le16_to_cpu(ext_cur->ee_len), block_end + 1) -
+ max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+ while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+ seq_blocks += add_blocks;
+
+ /* Create a temporary inode to be exchanged data block */
+ tmp_inode = ext4_defrag_init_tmp_inode(org_inode);
+ if (IS_ERR(tmp_inode)) {
+ ret = PTR_ERR(tmp_inode);
+ tmp_inode = NULL;
+ goto out;
+ }
+
+ /* Adjust tail blocks */
+ if (seq_start + seq_blocks - 1 > block_end)
+ seq_blocks = block_end - seq_start + 1;
+
+ ext_prev = ext_cur;
+ last_extent = ext4_defrag_next_extent(org_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+ ret = last_extent;
+ break;
+ }
+ if (!last_extent)
+ seq_extents++;
+ add_blocks = le16_to_cpu(ext_cur->ee_len);
+
+ /*
+ * Extend the length of contiguous block (seq_blocks)
+ * if extents are contiguous.
+ */
+ if (le32_to_cpu(ext_prev->ee_block) +
+ le16_to_cpu(ext_prev->ee_len) ==
+ le32_to_cpu(ext_cur->ee_block) &&
+ block_end >= le32_to_cpu(ext_cur->ee_block) &&
+ !last_extent) {
+ if (tmp_inode) {
+ iput(tmp_inode);
+ tmp_inode = NULL;
+ }
+ continue;
+ }
+
+ /* Found an isolated block */
+ if (seq_extents == 1) {
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ goto CLEANUP;
+ }
+
+ ret = ext4_defrag_new_extent_tree(org_inode, tmp_inode,
+ org_path, seq_start, seq_blocks,
+ block_start);
+
+ if (ret < 0) {
+ break;
+ } else if (ret == 1) {
+ ret = 0;
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ goto CLEANUP;
+ }
+
+ data_offset_in_page = seq_start % blocks_per_page;
+
+ /*
+ * Calculate data blocks count that should be swapped
+ * at the first page.
+ */
+ if (data_offset_in_page + seq_blocks > blocks_per_page) {
+ /* Swapped blocks are across pages */
+ block_len_in_page =
+ blocks_per_page - data_offset_in_page;
+ } else {
+ /* Swapped blocks are in a page */
+ block_len_in_page = seq_blocks;
+ }
+
+ org_page_offset = seq_start >>
+ (PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+ seq_end_page = (seq_start + seq_blocks - 1) >>
+ (PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ rest_blocks = seq_blocks;
+ /* Offset for the tmp_inode */
+ dest_block_offset = 0;
+
+ /*
+ * Discard all preallocations.
+ * This is provisional solution.
+ * When true ext4_mb_return_to_preallocation() is
+ * implemented, this will be removed.
+ */
+ ext4_discard_preallocations(org_inode);
+
+ while (org_page_offset <= seq_end_page) {
+
+ /* Swap original branches with new branches */
+ ret = ext4_defrag_partial(tmp_inode, filp,
+ org_page_offset,
+ dest_block_offset,
+ data_offset_in_page,
+ block_len_in_page);
+ if (ret < 0)
+ goto out;
+
+ org_page_offset++;
+ dest_block_offset += block_len_in_page;
+
+ data_offset_in_page = 0;
+ rest_blocks -= block_len_in_page;
+ if (rest_blocks > blocks_per_page)
+ block_len_in_page = blocks_per_page;
+ else
+ block_len_in_page = rest_blocks;
+ }
+
+ /* Decrease buffer counter */
+ if (holecheck_path)
+ ext4_ext_drop_refs(holecheck_path);
+ holecheck_path = ext4_ext_find_extent(org_inode,
+ seq_start, holecheck_path);
+ if (IS_ERR(holecheck_path)) {
+ ret = PTR_ERR(holecheck_path);
+ holecheck_path = NULL;
+ break;
+ }
+ depth = holecheck_path->p_depth;
+
+CLEANUP:
+ /* Decrease buffer counter */
+ if (org_path)
+ ext4_ext_drop_refs(org_path);
+ org_path = ext4_ext_find_extent(org_inode, seq_start, org_path);
+ if (IS_ERR(org_path)) {
+ ret = PTR_ERR(org_path);
+ org_path = NULL;
+ break;
+ }
+
+ ext_cur = holecheck_path[depth].p_ext;
+ add_blocks = le16_to_cpu(ext_cur->ee_len);
+ seq_blocks = 0;
+ seq_extents = 1;
+
+ if (tmp_inode) {
+ iput(tmp_inode);
+ tmp_inode = NULL;
+ }
+ }
+
+out:
+ if (org_path) {
+ ext4_ext_drop_refs(org_path);
+ kfree(org_path);
+ }
+ if (holecheck_path) {
+ ext4_ext_drop_refs(holecheck_path);
+ kfree(holecheck_path);
+ }
+
+ up_write(&EXT4_I(org_inode)->i_data_sem);
+ mutex_unlock(&org_inode->i_mutex);
+
+ if (tmp_inode)
+ iput(tmp_inode);
+
+ return ret ? ret : defrag_size;
+}
@@ -302,6 +302,7 @@ struct ext4_new_group_data {
#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
#define EXT4_IOC_MIGRATE _IO('f', 9)
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_DEFRAG _IOW('f', 15, struct ext4_ext_defrag_data)
/*
* ioctl commands in 32 bit emulation
@@ -319,6 +320,12 @@ struct ext4_new_group_data {
#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+struct ext4_ext_defrag_data {
+ ext4_lblk_t start_offset; /* start offset to defrag in blocks */
+ ext4_lblk_t defrag_size; /* size of defrag in blocks */
+ ext4_fsblk_t goal; /* block offset for allocation */
+};
+
/*
* Mount options
@@ -1144,6 +1151,9 @@ extern void ext4_inode_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
+/* defrag.c */
+extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start,
+ ext4_lblk_t defrag_size);
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
@@ -246,5 +246,7 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
ext4_lblk_t *, ext4_fsblk_t *);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
+
#endif /* _EXT4_EXTENTS */
@@ -49,7 +49,7 @@
* ext_pblock:
* combine low and high parts of physical block number into ext4_fsblk_t
*/
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
{
ext4_fsblk_t block;
@@ -214,6 +214,28 @@ setversion_out:
return err;
}
+
+ case EXT4_IOC_DEFRAG: {
+ struct ext4_ext_defrag_data defrag;
+ int err;
+
+ if (!capable(CAP_DAC_OVERRIDE)) {
+ if ((inode->i_mode & S_IRUSR) != S_IRUSR)
+ return -EACCES;
+ if (current->fsuid != inode->i_uid)
+ return -EACCES;
+ }
+
+ if (copy_from_user(&defrag,
+ (struct ext4_ext_defrag_data __user *)arg,
+ sizeof(defrag)))
+ return -EFAULT;
+
+ err = ext4_defrag(filp, defrag.start_offset,
+ defrag.defrag_size);
+ return err;
+ }
+
case EXT4_IOC_GROUP_ADD: {
struct ext4_new_group_data input;
struct super_block *sb = inode->i_sb;