@@ -3114,7 +3114,8 @@ extern int ext4_generic_delete_entry(struct inode *dir,
struct buffer_head *bh,
void *entry_buf,
int buf_size,
- int csum_size);
+ int csum_size,
+ bool *empty);
extern bool ext4_empty_dir(struct inode *inode);
/* resize.c */
@@ -83,6 +83,13 @@
*/
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U
+/*
+ * Number of credits needed to perform directory shrinking. For each htree
+ * level, we need 6 blocks (block in question, its parent, last block, last
+ * block's parent, bitmap block, bg summary).
+ */
+#define EXT4_DIR_SHRINK_TRANS_BLOCKS 6U
+
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
@@ -1673,7 +1673,7 @@ int ext4_delete_inline_entry(handle_t *handle,
goto out;
err = ext4_generic_delete_entry(dir, de_del, bh,
- inline_start, inline_size, 0);
+ inline_start, inline_size, 0, NULL);
if (err)
goto out;
@@ -296,6 +296,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_dir_entry_2 **res_dir, ext4_lblk_t *lblk);
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
+static void dx_release(struct dx_frame *frames);
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames,
+ __u32 *start_hash);
/* checksumming functions */
void ext4_initialize_dirent_tail(struct buffer_head *bh,
@@ -768,15 +773,19 @@ static inline void htree_rep_invariant_check(struct dx_entry *at,
/*
* Probe for a directory leaf block to search.
*
- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+ * __dx_probe can return ERR_BAD_DX_DIR, which means there was a format
* error in the directory index, and the caller should fall back to
- * searching the directory normally. The callers of dx_probe **MUST**
+ * searching the directory normally. The callers of __dx_probe **MUST**
* check for this error code, and make sure it never gets reflected
- * back to userspace.
+ * back to userspace. If lblk_end is set and if __dx_probe hits lblk_end
+ * logical block, it reads it and stops. If strict is true, __dx_probe
+ * performs stronger checks for "count" during the lookup. See other
+ * variants below.
*/
static struct dx_frame *
-dx_probe(struct ext4_filename *fname, struct inode *dir,
- struct dx_hash_info *hinfo, struct dx_frame *frame_in)
+__dx_probe(struct ext4_filename *fname, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in,
+ ext4_lblk_t lblk_end, bool strict)
{
unsigned count, indirect, level, i;
struct dx_entry *at, *entries, *p, *q, *m;
@@ -867,7 +876,9 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
blocks[0] = 0;
while (1) {
count = dx_get_count(entries);
- if (!count || count > dx_get_limit(entries)) {
+ /* If we are called from the shrink path */
+ if (count > dx_get_limit(entries) ||
+ (strict && !count)) {
ext4_warning_inode(dir,
"dx entry: count %u beyond limit %u",
count, dx_get_limit(entries));
@@ -903,7 +914,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
goto fail;
}
}
- if (++level > indirect)
+ if (++level > indirect || dx_get_block(at) == lblk_end)
return frame;
blocks[level] = block;
frame++;
@@ -935,6 +946,139 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
return ret_err;
}
+static inline struct dx_frame *
+dx_probe(struct ext4_filename *fname, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in)
+{
+ return __dx_probe(fname, dir, hinfo, frame_in, 0, true);
+}
+
+/*
+ * dx_probe with relaxed checks. This function is used in the directory
+ * shrinking code since we can run into intermediate states where we have
+ * internal dx nodes with count = 0.
+ */
+static inline struct dx_frame *
+dx_probe_relaxed(struct ext4_filename *fname, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in)
+{
+ return __dx_probe(fname, dir, hinfo, frame_in, 0, false);
+}
+
+/* dx_probe() variant that allows us to lookup frames for a dirent block. */
+static struct dx_frame *dx_probe_dirent_blk(struct inode *dir,
+ struct dx_frame *frames,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct dx_hash_info hinfo;
+ struct dx_frame *frame_ptr;
+ struct ext4_dir_entry_2 *dead_de;
+
+ hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+
+ /* Let's assume that the lblk is a leaf dirent block */
+ dead_de = (struct ext4_dir_entry_2 *)bh->b_data;
+ ext4fs_dirhash(dir, dead_de->name, dead_de->name_len, &hinfo);
+
+ frame_ptr = dx_probe_relaxed(NULL, dir, &hinfo, frames);
+ if (!IS_ERR(frame_ptr)) {
+ /*
+ * See if we need to move our frame_ptr.
+ */
+ while (dx_get_block(frame_ptr->at) != lblk) {
+ if (!ext4_htree_next_block(dir, hinfo.hash, frame_ptr,
+ frames, NULL))
+ break;
+ }
+ if (dx_get_block(frame_ptr->at) == lblk)
+ return frame_ptr;
+ dx_release(frames);
+ frame_ptr = ERR_PTR(-EINVAL);
+ }
+ return frame_ptr;
+}
+
+/* dx_probe() variant that allows us to lookup frames for a dx node. */
+static struct dx_frame *dx_probe_dx_node(struct inode *dir,
+ struct dx_frame *frames,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct dx_hash_info hinfo;
+ struct dx_frame *frame_ptr;
+ struct dx_entry *entries;
+
+ hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ entries = ((struct dx_node *)(bh->b_data))->entries;
+
+ /* Use the first hash found in the dx node */
+ hinfo.hash = le32_to_cpu(entries[1].hash);
+ frame_ptr = __dx_probe(NULL, dir, &hinfo, frames, lblk, false);
+ if (IS_ERR_OR_NULL(frame_ptr))
+ return frame_ptr;
+ if (dx_get_block(frame_ptr->at) != lblk) {
+ dx_release(frames);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return frame_ptr;
+}
+
+/*
+ * This function tries to remove the entry of a dirent block (which was just
+ * emptied by the caller) from the dx frame. It does so by reducing the count by
+ * 1 and left shifting all the entries after the deleted entry.
+ * ext4_remove_dx_entry() does NOT mark dx_frame dirty, that's left to the
+ * caller. The reason for doing this is that this function removes entry from
+ * a dx_node and can potentially leave dx_node with no entries. Since older
+ * kernels don't allow dx_parent nodes to have 0 count, the caller should
+ * only mark buffer dirty when we are sure that we won't leave dx_node
+ * with 0 count.
+ */
+static int
+ext4_remove_dx_entry(handle_t *handle, struct inode *dir,
+ struct dx_frame *dx_frame)
+{
+ struct dx_entry *entries = dx_frame->entries;
+ int err, i = 0;
+ unsigned int count = dx_get_count(entries);
+
+ for (i = 0; i < count; i++)
+ if (entries[i].block == dx_frame->at->block)
+ break;
+ if (i >= count)
+ return -EINVAL;
+
+ err = ext4_journal_get_write_access(handle, dir->i_sb, dx_frame->bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+
+ if (i == 0) {
+ entries[0].block = entries[1].block;
+ i++;
+ }
+ if (i < count - 1)
+ memmove(&entries[i], &entries[i + 1], (count - i - 1) *
+ sizeof(struct dx_entry));
+ dx_set_count(entries, count - 1);
+
+ return 0;
+out_err:
+ ext4_std_error(dir->i_sb, err);
+ return -EINVAL;
+
+}
+
static void dx_release(struct dx_frame *frames)
{
struct dx_root_info *info;
@@ -2649,6 +2793,24 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
return err;
}
+/*
+ * Checks if dirent block is empty. If de is passed, we
+ * skip directory entries before de in the block.
+ */
+static inline bool is_empty_dirent_block(struct inode *dir,
+ struct buffer_head *bh,
+ struct ext4_dir_entry_2 *de)
+{
+ if (!de)
+ de = (struct ext4_dir_entry_2 *)bh->b_data;
+ while ((u8 *)de < (u8 *)bh->b_data + dir->i_sb->s_blocksize) {
+ if (de->inode)
+ return false;
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize);
+ }
+ return true;
+}
+
/*
* ext4_generic_delete_entry deletes a directory entry by merging it
* with the previous entry
@@ -2658,7 +2820,8 @@ int ext4_generic_delete_entry(struct inode *dir,
struct buffer_head *bh,
void *entry_buf,
int buf_size,
- int csum_size)
+ int csum_size,
+ bool *empty)
{
struct ext4_dir_entry_2 *de, *pde;
unsigned int blocksize = dir->i_sb->s_blocksize;
@@ -2667,6 +2830,8 @@ int ext4_generic_delete_entry(struct inode *dir,
i = 0;
pde = NULL;
de = entry_buf;
+ if (empty)
+ *empty = false;
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
entry_buf, buf_size, i))
@@ -2692,7 +2857,9 @@ int ext4_generic_delete_entry(struct inode *dir,
offsetof(struct ext4_dir_entry_2,
name_len));
}
-
+ if (empty)
+ *empty = is_empty_dirent_block(dir, bh,
+ pde ? pde : de);
inode_inc_iversion(dir);
return 0;
}
@@ -2703,12 +2870,176 @@ int ext4_generic_delete_entry(struct inode *dir,
return -ENOENT;
}
+static void make_unindexed(handle_t *handle, struct inode *dir,
+ struct buffer_head *bh)
+{
+ int parent_ino = le32_to_cpu(
+ ((struct dx_root *)bh->b_data)->dotdot.inode);
+
+ memset(bh->b_data, 0, dir->i_sb->s_blocksize);
+ ext4_init_dirblock(handle, dir, bh, parent_ino, NULL, 0);
+ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
+ ext4_mark_inode_dirty(handle, dir);
+}
+
+/*
+ * Copy contents from lblk_src to lblk_dst and remap internal dx nodes
+ * such that parent of lblk_src points to lblk_dst.
+ */
+static int ext4_dx_remap_block(handle_t *handle, struct inode *dir,
+ struct buffer_head *bh_dst,
+ ext4_lblk_t lblk_dst, ext4_lblk_t lblk_src)
+{
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame_ptr;
+ struct buffer_head *bh_src;
+ int ret;
+
+ bh_src = ext4_bread(handle, dir, lblk_src, 0);
+ if (IS_ERR_OR_NULL(bh_src))
+ return -EIO;
+
+ memset(frames, 0, sizeof(frames));
+
+ /*
+ * blk_src can either be a dirent block or dx node. Try to search
+ * using both and use whichever one that returns success.
+ */
+ frame_ptr = dx_probe_dirent_blk(dir, frames, bh_src, lblk_src);
+ if (IS_ERR_OR_NULL(frame_ptr)) {
+ frame_ptr = dx_probe_dx_node(dir, frames, bh_src, lblk_src);
+ if (IS_ERR_OR_NULL(frame_ptr)) {
+ brelse(bh_src);
+ return PTR_ERR(frame_ptr);
+ }
+ }
+ WARN_ON(dx_get_block(frame_ptr->at) != lblk_src);
+ memcpy(bh_dst->b_data, bh_src->b_data, bh_dst->b_size);
+ dx_set_block(frame_ptr->at, lblk_dst);
+
+ /* Set pointer in bh_last_parent to bh */
+ ret = ext4_journal_get_write_access(handle, dir->i_sb, bh_src,
+ EXT4_JTR_NONE);
+ if (ret)
+ goto out;
+
+ ret = ext4_journal_get_write_access(handle, dir->i_sb, frame_ptr->bh,
+ EXT4_JTR_NONE);
+ if (ret)
+ goto out;
+
+ ret = ext4_handle_dirty_metadata(handle, dir, bh_src);
+ if (ret) {
+ ext4_std_error(dir->i_sb, ret);
+ goto out;
+ }
+
+ ret = ext4_handle_dirty_dx_node(handle, dir, frame_ptr->bh);
+ if (ret) {
+ ext4_std_error(dir->i_sb, ret);
+ goto out;
+ }
+out:
+ brelse(bh_src);
+ dx_release(frames);
+ return ret;
+}
+
+/*
+ * Try to shrink directory as much as possible. This function
+ * truncates directory to the new size.
+ *
+ * The high level algorithm is as follows:
+ *
+ * - If after dentry removal the dirent block (let's call it B) becomes
+ * empty, then remove its references in its dx parent.
+ * - Swap its contents with that of the last block (L) in directory.
+ * - Update L's parents to point to B instead.
+ * - Remove L
+ * - Repeat this for all the ancestors of B.
+ */
+static int ext4_try_dir_shrink(handle_t *handle, struct inode *dir,
+ ext4_lblk_t lblk, struct buffer_head *bh)
+{
+ struct buffer_head *bh_last = NULL, *parent_bh = NULL;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame_ptr;
+ ext4_lblk_t shrink = 0, last_lblk, parent_lblk;
+ int ret = 0;
+ bool check_empty = true;
+
+ memset(frames, 0, sizeof(frames));
+ frame_ptr = dx_probe_dirent_blk(dir, frames, bh, lblk);
+ if (IS_ERR(frame_ptr))
+ return -EINVAL;
+
+ last_lblk = (dir->i_size >> dir->i_sb->s_blocksize_bits) - 1;
+
+ while (check_empty && frame_ptr >= frames) {
+ parent_lblk = frame_ptr > frames ?
+ dx_get_block((frame_ptr - 1)->at) : 0;
+ lblk = dx_get_block(frame_ptr->at);
+ if (ext4_journal_extend(
+ handle, EXT4_DIR_SHRINK_TRANS_BLOCKS, 0) < 0)
+ break;
+ if (ext4_remove_dx_entry(handle, dir, frame_ptr) < 0) {
+ dxtrace(pr_warn("remove dx entry failed\n"));
+ break;
+ }
+ check_empty = dx_get_count(frame_ptr->entries) == 0;
+ if (lblk != last_lblk) {
+ ret = ext4_dx_remap_block(handle, dir, bh, lblk,
+ last_lblk);
+ if (ret) {
+ dxtrace(pr_warn("remap failed\n"));
+ break;
+ }
+ }
+ ret = ext4_handle_dirty_dx_node(handle, dir, frame_ptr->bh);
+ if (ret)
+ break;
+ if (parent_lblk == last_lblk)
+ parent_lblk = lblk;
+ last_lblk--;
+ shrink++;
+
+ if (parent_lblk == 0)
+ break;
+ if (!IS_ERR_OR_NULL(parent_bh))
+ brelse(parent_bh);
+ parent_bh = ext4_bread(handle, dir, parent_lblk, 0);
+ if (IS_ERR_OR_NULL(parent_bh))
+ break;
+ dx_release(frames);
+ frame_ptr = dx_probe_dx_node(dir, frames, parent_bh,
+ parent_lblk);
+ if (IS_ERR(frame_ptr))
+ break;
+ bh = parent_bh;
+ }
+
+ /* Fallback to linear directories if root node becomes empty */
+ if (dx_get_count(frames[0].entries) == 0)
+ make_unindexed(handle, dir, frames[0].bh);
+ if (!IS_ERR(frame_ptr))
+ dx_release(frames);
+ if (!IS_ERR_OR_NULL(bh_last))
+ brelse(bh_last);
+ if (!IS_ERR_OR_NULL(parent_bh))
+ brelse(parent_bh);
+
+ if (ret || !shrink)
+ return ret;
+
+ dir->i_size -= shrink * dir->i_sb->s_blocksize;
+ return ext4_truncate(dir);
+}
+
static int ext4_delete_entry(handle_t *handle,
struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh, ext4_lblk_t lblk)
{
int err, csum_size = 0;
+ bool block_empty = false;
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
@@ -2728,7 +3059,8 @@ static int ext4_delete_entry(handle_t *handle,
goto out;
err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
- dir->i_sb->s_blocksize, csum_size);
+ dir->i_sb->s_blocksize, csum_size,
+ &block_empty);
if (err)
goto out;
@@ -2737,6 +3069,9 @@ static int ext4_delete_entry(handle_t *handle,
if (unlikely(err))
goto out;
+ if (block_empty && is_dx(dir))
+ ext4_try_dir_shrink(handle, dir, lblk, bh);
+
return 0;
out:
if (err != -ENOENT)