===================================================================
@@ -400,6 +400,127 @@
}
}
+#ifdef EXT3_DELETE_THREAD
+/*
+ * Delete inodes in a loop until there are no more to be deleted.
+ * Normally, we run in the background doing the deletes and sleeping again,
+ * and clients just add new inodes to be deleted onto the end of the list.
+ * If someone is concerned about free space (e.g. block allocation or similar)
+ * then they can sleep on s_delete_waiter_queue and be woken up when space
+ * has been freed.
+ *
+ * data is the struct super_block of the filesystem being served.
+ *
+ * The thread announces that it is running by initializing s_delete_list and
+ * waking s_delete_waiter_queue, and announces that it is gone by zeroing
+ * s_delete_list and waking s_delete_waiter_queue again.  It exits the first
+ * time it wakes up and finds s_delete_list empty.  Always returns 0.
+ */
+int ext3_delete_thread(void *data)
+{
+ struct super_block *sb = data;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ struct task_struct *tsk = current;
+
+ /* Almost like daemonize, but not quite */
+ exit_mm(current);
+ tsk->session = 1;
+ tsk->pgrp = 1;
+ tsk->tty = NULL;
+ exit_files(current);
+ reparent_to_init();
+
+ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
+ sigfillset(&tsk->blocked);
+
+ /*tsk->flags |= PF_KERNTHREAD;*/
+
+ /* A non-NULL s_delete_list.next is the "thread is running" signal that
+ * ext3_start_delete_thread() sleeps on s_delete_waiter_queue for. */
+ INIT_LIST_HEAD(&sbi->s_delete_list);
+ wake_up(&sbi->s_delete_waiter_queue);
+ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
+
+ /* main loop */
+ for (;;) {
+ /* Sleep until an inode is queued or ASYNCDEL has been cleared. */
+ wait_event_interruptible(sbi->s_delete_thread_queue,
+ !list_empty(&sbi->s_delete_list) ||
+ !test_opt(sb, ASYNCDEL));
+ ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
+ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
+
+ spin_lock(&sbi->s_delete_lock);
+ /* Woken with nothing queued: shut down.  Zeroing the list head
+ * is the "thread is gone" signal that ext3_stop_delete_thread()
+ * waits for. */
+ if (list_empty(&sbi->s_delete_list)) {
+ clear_opt(sbi->s_mount_opt, ASYNCDEL);
+ memset(&sbi->s_delete_list, 0,
+ sizeof(sbi->s_delete_list));
+ spin_unlock(&sbi->s_delete_lock);
+ ext3_debug("delete thread on %s exiting\n",
+ kdevname(sb->s_dev));
+ wake_up(&sbi->s_delete_waiter_queue);
+ break;
+ }
+
+ /* Drain the list.  s_delete_lock is held at the top of every
+ * iteration and is dropped only around the release of the inode. */
+ while (!list_empty(&sbi->s_delete_list)) {
+ struct inode *inode=list_entry(sbi->s_delete_list.next,
+ struct inode, i_devices);
+ /* Sampled before the inode is released below; subtracted
+ * from s_delete_blocks once the lock is retaken. */
+ unsigned long blocks = inode->i_blocks >>
+ (inode->i_blkbits - 9);
+
+ list_del_init(&inode->i_devices);
+ spin_unlock(&sbi->s_delete_lock);
+ ext3_debug("%s delete ino %lu blk %lu\n",
+ tsk->comm, inode->i_ino, blocks);
+
+ /* Whoever queued the inode marked it EXT3_STATE_DELETE and
+ * left i_nlink at 1.  Drop the link count to zero before
+ * releasing the queue's reference -- presumably so that this
+ * iput() performs the real deletion (NOTE(review): confirm
+ * against the VFS iput() path). */
+ J_ASSERT(EXT3_I(inode)->i_state & EXT3_STATE_DELETE);
+ J_ASSERT(inode->i_nlink == 1);
+ inode->i_nlink = 0;
+ iput(inode);
+
+ spin_lock(&sbi->s_delete_lock);
+ sbi->s_delete_blocks -= blocks;
+ sbi->s_delete_inodes--;
+ }
+ /* The list is empty, so both counters should have drained to
+ * zero with it; warn and resynchronize them if they did not. */
+ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
+ ext3_warning(sb, __FUNCTION__,
+ "%lu blocks, %lu inodes on list?\n",
+ sbi->s_delete_blocks,sbi->s_delete_inodes);
+ sbi->s_delete_blocks = 0;
+ sbi->s_delete_inodes = 0;
+ }
+ spin_unlock(&sbi->s_delete_lock);
+ /* Let anyone waiting for space (or for the list to drain) recheck. */
+ wake_up(&sbi->s_delete_waiter_queue);
+ }
+
+ return 0;
+}
+
+/*
+ * Set up the deferred-delete state of a filesystem and, if the ASYNCDEL
+ * mount option is set, start its delete thread.
+ *
+ * The lock and both wait queues are initialized whether or not the thread
+ * is started.  When the thread is started, this does not return until the
+ * thread has initialized s_delete_list (s_delete_list.next non-NULL).
+ *
+ * If the thread cannot be created, the error is logged and ASYNCDEL is
+ * cleared, so that code which keys off the mount option falls back to
+ * synchronous deletion instead of queueing inodes onto a list head that
+ * was never initialized.
+ */
+static void ext3_start_delete_thread(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int rc;
+
+	spin_lock_init(&sbi->s_delete_lock);
+	init_waitqueue_head(&sbi->s_delete_thread_queue);
+	init_waitqueue_head(&sbi->s_delete_waiter_queue);
+
+	if (!test_opt(sb, ASYNCDEL))
+		return;
+
+	rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
+	if (rc < 0) {
+		printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
+		       rc);
+		/* No thread will ever service s_delete_list: disable the
+		 * feature rather than leave the option advertising it. */
+		clear_opt(sbi->s_mount_opt, ASYNCDEL);
+		return;
+	}
+
+	/* The thread wakes us once INIT_LIST_HEAD(&s_delete_list) is done. */
+	wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
+}
+
+/*
+ * Ask the delete thread to exit and wait until it has done so.
+ *
+ * A zeroed s_delete_list head means there is no thread to stop, in which
+ * case nothing is done.  Otherwise ASYNCDEL is cleared, the thread is
+ * woken, and we sleep on s_delete_waiter_queue until the thread has zeroed
+ * the list head again and no deferred inodes remain accounted.
+ */
+static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
+{
+	if (sbi->s_delete_list.next != 0) {	/* thread is running */
+		clear_opt(sbi->s_mount_opt, ASYNCDEL);
+		wake_up(&sbi->s_delete_thread_queue);
+		wait_event(sbi->s_delete_waiter_queue,
+			   !sbi->s_delete_list.next && !sbi->s_delete_inodes);
+	}
+}
+#else
+/* Delete thread compiled out: starting and stopping it expand to nothing. */
+#define ext3_start_delete_thread(sbi) do {} while(0)
+#define ext3_stop_delete_thread(sbi) do {} while(0)
+#endif /* EXT3_DELETE_THREAD */
+
void ext3_put_super (struct super_block * sb)
{
struct ext3_sb_info *sbi = EXT3_SB(sb);
@@ -407,6 +528,9 @@
kdev_t j_dev = sbi->s_journal->j_dev;
int i;
+#ifdef EXT3_DELETE_THREAD
+ J_ASSERT(sbi->s_delete_inodes == 0);
+#endif
ext3_xattr_put_super(sb);
journal_destroy(sbi->s_journal);
if (!(sb->s_flags & MS_RDONLY)) {
@@ -526,6 +650,13 @@
clear_opt (*mount_options, XATTR_USER);
else
#endif
+#ifdef EXT3_DELETE_THREAD
+ if (!strcmp(this_char, "asyncdel"))
+ set_opt(*mount_options, ASYNCDEL);
+ else if (!strcmp(this_char, "noasyncdel"))
+ clear_opt(*mount_options, ASYNCDEL);
+ else
+#endif
if (!strcmp (this_char, "bsddf"))
clear_opt (*mount_options, MINIX_DF);
else if (!strcmp (this_char, "nouid32")) {
@@ -1244,6 +1375,7 @@
}
ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+ ext3_start_delete_thread(sb);
EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
ext3_orphan_cleanup(sb, es);
EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
@@ -1626,7 +1758,12 @@
static int ext3_sync_fs(struct super_block *sb)
{
tid_t target;
-
+
+ if (atomic_read(&sb->s_active) == 0) {
+ /* fs is being umounted: time to stop delete thread */
+ ext3_stop_delete_thread(EXT3_SB(sb));
+ }
+
sb->s_dirt = 0;
target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
log_wait_commit(EXT3_SB(sb)->s_journal, target);
@@ -1690,6 +1827,9 @@
if (!parse_options(data, &tmp, sbi, &tmp, 1))
return -EINVAL;
+ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
+ ext3_stop_delete_thread(sbi);
+
if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
ext3_abort(sb, __FUNCTION__, "Abort forced by user");
===================================================================
@@ -2562,6 +2562,118 @@
return err;
}
+#ifdef EXT3_DELETE_THREAD
+/* Move blocks from to-be-truncated inode over to a new inode, and delete
+ * that one from the delete thread instead.  This avoids a lot of latency
+ * when truncating large files.
+ *
+ * If we have any problem deferring the truncate, just truncate it right away.
+ * If we defer it, we also mark how many blocks it would free, so that we
+ * can keep the statfs data correct, and we know if we should sleep on the
+ * delete thread when we run out of space.
+ *
+ * old_inode is the inode being truncated.  Nothing is returned: on every
+ * path the work is either queued for the delete thread or handed to
+ * ext3_truncate() before returning.
+ */
+void ext3_truncate_thread(struct inode *old_inode)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
+	struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
+	struct inode *new_inode;
+	handle_t *handle;
+	unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
+
+	/* Nothing to defer to unless the option is set and the delete thread
+	 * has initialized its list (s_delete_list.next non-NULL). */
+	if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
+		goto out_truncate;
+
+	/* XXX This is a temporary limitation for code simplicity.
+	 *     We could truncate to arbitrary sizes at some later time.
+	 */
+	if (old_inode->i_size != 0)
+		goto out_truncate;
+
+	/* We may want to truncate the inode immediately and not defer it */
+	if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
+	    old_inode->i_size > oei->i_disksize)
+		goto out_truncate;
+
+	/* We can't use the delete thread as-is during real orphan recovery,
+	 * as we add to the orphan list here, causing ext3_orphan_cleanup()
+	 * to loop endlessly.  It would be nice to do so, but needs work.
+	 */
+	if (oei->i_state & EXT3_STATE_DELETE ||
+	    sbi->s_mount_state & EXT3_ORPHAN_FS) {
+		ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
+			   old_inode->i_ino, blocks);
+		goto out_truncate;
+	}
+
+	ext3_discard_prealloc(old_inode);
+
+	/* old_inode   = 1
+	 * new_inode   = sb + GDT + ibitmap
+	 * orphan list = 1 inode/superblock for add, 2 inodes for del
+	 * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
+	 */
+	handle = ext3_journal_start(old_inode, 7);
+	if (IS_ERR(handle))
+		goto out_truncate;
+
+	new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
+	if (IS_ERR(new_inode)) {
+		ext3_debug("truncate inode %lu directly (no new inodes)\n",
+			   old_inode->i_ino);
+		goto out_journal;
+	}
+
+	nei = EXT3_I(new_inode);
+
+	/* Hand the block map and its accounting over to new_inode while
+	 * holding old_inode's truncate_sem for writing. */
+	down_write(&oei->truncate_sem);
+	new_inode->i_size = old_inode->i_size;
+	new_inode->i_blocks = old_inode->i_blocks;
+	new_inode->i_uid = old_inode->i_uid;
+	new_inode->i_gid = old_inode->i_gid;
+	/* The delete thread asserts i_nlink == 1 before dropping it to 0. */
+	new_inode->i_nlink = 1;
+
+	/* FIXME when we do arbitrary truncates */
+	old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
+	old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
+
+	memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
+	memset(oei->i_data, 0, sizeof(oei->i_data));
+
+	nei->i_disksize = oei->i_disksize;
+	nei->i_state |= EXT3_STATE_DELETE;
+	up_write(&oei->truncate_sem);
+
+	/* NOTE(review): from here on the blocks belong to new_inode, so the
+	 * ext3_truncate(old_inode) fallback reached from the two failure
+	 * paths below has no blocks left to free.  Neither path moves them
+	 * back; confirm whether that is acceptable.
+	 */
+	if (ext3_orphan_add(handle, new_inode) < 0) {
+		/* Drop the reference obtained from ext3_new_inode(), as the
+		 * ext3_orphan_del() failure path below does; new_inode was
+		 * never queued, so nobody else would release it. */
+		iput(new_inode);
+		goto out_journal;
+	}
+
+	if (ext3_orphan_del(handle, old_inode) < 0) {
+		ext3_orphan_del(handle, new_inode);
+		iput(new_inode);
+		goto out_journal;
+	}
+
+	ext3_journal_stop(handle, old_inode);
+
+	/* Queue new_inode; our reference to it now belongs to the delete
+	 * thread, which releases it with iput(). */
+	spin_lock(&sbi->s_delete_lock);
+	J_ASSERT(list_empty(&new_inode->i_devices));
+	list_add_tail(&new_inode->i_devices, &sbi->s_delete_list);
+	sbi->s_delete_blocks += blocks;
+	sbi->s_delete_inodes++;
+	spin_unlock(&sbi->s_delete_lock);
+
+	ext3_debug("delete inode %lu (%lu blocks) by thread\n",
+		   new_inode->i_ino, blocks);
+
+	wake_up(&sbi->s_delete_thread_queue);
+	return;
+
+out_journal:
+	ext3_journal_stop(handle, old_inode);
+out_truncate:
+	ext3_truncate(old_inode);
+}
+#endif /* EXT3_DELETE_THREAD */
+
/*
* On success, We end up with an outstanding reference count against
* iloc->bh. This _must_ be cleaned up later.
===================================================================
@@ -123,7 +123,11 @@
};
struct inode_operations ext3_file_inode_operations = {
+#ifdef EXT3_DELETE_THREAD
+ truncate: ext3_truncate_thread, /* BKL held */
+#else
truncate: ext3_truncate, /* BKL held */
+#endif
setattr: ext3_setattr, /* BKL held */
setxattr: ext3_setxattr, /* BKL held */
getxattr: ext3_getxattr, /* BKL held */
===================================================================
@@ -838,6 +838,40 @@
return retval;
}
+#ifdef EXT3_DELETE_THREAD
+/*
+ * Try to hand the deletion of an inode whose link count has just reached
+ * zero over to the delete thread.
+ *
+ * Nothing is done (and the caller's normal path deletes the inode) when
+ * ASYNCDEL is off, when no delete thread is running, when the inode is
+ * IS_SYNC(), or when it has no more than EXT3_NDIR_BLOCKS blocks.
+ * Otherwise the inode is marked EXT3_STATE_DELETE, given an extra
+ * reference and an i_nlink of 1, and appended to s_delete_list.
+ *
+ * Always returns 0.
+ */
+static int ext3_try_to_delay_deletion(struct inode *inode)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	unsigned long blocks;
+
+	/* s_delete_list.next is non-NULL only while the delete thread is
+	 * running.  The mount option alone is not enough: it can be set
+	 * while the list head is still zeroed, and list_add_tail() on a
+	 * zeroed head would dereference NULL. */
+	if (!test_opt(inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
+		return 0;
+
+	/* We may want to delete the inode immediately and not defer it */
+	blocks = inode->i_blocks >> (inode->i_blkbits - 9);
+	if (IS_SYNC(inode) || blocks <= EXT3_NDIR_BLOCKS)
+		return 0;
+
+	/* Put the link count back to 1 and take a reference on behalf of
+	 * the queue; ext3_delete_thread() asserts i_nlink == 1, zeroes it
+	 * and drops this reference with iput(). */
+	inode->i_nlink = 1;
+	atomic_inc(&inode->i_count);
+	ei->i_state |= EXT3_STATE_DELETE;
+
+	spin_lock(&sbi->s_delete_lock);
+	J_ASSERT(list_empty(&inode->i_devices));
+	list_add_tail(&inode->i_devices, &sbi->s_delete_list);
+	sbi->s_delete_blocks += blocks;
+	sbi->s_delete_inodes++;
+	spin_unlock(&sbi->s_delete_lock);
+
+	wake_up(&sbi->s_delete_thread_queue);
+
+	return 0;
+}
+#else
+#define ext3_try_to_delay_deletion(inode) do {} while (0)
+#endif
+
static int ext3_unlink(struct inode * dir, struct dentry *dentry)
{
int retval;
@@ -878,8 +912,10 @@
dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
ext3_mark_inode_dirty(handle, dir);
inode->i_nlink--;
- if (!inode->i_nlink)
+ if (!inode->i_nlink) {
+ ext3_try_to_delay_deletion(inode);
ext3_orphan_add(handle, inode);
+ }
inode->i_ctime = dir->i_ctime;
ext3_mark_inode_dirty(handle, inode);
retval = 0;
===================================================================
@@ -188,6 +188,7 @@
*/
#define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */
#define EXT3_STATE_NEW 0x00000002 /* inode is newly created */
+#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */
/*
* ioctl commands
@@ -315,6 +316,7 @@
#define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */
#define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
+#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
@@ -639,6 +641,9 @@
extern void ext3_dirty_inode(struct inode *);
extern int ext3_change_inode_journal_flag(struct inode *, int);
extern void ext3_truncate (struct inode *);
+#ifdef EXT3_DELETE_THREAD
+extern void ext3_truncate_thread(struct inode *inode);
+#endif
extern void ext3_set_inode_flags(struct inode *);
/* ioctl.c */
===================================================================
@@ -29,6 +29,8 @@
#define EXT3_MAX_GROUP_LOADED 8
+#define EXT3_DELETE_THREAD
+
/*
* third extended-fs super-block data in memory
*/
@@ -74,6 +76,14 @@
struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
#endif
+#ifdef EXT3_DELETE_THREAD
+ spinlock_t s_delete_lock;
+ struct list_head s_delete_list;
+ unsigned long s_delete_blocks;
+ unsigned long s_delete_inodes;
+ wait_queue_head_t s_delete_thread_queue;
+ wait_queue_head_t s_delete_waiter_queue;
+#endif
};
#endif /* _LINUX_EXT3_FS_SB */