diff mbox

ext4: expand commit callback

Message ID 1319626156-2444-1-git-send-email-bobijam@whamcloud.com
State Superseded, archived
Headers show

Commit Message

Bobi Jam Oct. 26, 2011, 10:49 a.m. UTC
The per-commit callback is now used by mballoc code to manage free
space bitmaps after deleted blocks have been released. This patch
expands it to contain multiple different callbacks.

Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
---
 fs/ext4/ext4_jbd2.h |   72 ++++++++++++++++++++++++
 fs/ext4/mballoc.c   |  150 ++++++++++++++++++++++++---------------------------
 fs/ext4/mballoc.h   |   18 ++++---
 fs/ext4/super.c     |   18 ++++++
 4 files changed, 170 insertions(+), 88 deletions(-)

Comments

Bobi Jam Nov. 4, 2011, 12:17 p.m. UTC | #1
ping. Any comment on this patch proposal? Thanks.

On 2011-10-26 18:49, Bobi Jam wrote:
> The per-commit callback is now used by mballoc code to manage free
> space bitmaps after deleted blocks have been released. This patch
> expand it to contain multiple different callbacks.
> 
> Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
> Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
> ---
>  fs/ext4/ext4_jbd2.h |   72 ++++++++++++++++++++++++
>  fs/ext4/mballoc.c   |  150 ++++++++++++++++++++++++---------------------------
>  fs/ext4/mballoc.h   |   18 ++++---
>  fs/ext4/super.c     |   18 ++++++
>  4 files changed, 170 insertions(+), 88 deletions(-)
> 
> diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> index 5802fa1..c65a774 100644
> --- a/fs/ext4/ext4_jbd2.h
> +++ b/fs/ext4/ext4_jbd2.h
> @@ -104,6 +104,78 @@
>  #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
>  #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
>  
> +/**
> + *   struct ext4_journal_cb_entry - Base structure for callback information.
> + *
> + *   This struct is a 'seed' structure for a using with your own callback
> + *   structs. If you are using callbacks you must allocate one of these
> + *   or another struct of your own definition which has this struct
> + *   as it's first element and pass it to ext4_journal_callback_add().
> + */
> +struct ext4_journal_cb_entry {
> +	/* list information for other callbacks attached to the same handle */
> +	struct list_head jce_list;
> +
> +	/*  Function to call with this callback structure */
> +	void (*jce_func)(struct super_block *sb,
> +			 struct ext4_journal_cb_entry *jce, int error);
> +
> +	/* user data goes here */
> +};
> +
> +/**
> + * ext4_journal_callback_add: add a function to call after transaction commit
> + * @handle: active journal transaction handle to register callback on
> + * @func: callback function to call after the transaction has committed:
> + *        @sb: superblock of current filesystem for transaction
> + *        @jce: returned journal callback data
> + *        @rc: journal state at commit (0 = transaction committed properly)
> + * @jce: journal callback data (internal and function private data struct)
> + *
> + * The registered function will be called in the context of the journal thread
> + * after the transaction for which the handle was created has completed.
> + *
> + * No locks are held when the callback function is called, so it is safe to
> + * call blocking functions from within the callback, but the callback should
> + * not block or run for too long, or the filesystem will be blocked waiting for
> + * the next transaction to commit. No journaling functions can be used, or
> + * there is a risk of deadlock.
> + *
> + * There is no guaranteed calling order of multiple registered callbacks on
> + * the same transaction.
> + */
> +static inline void ext4_journal_callback_add(handle_t *handle,
> +			void (*func)(struct super_block *sb,
> +				     struct ext4_journal_cb_entry *jce,
> +				     int rc),
> +			struct ext4_journal_cb_entry *jce)
> +{
> +	struct ext4_sb_info *sbi =
> +			EXT4_SB(handle->h_transaction->t_journal->j_private);
> +
> +	/* Add the jce to transaction's private list */
> +	jce->jce_func = func;
> +	spin_lock(&sbi->s_md_lock);
> +	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
> +	spin_unlock(&sbi->s_md_lock);
> +}
> +
> +/**
> + * ext4_journal_callback_del: delete a registered callback
> + * @handle: active journal transaction handle on which callback was registered
> + * @jce: registered journal callback entry to unregister
> + */
> +static inline void ext4_journal_callback_del(handle_t *handle,
> +					     struct ext4_journal_cb_entry *jce)
> +{
> +	struct ext4_sb_info *sbi =
> +			EXT4_SB(handle->h_transaction->t_journal->j_private);
> +
> +	spin_lock(&sbi->s_md_lock);
> +	list_del_init(&jce->jce_list);
> +	spin_unlock(&sbi->s_md_lock);
> +}
> +
>  int
>  ext4_mark_iloc_dirty(handle_t *handle,
>  		     struct inode *inode,
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 17a5a57..b90da57 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -21,6 +21,7 @@
>   * mballoc.c contains the multiblocks allocation routines
>   */
>  
> +#include "ext4_jbd2.h"
>  #include "mballoc.h"
>  #include <linux/debugfs.h>
>  #include <linux/slab.h>
> @@ -338,7 +339,7 @@
>   */
>  static struct kmem_cache *ext4_pspace_cachep;
>  static struct kmem_cache *ext4_ac_cachep;
> -static struct kmem_cache *ext4_free_ext_cachep;
> +static struct kmem_cache *ext4_free_data_cachep;
>  
>  /* We create slab caches for groupinfo data structures based on the
>   * superblock block size.  There will be one per mounted filesystem for
> @@ -356,7 +357,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
>  					ext4_group_t group);
>  static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
>  						ext4_group_t group);
> -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
> +static void ext4_free_data_callback(struct super_block *sb,
> +					struct ext4_journal_cb_entry *jce, int error);
>  
>  static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
>  {
> @@ -2511,8 +2513,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
>  		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
>  				 &ext4_mb_seq_groups_fops, sb);
>  
> -	if (sbi->s_journal)
> -		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
>  out:
>  	if (ret) {
>  		kfree(sbi->s_mb_offsets);
> @@ -2616,58 +2616,54 @@ static inline int ext4_issue_discard(struct super_block *sb,
>   * This function is called by the jbd2 layer once the commit has finished,
>   * so we know we can free the blocks that were released with that commit.
>   */
> -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
> +static void ext4_free_data_callback(struct super_block *sb,
> +				    struct ext4_journal_cb_entry *jce,
> +				    int rc)
>  {
> -	struct super_block *sb = journal->j_private;
> +	struct ext4_free_data *entry = (struct ext4_free_data *)jce;
>  	struct ext4_buddy e4b;
>  	struct ext4_group_info *db;
>  	int err, count = 0, count2 = 0;
> -	struct ext4_free_data *entry;
> -	struct list_head *l, *ltmp;
>  
> -	list_for_each_safe(l, ltmp, &txn->t_private_list) {
> -		entry = list_entry(l, struct ext4_free_data, list);
> +	mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
> +		 entry->efd_count, entry->efd_group, entry);
>  
> -		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
> -			 entry->count, entry->group, entry);
> +	if (test_opt(sb, DISCARD))
> +		ext4_issue_discard(sb, entry->efd_group,
> +				   entry->efd_start_blk, entry->efd_count);
>  
> -		if (test_opt(sb, DISCARD))
> -			ext4_issue_discard(sb, entry->group,
> -					   entry->start_blk, entry->count);
> +	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
> +	/* we expect to find existing buddy because it's pinned */
> +	BUG_ON(err != 0);
>  
> -		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
> -		/* we expect to find existing buddy because it's pinned */
> -		BUG_ON(err != 0);
> +	db = e4b.bd_info;
> +	/* there are blocks to put in buddy to make them really free */
> +	count += entry->efd_count;
> +	count2++;
> +	ext4_lock_group(sb, entry->efd_group);
> +	/* Take it out of per group rb tree */
> +	rb_erase(&entry->efd_node, &(db->bb_free_root));
> +	mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count);
>  
> -		db = e4b.bd_info;
> -		/* there are blocks to put in buddy to make them really free */
> -		count += entry->count;
> -		count2++;
> -		ext4_lock_group(sb, entry->group);
> -		/* Take it out of per group rb tree */
> -		rb_erase(&entry->node, &(db->bb_free_root));
> -		mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
> +	/*
> +	 * Clear the trimmed flag for the group so that the next
> +	 * ext4_trim_fs can trim it.
> +	 * If the volume is mounted with -o discard, online discard
> +	 * is supported and the free blocks will be trimmed online.
> +	 */
> +	if (!test_opt(sb, DISCARD))
> +		EXT4_MB_GRP_CLEAR_TRIMMED(db);
>  
> -		/*
> -		 * Clear the trimmed flag for the group so that the next
> -		 * ext4_trim_fs can trim it.
> -		 * If the volume is mounted with -o discard, online discard
> -		 * is supported and the free blocks will be trimmed online.
> +	if (!db->bb_free_root.rb_node) {
> +		/* No more items in the per group rb tree
> +		 * balance refcounts from ext4_mb_free_metadata()
>  		 */
> -		if (!test_opt(sb, DISCARD))
> -			EXT4_MB_GRP_CLEAR_TRIMMED(db);
> -
> -		if (!db->bb_free_root.rb_node) {
> -			/* No more items in the per group rb tree
> -			 * balance refcounts from ext4_mb_free_metadata()
> -			 */
> -			page_cache_release(e4b.bd_buddy_page);
> -			page_cache_release(e4b.bd_bitmap_page);
> -		}
> -		ext4_unlock_group(sb, entry->group);
> -		kmem_cache_free(ext4_free_ext_cachep, entry);
> -		ext4_mb_unload_buddy(&e4b);
> +		page_cache_release(e4b.bd_buddy_page);
> +		page_cache_release(e4b.bd_bitmap_page);
>  	}
> +	ext4_unlock_group(sb, entry->efd_group);
> +	kmem_cache_free(ext4_free_data_cachep, entry);
> +	ext4_mb_unload_buddy(&e4b);
>  
>  	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
>  }
> @@ -2720,9 +2716,9 @@ int __init ext4_init_mballoc(void)
>  		return -ENOMEM;
>  	}
>  
> -	ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
> -					  SLAB_RECLAIM_ACCOUNT);
> -	if (ext4_free_ext_cachep == NULL) {
> +	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
> +					   SLAB_RECLAIM_ACCOUNT);
> +	if (ext4_free_data_cachep == NULL) {
>  		kmem_cache_destroy(ext4_pspace_cachep);
>  		kmem_cache_destroy(ext4_ac_cachep);
>  		return -ENOMEM;
> @@ -2740,7 +2736,7 @@ void ext4_exit_mballoc(void)
>  	rcu_barrier();
>  	kmem_cache_destroy(ext4_pspace_cachep);
>  	kmem_cache_destroy(ext4_ac_cachep);
> -	kmem_cache_destroy(ext4_free_ext_cachep);
> +	kmem_cache_destroy(ext4_free_data_cachep);
>  	ext4_groupinfo_destroy_slabs();
>  	ext4_remove_debugfs_entry();
>  }
> @@ -3290,8 +3286,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
>  	n = rb_first(&(grp->bb_free_root));
>  
>  	while (n) {
> -		entry = rb_entry(n, struct ext4_free_data, node);
> -		ext4_set_bits(bitmap, entry->start_blk, entry->count);
> +		entry = rb_entry(n, struct ext4_free_data, efd_node);
> +		ext4_set_bits(bitmap, entry->efd_start_blk, entry->efd_count);
>  		n = rb_next(n);
>  	}
>  	return;
> @@ -4386,9 +4382,9 @@ out:
>  static int can_merge(struct ext4_free_data *entry1,
>  			struct ext4_free_data *entry2)
>  {
> -	if ((entry1->t_tid == entry2->t_tid) &&
> -	    (entry1->group == entry2->group) &&
> -	    ((entry1->start_blk + entry1->count) == entry2->start_blk))
> +	if ((entry1->efd_tid == entry2->efd_tid) &&
> +	    (entry1->efd_group == entry2->efd_group) &&
> +	    ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk))
>  		return 1;
>  	return 0;
>  }
> @@ -4402,7 +4398,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	struct ext4_free_data *entry;
>  	struct ext4_group_info *db = e4b->bd_info;
>  	struct super_block *sb = e4b->bd_sb;
> -	struct ext4_sb_info *sbi = EXT4_SB(sb);
>  	struct rb_node **n = &db->bb_free_root.rb_node, *node;
>  	struct rb_node *parent = NULL, *new_node;
>  
> @@ -4410,8 +4405,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	BUG_ON(e4b->bd_bitmap_page == NULL);
>  	BUG_ON(e4b->bd_buddy_page == NULL);
>  
> -	new_node = &new_entry->node;
> -	block = new_entry->start_blk;
> +	new_node = &new_entry->efd_node;
> +	block = new_entry->efd_start_blk;
>  
>  	if (!*n) {
>  		/* first free block exent. We need to
> @@ -4424,10 +4419,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	}
>  	while (*n) {
>  		parent = *n;
> -		entry = rb_entry(parent, struct ext4_free_data, node);
> -		if (block < entry->start_blk)
> +		entry = rb_entry(parent, struct ext4_free_data, efd_node);
> +		if (block < entry->efd_start_blk)
>  			n = &(*n)->rb_left;
> -		else if (block >= (entry->start_blk + entry->count))
> +		else if (block >= (entry->efd_start_blk + entry->efd_count))
>  			n = &(*n)->rb_right;
>  		else {
>  			ext4_grp_locked_error(sb, group, 0,
> @@ -4443,34 +4438,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	/* Now try to see the extent can be merged to left and right */
>  	node = rb_prev(new_node);
>  	if (node) {
> -		entry = rb_entry(node, struct ext4_free_data, node);
> +		entry = rb_entry(node, struct ext4_free_data, efd_node);
>  		if (can_merge(entry, new_entry)) {
> -			new_entry->start_blk = entry->start_blk;
> -			new_entry->count += entry->count;
> +			new_entry->efd_start_blk = entry->efd_start_blk;
> +			new_entry->efd_count += entry->efd_count;
>  			rb_erase(node, &(db->bb_free_root));
> -			spin_lock(&sbi->s_md_lock);
> -			list_del(&entry->list);
> -			spin_unlock(&sbi->s_md_lock);
> -			kmem_cache_free(ext4_free_ext_cachep, entry);
> +			ext4_journal_callback_del(handle, &entry->efd_jce);
> +			kmem_cache_free(ext4_free_data_cachep, entry);
>  		}
>  	}
>  
>  	node = rb_next(new_node);
>  	if (node) {
> -		entry = rb_entry(node, struct ext4_free_data, node);
> +		entry = rb_entry(node, struct ext4_free_data, efd_node);
>  		if (can_merge(new_entry, entry)) {
> -			new_entry->count += entry->count;
> +			new_entry->efd_count += entry->efd_count;
>  			rb_erase(node, &(db->bb_free_root));
> -			spin_lock(&sbi->s_md_lock);
> -			list_del(&entry->list);
> -			spin_unlock(&sbi->s_md_lock);
> -			kmem_cache_free(ext4_free_ext_cachep, entry);
> +			ext4_journal_callback_del(handle, &entry->efd_jce);
> +			kmem_cache_free(ext4_free_data_cachep, entry);
>  		}
>  	}
>  	/* Add the extent to transaction's private list */
> -	spin_lock(&sbi->s_md_lock);
> -	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
> -	spin_unlock(&sbi->s_md_lock);
> +	ext4_journal_callback_add(handle, ext4_free_data_callback,
> +				  &new_entry->efd_jce);
>  	return 0;
>  }
>  
> @@ -4613,15 +4603,15 @@ do_more:
>  		 * blocks being freed are metadata. these blocks shouldn't
>  		 * be used until this transaction is committed
>  		 */
> -		new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
> +		new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
>  		if (!new_entry) {
>  			err = -ENOMEM;
>  			goto error_return;
>  		}
> -		new_entry->start_blk = bit;
> -		new_entry->group  = block_group;
> -		new_entry->count = count;
> -		new_entry->t_tid = handle->h_transaction->t_tid;
> +		new_entry->efd_start_blk = bit;
> +		new_entry->efd_group  = block_group;
> +		new_entry->efd_count = count;
> +		new_entry->efd_tid = handle->h_transaction->t_tid;
>  
>  		ext4_lock_group(sb, block_group);
>  		mb_clear_bits(bitmap_bh->b_data, bit, count);
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index 9d4a636..1e47c2e 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -96,21 +96,23 @@ extern u8 mb_enable_debug;
>  
>  
>  struct ext4_free_data {
> -	/* this links the free block information from group_info */
> -	struct rb_node node;
> +	/* MUST be the first member */
> +	struct ext4_journal_cb_entry	efd_jce;
> +
> +	/* ext4_free_data private data starts from here */
>  
> -	/* this links the free block information from ext4_sb_info */
> -	struct list_head list;
> +	/* this links the free block information from group_info */
> +	struct rb_node			efd_node;
>  
>  	/* group which free block extent belongs */
> -	ext4_group_t group;
> +	ext4_group_t			efd_group;
>  
>  	/* free block extent */
> -	ext4_grpblk_t start_blk;
> -	ext4_grpblk_t count;
> +	ext4_grpblk_t			efd_start_blk;
> +	ext4_grpblk_t			efd_count;
>  
>  	/* transaction which freed this extent */
> -	tid_t	t_tid;
> +	tid_t				efd_tid;
>  };
>  
>  struct ext4_prealloc_space {
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 44d0c8d..5e2205e 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -414,6 +414,22 @@ static void save_error_info(struct super_block *sb, const char *func,
>  	ext4_commit_super(sb, 1);
>  }
>  
> +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
> +{
> +	struct super_block		*sb = journal->j_private;
> +	struct ext4_sb_info		*sbi = EXT4_SB(sb);
> +	int				error = is_journal_aborted(journal);
> +	struct ext4_journal_cb_entry	*jce, *tmp;
> +
> +	spin_lock(&sbi->s_md_lock);
> +	list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
> +		list_del_init(&jce->jce_list);
> +		spin_unlock(&sbi->s_md_lock);
> +		jce->jce_func(sb, jce, error);
> +		spin_lock(&sbi->s_md_lock);
> +	}
> +	spin_unlock(&sbi->s_md_lock);
> +}
>  
>  /* Deal with the reporting of failure conditions on a filesystem such as
>   * inconsistencies detected or read IO failures.
> @@ -3605,6 +3621,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	}
>  	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
>  
> +	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
> +
>  	/*
>  	 * The journal may have updated the bg summary counts, so we
>  	 * need to update the global counters.
Theodore Ts'o Feb. 13, 2012, 9:02 p.m. UTC | #2
On Wed, Oct 26, 2011 at 06:49:16PM +0800, Bobi Jam wrote:
> The per-commit callback is now used by mballoc code to manage free
> space bitmaps after deleted blocks have been released. This patch
> expand it to contain multiple different callbacks.
> 
> Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
> Signed-off-by: Andreas Dilger <adilger@whamcloud.com>

Thanks, applied.

Note: since this callback is called *after* the commit is completed,
we can't really use this to do things like compute metadata checksums
or update atimes more efficiently, since those sorts of things have to
happen *before* we actually finish up the commit.

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger Feb. 13, 2012, 11:50 p.m. UTC | #3
On 2012-02-13, at 2:02 PM, Ted Ts'o wrote:
> On Wed, Oct 26, 2011 at 06:49:16PM +0800, Bobi Jam wrote:
>> The per-commit callback is now used by mballoc code to manage free
>> space bitmaps after deleted blocks have been released. This patch
>> expand it to contain multiple different callbacks.
>> 
>> Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
>> Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
> 
> Thanks, applied.
> 
> Note: since this callback is called *after* the commit is completed,
> we can't really use this to do things like compute metadata checksums
> or update atimes more efficiently, since those sorts of things have to
> happen *before* we actually finish up the commit.

Right, though a similar commit callback (using the same infrastructure)
could be added before the commit is started.

There is already the buffer_head callback that OCFS is using for their
data checksums, which is called before the transaction commit.  I haven't
looked at that closely enough to know whether the ext4 metadata checksums
could use it (e.g. if it can handle different metadata block types with
different checksum chunks and locations).

Cheers, Andreas
--
Andreas Dilger                       Whamcloud, Inc.
Principal Lustre Engineer            http://www.whamcloud.com/




--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5802fa1..c65a774 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -104,6 +104,78 @@ 
 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
+/**
+ *   struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ *   This struct is a 'seed' structure for a using with your own callback
+ *   structs. If you are using callbacks you must allocate one of these
+ *   or another struct of your own definition which has this struct
+ *   as it's first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+	/* list information for other callbacks attached to the same handle */
+	struct list_head jce_list;
+
+	/*  Function to call with this callback structure */
+	void (*jce_func)(struct super_block *sb,
+			 struct ext4_journal_cb_entry *jce, int error);
+
+	/* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ *        @sb: superblock of current filesystem for transaction
+ *        @jce: returned journal callback data
+ *        @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting for
+ * the next transaction to commit. No journaling functions can be used, or
+ * there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+			void (*func)(struct super_block *sb,
+				     struct ext4_journal_cb_entry *jce,
+				     int rc),
+			struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	/* Add the jce to transaction's private list */
+	jce->jce_func = func;
+	spin_lock(&sbi->s_md_lock);
+	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ */
+static inline void ext4_journal_callback_del(handle_t *handle,
+					     struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	spin_lock(&sbi->s_md_lock);
+	list_del_init(&jce->jce_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
 int
 ext4_mark_iloc_dirty(handle_t *handle,
 		     struct inode *inode,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 17a5a57..b90da57 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,6 +21,7 @@ 
  * mballoc.c contains the multiblocks allocation routines
  */
 
+#include "ext4_jbd2.h"
 #include "mballoc.h"
 #include <linux/debugfs.h>
 #include <linux/slab.h>
@@ -338,7 +339,7 @@ 
  */
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
+static struct kmem_cache *ext4_free_data_cachep;
 
 /* We create slab caches for groupinfo data structures based on the
  * superblock block size.  There will be one per mounted filesystem for
@@ -356,7 +357,8 @@  static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+static void ext4_free_data_callback(struct super_block *sb,
+					struct ext4_journal_cb_entry *jce, int error);
 
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
@@ -2511,8 +2513,6 @@  int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_groups_fops, sb);
 
-	if (sbi->s_journal)
-		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
 out:
 	if (ret) {
 		kfree(sbi->s_mb_offsets);
@@ -2616,58 +2616,54 @@  static inline int ext4_issue_discard(struct super_block *sb,
  * This function is called by the jbd2 layer once the commit has finished,
  * so we know we can free the blocks that were released with that commit.
  */
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
+static void ext4_free_data_callback(struct super_block *sb,
+				    struct ext4_journal_cb_entry *jce,
+				    int rc)
 {
-	struct super_block *sb = journal->j_private;
+	struct ext4_free_data *entry = (struct ext4_free_data *)jce;
 	struct ext4_buddy e4b;
 	struct ext4_group_info *db;
 	int err, count = 0, count2 = 0;
-	struct ext4_free_data *entry;
-	struct list_head *l, *ltmp;
 
-	list_for_each_safe(l, ltmp, &txn->t_private_list) {
-		entry = list_entry(l, struct ext4_free_data, list);
+	mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+		 entry->efd_count, entry->efd_group, entry);
 
-		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
-			 entry->count, entry->group, entry);
+	if (test_opt(sb, DISCARD))
+		ext4_issue_discard(sb, entry->efd_group,
+				   entry->efd_start_blk, entry->efd_count);
 
-		if (test_opt(sb, DISCARD))
-			ext4_issue_discard(sb, entry->group,
-					   entry->start_blk, entry->count);
+	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
+	/* we expect to find existing buddy because it's pinned */
+	BUG_ON(err != 0);
 
-		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
-		/* we expect to find existing buddy because it's pinned */
-		BUG_ON(err != 0);
+	db = e4b.bd_info;
+	/* there are blocks to put in buddy to make them really free */
+	count += entry->efd_count;
+	count2++;
+	ext4_lock_group(sb, entry->efd_group);
+	/* Take it out of per group rb tree */
+	rb_erase(&entry->efd_node, &(db->bb_free_root));
+	mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count);
 
-		db = e4b.bd_info;
-		/* there are blocks to put in buddy to make them really free */
-		count += entry->count;
-		count2++;
-		ext4_lock_group(sb, entry->group);
-		/* Take it out of per group rb tree */
-		rb_erase(&entry->node, &(db->bb_free_root));
-		mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+	/*
+	 * Clear the trimmed flag for the group so that the next
+	 * ext4_trim_fs can trim it.
+	 * If the volume is mounted with -o discard, online discard
+	 * is supported and the free blocks will be trimmed online.
+	 */
+	if (!test_opt(sb, DISCARD))
+		EXT4_MB_GRP_CLEAR_TRIMMED(db);
 
-		/*
-		 * Clear the trimmed flag for the group so that the next
-		 * ext4_trim_fs can trim it.
-		 * If the volume is mounted with -o discard, online discard
-		 * is supported and the free blocks will be trimmed online.
+	if (!db->bb_free_root.rb_node) {
+		/* No more items in the per group rb tree
+		 * balance refcounts from ext4_mb_free_metadata()
 		 */
-		if (!test_opt(sb, DISCARD))
-			EXT4_MB_GRP_CLEAR_TRIMMED(db);
-
-		if (!db->bb_free_root.rb_node) {
-			/* No more items in the per group rb tree
-			 * balance refcounts from ext4_mb_free_metadata()
-			 */
-			page_cache_release(e4b.bd_buddy_page);
-			page_cache_release(e4b.bd_bitmap_page);
-		}
-		ext4_unlock_group(sb, entry->group);
-		kmem_cache_free(ext4_free_ext_cachep, entry);
-		ext4_mb_unload_buddy(&e4b);
+		page_cache_release(e4b.bd_buddy_page);
+		page_cache_release(e4b.bd_bitmap_page);
 	}
+	ext4_unlock_group(sb, entry->efd_group);
+	kmem_cache_free(ext4_free_data_cachep, entry);
+	ext4_mb_unload_buddy(&e4b);
 
 	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
 }
@@ -2720,9 +2716,9 @@  int __init ext4_init_mballoc(void)
 		return -ENOMEM;
 	}
 
-	ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
-					  SLAB_RECLAIM_ACCOUNT);
-	if (ext4_free_ext_cachep == NULL) {
+	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
+					   SLAB_RECLAIM_ACCOUNT);
+	if (ext4_free_data_cachep == NULL) {
 		kmem_cache_destroy(ext4_pspace_cachep);
 		kmem_cache_destroy(ext4_ac_cachep);
 		return -ENOMEM;
@@ -2740,7 +2736,7 @@  void ext4_exit_mballoc(void)
 	rcu_barrier();
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
-	kmem_cache_destroy(ext4_free_ext_cachep);
+	kmem_cache_destroy(ext4_free_data_cachep);
 	ext4_groupinfo_destroy_slabs();
 	ext4_remove_debugfs_entry();
 }
@@ -3290,8 +3286,8 @@  static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 	n = rb_first(&(grp->bb_free_root));
 
 	while (n) {
-		entry = rb_entry(n, struct ext4_free_data, node);
-		ext4_set_bits(bitmap, entry->start_blk, entry->count);
+		entry = rb_entry(n, struct ext4_free_data, efd_node);
+		ext4_set_bits(bitmap, entry->efd_start_blk, entry->efd_count);
 		n = rb_next(n);
 	}
 	return;
@@ -4386,9 +4382,9 @@  out:
 static int can_merge(struct ext4_free_data *entry1,
 			struct ext4_free_data *entry2)
 {
-	if ((entry1->t_tid == entry2->t_tid) &&
-	    (entry1->group == entry2->group) &&
-	    ((entry1->start_blk + entry1->count) == entry2->start_blk))
+	if ((entry1->efd_tid == entry2->efd_tid) &&
+	    (entry1->efd_group == entry2->efd_group) &&
+	    ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk))
 		return 1;
 	return 0;
 }
@@ -4402,7 +4398,6 @@  ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	struct ext4_free_data *entry;
 	struct ext4_group_info *db = e4b->bd_info;
 	struct super_block *sb = e4b->bd_sb;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct rb_node **n = &db->bb_free_root.rb_node, *node;
 	struct rb_node *parent = NULL, *new_node;
 
@@ -4410,8 +4405,8 @@  ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	new_node = &new_entry->node;
-	block = new_entry->start_blk;
+	new_node = &new_entry->efd_node;
+	block = new_entry->efd_start_blk;
 
 	if (!*n) {
 		/* first free block exent. We need to
@@ -4424,10 +4419,10 @@  ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	}
 	while (*n) {
 		parent = *n;
-		entry = rb_entry(parent, struct ext4_free_data, node);
-		if (block < entry->start_blk)
+		entry = rb_entry(parent, struct ext4_free_data, efd_node);
+		if (block < entry->efd_start_blk)
 			n = &(*n)->rb_left;
-		else if (block >= (entry->start_blk + entry->count))
+		else if (block >= (entry->efd_start_blk + entry->efd_count))
 			n = &(*n)->rb_right;
 		else {
 			ext4_grp_locked_error(sb, group, 0,
@@ -4443,34 +4438,29 @@  ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	/* Now try to see the extent can be merged to left and right */
 	node = rb_prev(new_node);
 	if (node) {
-		entry = rb_entry(node, struct ext4_free_data, node);
+		entry = rb_entry(node, struct ext4_free_data, efd_node);
 		if (can_merge(entry, new_entry)) {
-			new_entry->start_blk = entry->start_blk;
-			new_entry->count += entry->count;
+			new_entry->efd_start_blk = entry->efd_start_blk;
+			new_entry->efd_count += entry->efd_count;
 			rb_erase(node, &(db->bb_free_root));
-			spin_lock(&sbi->s_md_lock);
-			list_del(&entry->list);
-			spin_unlock(&sbi->s_md_lock);
-			kmem_cache_free(ext4_free_ext_cachep, entry);
+			ext4_journal_callback_del(handle, &entry->efd_jce);
+			kmem_cache_free(ext4_free_data_cachep, entry);
 		}
 	}
 
 	node = rb_next(new_node);
 	if (node) {
-		entry = rb_entry(node, struct ext4_free_data, node);
+		entry = rb_entry(node, struct ext4_free_data, efd_node);
 		if (can_merge(new_entry, entry)) {
-			new_entry->count += entry->count;
+			new_entry->efd_count += entry->efd_count;
 			rb_erase(node, &(db->bb_free_root));
-			spin_lock(&sbi->s_md_lock);
-			list_del(&entry->list);
-			spin_unlock(&sbi->s_md_lock);
-			kmem_cache_free(ext4_free_ext_cachep, entry);
+			ext4_journal_callback_del(handle, &entry->efd_jce);
+			kmem_cache_free(ext4_free_data_cachep, entry);
 		}
 	}
 	/* Add the extent to transaction's private list */
-	spin_lock(&sbi->s_md_lock);
-	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
-	spin_unlock(&sbi->s_md_lock);
+	ext4_journal_callback_add(handle, ext4_free_data_callback,
+				  &new_entry->efd_jce);
 	return 0;
 }
 
@@ -4613,15 +4603,15 @@  do_more:
 		 * blocks being freed are metadata. these blocks shouldn't
 		 * be used until this transaction is committed
 		 */
-		new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+		new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
 		if (!new_entry) {
 			err = -ENOMEM;
 			goto error_return;
 		}
-		new_entry->start_blk = bit;
-		new_entry->group  = block_group;
-		new_entry->count = count;
-		new_entry->t_tid = handle->h_transaction->t_tid;
+		new_entry->efd_start_blk = bit;
+		new_entry->efd_group  = block_group;
+		new_entry->efd_count = count;
+		new_entry->efd_tid = handle->h_transaction->t_tid;
 
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 9d4a636..1e47c2e 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -96,21 +96,23 @@  extern u8 mb_enable_debug;
 
 
 struct ext4_free_data {
-	/* this links the free block information from group_info */
-	struct rb_node node;
+	/* MUST be the first member */
+	struct ext4_journal_cb_entry	efd_jce;
+
+	/* ext4_free_data private data starts from here */
 
-	/* this links the free block information from ext4_sb_info */
-	struct list_head list;
+	/* this links the free block information from group_info */
+	struct rb_node			efd_node;
 
 	/* group which free block extent belongs */
-	ext4_group_t group;
+	ext4_group_t			efd_group;
 
 	/* free block extent */
-	ext4_grpblk_t start_blk;
-	ext4_grpblk_t count;
+	ext4_grpblk_t			efd_start_blk;
+	ext4_grpblk_t			efd_count;
 
 	/* transaction which freed this extent */
-	tid_t	t_tid;
+	tid_t				efd_tid;
 };
 
 struct ext4_prealloc_space {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 44d0c8d..5e2205e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -414,6 +414,30 @@  static void save_error_info(struct super_block *sb, const char *func,
 	ext4_commit_super(sb, 1);
 }
 
+/*
+ * Called by jbd2 once per transaction commit.  Pop the callback entries
+ * one at a time instead of using list_for_each_entry_safe(): the cached
+ * "next" entry could be deleted and freed by another context (e.g. via
+ * ext4_journal_callback_del()) while s_md_lock is dropped around the
+ * callback invocation, leaving a dangling next pointer.
+ */
+static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
+{
+	struct super_block		*sb = journal->j_private;
+	struct ext4_sb_info		*sbi = EXT4_SB(sb);
+	int				error = is_journal_aborted(journal);
+	struct ext4_journal_cb_entry	*jce;
+
+	spin_lock(&sbi->s_md_lock);
+	while (!list_empty(&txn->t_private_list)) {
+		jce = list_first_entry(&txn->t_private_list,
+				       struct ext4_journal_cb_entry, jce_list);
+		list_del_init(&jce->jce_list);
+		spin_unlock(&sbi->s_md_lock);
+		jce->jce_func(sb, jce, error);
+		spin_lock(&sbi->s_md_lock);
+	}
+	spin_unlock(&sbi->s_md_lock);
+}
 
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
@@ -3605,6 +3621,8 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
+	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+
 	/*
 	 * The journal may have updated the bg summary counts, so we
 	 * need to update the global counters.