Patchwork [07/29] jbd2: Refine waiting for shadow buffers

login
register
mail settings
Submitter Jan Kara
Date April 8, 2013, 9:32 p.m.
Message ID <1365456754-29373-8-git-send-email-jack@suse.cz>
Download mbox | patch
Permalink /patch/234917/
State Superseded
Headers show

Comments

Jan Kara - April 8, 2013, 9:32 p.m.
Currently when we add a buffer to a transaction, we wait until the
buffer is removed from BJ_Shadow list (so that we prevent any changes to
the buffer that is just written to the journal). This can take
unnecessarily long as a lot happens between the time the buffer is
submitted to the journal and the time when we remove the buffer from
BJ_Shadow list (e.g.  we wait for all data buffers in the transaction,
we issue a cache flush etc.). Also this creates a dependency of
do_get_write_access() on transaction commit (namely waiting for data IO
to complete) which we want to avoid when implementing transaction
reservation.

So we modify commit code to set new BH_Shadow flag when temporary
shadowing buffer is created and we clear that flag once IO on that
buffer is complete. This allows do_get_write_access() to wait only for
BH_Shadow bit and thus removes the dependency on data IO completion.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd2/commit.c           |   20 ++++++++++----------
 fs/jbd2/journal.c          |    2 ++
 fs/jbd2/transaction.c      |   44 +++++++++++++++++++-------------------------
 include/linux/jbd.h        |   25 +++++++++++++++++++++++++
 include/linux/jbd2.h       |   28 ++++++++++++++++++++++++++++
 include/linux/jbd_common.h |   26 --------------------------
 6 files changed, 84 insertions(+), 61 deletions(-)
Zheng Liu - May 3, 2013, 2:16 p.m.
On Mon, Apr 08, 2013 at 11:32:12PM +0200, Jan Kara wrote:
> Currently when we add a buffer to a transaction, we wait until the
> buffer is removed from BJ_Shadow list (so that we prevent any changes to
> the buffer that is just written to the journal). This can take
> unnecessarily long as a lot happens between the time the buffer is
> submitted to the journal and the time when we remove the buffer from
> BJ_Shadow list (e.g.  we wait for all data buffers in the transaction,
> we issue a cache flush etc.). Also this creates a dependency of
> do_get_write_access() on transaction commit (namely waiting for data IO
> to complete) which we want to avoid when implementing transaction
> reservation.
> 
> So we modify commit code to set new BH_Shadow flag when temporary
> shadowing buffer is created and we clear that flag once IO on that
> buffer is complete. This allows do_get_write_access() to wait only for
> BH_Shadow bit and thus removes the dependency on data IO completion.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>

A minor nit below.  Otherwise the patch looks good to me.
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>

> ---
>  fs/jbd2/commit.c           |   20 ++++++++++----------
>  fs/jbd2/journal.c          |    2 ++
>  fs/jbd2/transaction.c      |   44 +++++++++++++++++++-------------------------
>  include/linux/jbd.h        |   25 +++++++++++++++++++++++++
>  include/linux/jbd2.h       |   28 ++++++++++++++++++++++++++++
>  include/linux/jbd_common.h |   26 --------------------------
>  6 files changed, 84 insertions(+), 61 deletions(-)
> 
> diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> index 1a03762..4863f5b 100644
> --- a/fs/jbd2/commit.c
> +++ b/fs/jbd2/commit.c
> @@ -30,15 +30,22 @@
>  #include <trace/events/jbd2.h>
>  
>  /*
> - * Default IO end handler for temporary BJ_IO buffer_heads.
> + * IO end handler for temporary buffer_heads handling writes to the journal.
>   */
>  static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
>  {
> +	struct buffer_head *orig_bh = bh->b_private;
> +
>  	BUFFER_TRACE(bh, "");
>  	if (uptodate)
>  		set_buffer_uptodate(bh);
>  	else
>  		clear_buffer_uptodate(bh);
> +	if (orig_bh) {
> +		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
> +		smp_mb__after_clear_bit();
> +		wake_up_bit(&orig_bh->b_state, BH_Shadow);
> +	}
>  	unlock_buffer(bh);
>  }
>  
> @@ -818,7 +825,7 @@ start_journal_io:
>  		jbd2_unfile_log_bh(bh);
>  
>  		/*
> -		 * The list contains temporary buffer heads created by
> +		 * The list contains temporary buffer heas created by
                                                      ^^^^
                                                typo: head

Regards,
                                                - Zheng

>  		 * jbd2_journal_write_metadata_buffer().
>  		 */
>  		BUFFER_TRACE(bh, "dumping temporary bh");
> @@ -831,6 +838,7 @@ start_journal_io:
>  		bh = jh2bh(jh);
>  		clear_buffer_jwrite(bh);
>  		J_ASSERT_BH(bh, buffer_jbddirty(bh));
> +		J_ASSERT_BH(bh, !buffer_shadow(bh));
>  
>  		/* The metadata is now released for reuse, but we need
>                     to remember it against this transaction so that when
> @@ -838,14 +846,6 @@ start_journal_io:
>                     required. */
>  		JBUFFER_TRACE(jh, "file as BJ_Forget");
>  		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
> -		/*
> -		 * Wake up any transactions which were waiting for this IO to
> -		 * complete. The barrier must be here so that changes by
> -		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
> -		 * does the waitqueue check.
> -		 */
> -		smp_mb();
> -		wake_up_bit(&bh->b_state, BH_Unshadow);
>  		JBUFFER_TRACE(jh, "brelse shadowed buffer");
>  		__brelse(bh);
>  	}
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index e03aae0..e9a9cdb 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -453,6 +453,7 @@ repeat:
>  	new_bh->b_size = bh_in->b_size;
>  	new_bh->b_bdev = journal->j_dev;
>  	new_bh->b_blocknr = blocknr;
> +	new_bh->b_private = bh_in;
>  	set_buffer_mapped(new_bh);
>  	set_buffer_dirty(new_bh);
>  
> @@ -467,6 +468,7 @@ repeat:
>  	spin_lock(&journal->j_list_lock);
>  	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
>  	spin_unlock(&journal->j_list_lock);
> +	set_buffer_shadow(bh_in);
>  	jbd_unlock_bh_state(bh_in);
>  
>  	return do_escape | (done_copy_out << 1);
> diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> index bc35899..81df09c 100644
> --- a/fs/jbd2/transaction.c
> +++ b/fs/jbd2/transaction.c
> @@ -620,6 +620,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
>  	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
>  }
>  
> +static int sleep_on_shadow_bh(void *word)
> +{
> +	io_schedule();
> +	return 0;
> +}
> +
>  /*
>   * If the buffer is already part of the current transaction, then there
>   * is nothing we need to do.  If it is already part of a prior
> @@ -747,41 +753,29 @@ repeat:
>  		 * journaled.  If the primary copy is already going to
>  		 * disk then we cannot do copy-out here. */
>  
> -		if (jh->b_jlist == BJ_Shadow) {
> -			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
> -			wait_queue_head_t *wqh;
> -
> -			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
> -
> +		if (buffer_shadow(bh)) {
>  			JBUFFER_TRACE(jh, "on shadow: sleep");
>  			jbd_unlock_bh_state(bh);
> -			/* commit wakes up all shadow buffers after IO */
> -			for ( ; ; ) {
> -				prepare_to_wait(wqh, &wait.wait,
> -						TASK_UNINTERRUPTIBLE);
> -				if (jh->b_jlist != BJ_Shadow)
> -					break;
> -				schedule();
> -			}
> -			finish_wait(wqh, &wait.wait);
> +			wait_on_bit(&bh->b_state, BH_Shadow,
> +				    sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
>  			goto repeat;
>  		}
>  
> -		/* Only do the copy if the currently-owning transaction
> -		 * still needs it.  If it is on the Forget list, the
> -		 * committing transaction is past that stage.  The
> -		 * buffer had better remain locked during the kmalloc,
> -		 * but that should be true --- we hold the journal lock
> -		 * still and the buffer is already on the BUF_JOURNAL
> -		 * list so won't be flushed.
> +		/*
> +		 * Only do the copy if the currently-owning transaction still
> +		 * needs it. If buffer isn't on BJ_Metadata list, the
> +		 * committing transaction is past that stage (here we use the
> +		 * fact that BH_Shadow is set under bh_state lock together with
> +		 * refiling to BJ_Shadow list and at this point we know the
> +		 * buffer doesn't have BH_Shadow set).
>  		 *
>  		 * Subtle point, though: if this is a get_undo_access,
>  		 * then we will be relying on the frozen_data to contain
>  		 * the new value of the committed_data record after the
>  		 * transaction, so we HAVE to force the frozen_data copy
> -		 * in that case. */
> -
> -		if (jh->b_jlist != BJ_Forget || force_copy) {
> +		 * in that case.
> +		 */
> +		if (jh->b_jlist == BJ_Metadata || force_copy) {
>  			JBUFFER_TRACE(jh, "generate frozen data");
>  			if (!frozen_buffer) {
>  				JBUFFER_TRACE(jh, "allocate memory for buffer");
> diff --git a/include/linux/jbd.h b/include/linux/jbd.h
> index c8f3297..1c36b0c 100644
> --- a/include/linux/jbd.h
> +++ b/include/linux/jbd.h
> @@ -244,6 +244,31 @@ typedef struct journal_superblock_s
>  
>  #include <linux/fs.h>
>  #include <linux/sched.h>
> +
> +enum jbd_state_bits {
> +	BH_JBD			/* Has an attached ext3 journal_head */
> +	  = BH_PrivateStart,
> +	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
> +	BH_Freed,		/* Has been freed (truncated) */
> +	BH_Revoked,		/* Has been revoked from the log */
> +	BH_RevokeValid,		/* Revoked flag is valid */
> +	BH_JBDDirty,		/* Is dirty but journaled */
> +	BH_State,		/* Pins most journal_head state */
> +	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
> +	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
> +	BH_JBDPrivateStart,	/* First bit available for private use by FS */
> +};
> +
> +BUFFER_FNS(JBD, jbd)
> +BUFFER_FNS(JWrite, jwrite)
> +BUFFER_FNS(JBDDirty, jbddirty)
> +TAS_BUFFER_FNS(JBDDirty, jbddirty)
> +BUFFER_FNS(Revoked, revoked)
> +TAS_BUFFER_FNS(Revoked, revoked)
> +BUFFER_FNS(RevokeValid, revokevalid)
> +TAS_BUFFER_FNS(RevokeValid, revokevalid)
> +BUFFER_FNS(Freed, freed)
> +
>  #include <linux/jbd_common.h>
>  
>  #define J_ASSERT(assert)	BUG_ON(!(assert))
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index 4584518..be5115f 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -302,6 +302,34 @@ typedef struct journal_superblock_s
>  
>  #include <linux/fs.h>
>  #include <linux/sched.h>
> +
> +enum jbd_state_bits {
> +	BH_JBD			/* Has an attached ext3 journal_head */
> +	  = BH_PrivateStart,
> +	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
> +	BH_Freed,		/* Has been freed (truncated) */
> +	BH_Revoked,		/* Has been revoked from the log */
> +	BH_RevokeValid,		/* Revoked flag is valid */
> +	BH_JBDDirty,		/* Is dirty but journaled */
> +	BH_State,		/* Pins most journal_head state */
> +	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
> +	BH_Shadow,		/* IO on shadow buffer is running */
> +	BH_Verified,		/* Metadata block has been verified ok */
> +	BH_JBDPrivateStart,	/* First bit available for private use by FS */
> +};
> +
> +BUFFER_FNS(JBD, jbd)
> +BUFFER_FNS(JWrite, jwrite)
> +BUFFER_FNS(JBDDirty, jbddirty)
> +TAS_BUFFER_FNS(JBDDirty, jbddirty)
> +BUFFER_FNS(Revoked, revoked)
> +TAS_BUFFER_FNS(Revoked, revoked)
> +BUFFER_FNS(RevokeValid, revokevalid)
> +TAS_BUFFER_FNS(RevokeValid, revokevalid)
> +BUFFER_FNS(Freed, freed)
> +BUFFER_FNS(Shadow, shadow)
> +BUFFER_FNS(Verified, verified)
> +
>  #include <linux/jbd_common.h>
>  
>  #define J_ASSERT(assert)	BUG_ON(!(assert))
> diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
> index 6133679..b1f7089 100644
> --- a/include/linux/jbd_common.h
> +++ b/include/linux/jbd_common.h
> @@ -1,32 +1,6 @@
>  #ifndef _LINUX_JBD_STATE_H
>  #define _LINUX_JBD_STATE_H
>  
> -enum jbd_state_bits {
> -	BH_JBD			/* Has an attached ext3 journal_head */
> -	  = BH_PrivateStart,
> -	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
> -	BH_Freed,		/* Has been freed (truncated) */
> -	BH_Revoked,		/* Has been revoked from the log */
> -	BH_RevokeValid,		/* Revoked flag is valid */
> -	BH_JBDDirty,		/* Is dirty but journaled */
> -	BH_State,		/* Pins most journal_head state */
> -	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
> -	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
> -	BH_Verified,		/* Metadata block has been verified ok */
> -	BH_JBDPrivateStart,	/* First bit available for private use by FS */
> -};
> -
> -BUFFER_FNS(JBD, jbd)
> -BUFFER_FNS(JWrite, jwrite)
> -BUFFER_FNS(JBDDirty, jbddirty)
> -TAS_BUFFER_FNS(JBDDirty, jbddirty)
> -BUFFER_FNS(Revoked, revoked)
> -TAS_BUFFER_FNS(Revoked, revoked)
> -BUFFER_FNS(RevokeValid, revokevalid)
> -TAS_BUFFER_FNS(RevokeValid, revokevalid)
> -BUFFER_FNS(Freed, freed)
> -BUFFER_FNS(Verified, verified)
> -
>  static inline struct buffer_head *jh2bh(struct journal_head *jh)
>  {
>  	return jh->b_bh;
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - May 3, 2013, 8:44 p.m.
On Fri 03-05-13 22:16:13, Zheng Liu wrote:
> On Mon, Apr 08, 2013 at 11:32:12PM +0200, Jan Kara wrote:
> > Currently when we add a buffer to a transaction, we wait until the
> > buffer is removed from BJ_Shadow list (so that we prevent any changes to
> > the buffer that is just written to the journal). This can take
> > unnecessarily long as a lot happens between the time the buffer is
> > submitted to the journal and the time when we remove the buffer from
> > BJ_Shadow list (e.g.  we wait for all data buffers in the transaction,
> > we issue a cache flush etc.). Also this creates a dependency of
> > do_get_write_access() on transaction commit (namely waiting for data IO
> > to complete) which we want to avoid when implementing transaction
> > reservation.
> > 
> > So we modify commit code to set new BH_Shadow flag when temporary
> > shadowing buffer is created and we clear that flag once IO on that
> > buffer is complete. This allows do_get_write_access() to wait only for
> > BH_Shadow bit and thus removes the dependency on data IO completion.
> > 
> > Signed-off-by: Jan Kara <jack@suse.cz>
> 
> A minor nit below.  Otherwise the patch looks good to me.
> Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
  Thanks for review! I'll fix that typo.

							Honza
> 
> > ---
> >  fs/jbd2/commit.c           |   20 ++++++++++----------
> >  fs/jbd2/journal.c          |    2 ++
> >  fs/jbd2/transaction.c      |   44 +++++++++++++++++++-------------------------
> >  include/linux/jbd.h        |   25 +++++++++++++++++++++++++
> >  include/linux/jbd2.h       |   28 ++++++++++++++++++++++++++++
> >  include/linux/jbd_common.h |   26 --------------------------
> >  6 files changed, 84 insertions(+), 61 deletions(-)
> > 
> > diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> > index 1a03762..4863f5b 100644
> > --- a/fs/jbd2/commit.c
> > +++ b/fs/jbd2/commit.c
> > @@ -30,15 +30,22 @@
> >  #include <trace/events/jbd2.h>
> >  
> >  /*
> > - * Default IO end handler for temporary BJ_IO buffer_heads.
> > + * IO end handler for temporary buffer_heads handling writes to the journal.
> >   */
> >  static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
> >  {
> > +	struct buffer_head *orig_bh = bh->b_private;
> > +
> >  	BUFFER_TRACE(bh, "");
> >  	if (uptodate)
> >  		set_buffer_uptodate(bh);
> >  	else
> >  		clear_buffer_uptodate(bh);
> > +	if (orig_bh) {
> > +		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
> > +		smp_mb__after_clear_bit();
> > +		wake_up_bit(&orig_bh->b_state, BH_Shadow);
> > +	}
> >  	unlock_buffer(bh);
> >  }
> >  
> > @@ -818,7 +825,7 @@ start_journal_io:
> >  		jbd2_unfile_log_bh(bh);
> >  
> >  		/*
> > -		 * The list contains temporary buffer heads created by
> > +		 * The list contains temporary buffer heas created by
>                                                       ^^^^
>                                                 typo: head
> 
> Regards,
>                                                 - Zheng
> 
> >  		 * jbd2_journal_write_metadata_buffer().
> >  		 */
> >  		BUFFER_TRACE(bh, "dumping temporary bh");
> > @@ -831,6 +838,7 @@ start_journal_io:
> >  		bh = jh2bh(jh);
> >  		clear_buffer_jwrite(bh);
> >  		J_ASSERT_BH(bh, buffer_jbddirty(bh));
> > +		J_ASSERT_BH(bh, !buffer_shadow(bh));
> >  
> >  		/* The metadata is now released for reuse, but we need
> >                     to remember it against this transaction so that when
> > @@ -838,14 +846,6 @@ start_journal_io:
> >                     required. */
> >  		JBUFFER_TRACE(jh, "file as BJ_Forget");
> >  		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
> > -		/*
> > -		 * Wake up any transactions which were waiting for this IO to
> > -		 * complete. The barrier must be here so that changes by
> > -		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
> > -		 * does the waitqueue check.
> > -		 */
> > -		smp_mb();
> > -		wake_up_bit(&bh->b_state, BH_Unshadow);
> >  		JBUFFER_TRACE(jh, "brelse shadowed buffer");
> >  		__brelse(bh);
> >  	}
> > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> > index e03aae0..e9a9cdb 100644
> > --- a/fs/jbd2/journal.c
> > +++ b/fs/jbd2/journal.c
> > @@ -453,6 +453,7 @@ repeat:
> >  	new_bh->b_size = bh_in->b_size;
> >  	new_bh->b_bdev = journal->j_dev;
> >  	new_bh->b_blocknr = blocknr;
> > +	new_bh->b_private = bh_in;
> >  	set_buffer_mapped(new_bh);
> >  	set_buffer_dirty(new_bh);
> >  
> > @@ -467,6 +468,7 @@ repeat:
> >  	spin_lock(&journal->j_list_lock);
> >  	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
> >  	spin_unlock(&journal->j_list_lock);
> > +	set_buffer_shadow(bh_in);
> >  	jbd_unlock_bh_state(bh_in);
> >  
> >  	return do_escape | (done_copy_out << 1);
> > diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> > index bc35899..81df09c 100644
> > --- a/fs/jbd2/transaction.c
> > +++ b/fs/jbd2/transaction.c
> > @@ -620,6 +620,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
> >  	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
> >  }
> >  
> > +static int sleep_on_shadow_bh(void *word)
> > +{
> > +	io_schedule();
> > +	return 0;
> > +}
> > +
> >  /*
> >   * If the buffer is already part of the current transaction, then there
> >   * is nothing we need to do.  If it is already part of a prior
> > @@ -747,41 +753,29 @@ repeat:
> >  		 * journaled.  If the primary copy is already going to
> >  		 * disk then we cannot do copy-out here. */
> >  
> > -		if (jh->b_jlist == BJ_Shadow) {
> > -			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
> > -			wait_queue_head_t *wqh;
> > -
> > -			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
> > -
> > +		if (buffer_shadow(bh)) {
> >  			JBUFFER_TRACE(jh, "on shadow: sleep");
> >  			jbd_unlock_bh_state(bh);
> > -			/* commit wakes up all shadow buffers after IO */
> > -			for ( ; ; ) {
> > -				prepare_to_wait(wqh, &wait.wait,
> > -						TASK_UNINTERRUPTIBLE);
> > -				if (jh->b_jlist != BJ_Shadow)
> > -					break;
> > -				schedule();
> > -			}
> > -			finish_wait(wqh, &wait.wait);
> > +			wait_on_bit(&bh->b_state, BH_Shadow,
> > +				    sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
> >  			goto repeat;
> >  		}
> >  
> > -		/* Only do the copy if the currently-owning transaction
> > -		 * still needs it.  If it is on the Forget list, the
> > -		 * committing transaction is past that stage.  The
> > -		 * buffer had better remain locked during the kmalloc,
> > -		 * but that should be true --- we hold the journal lock
> > -		 * still and the buffer is already on the BUF_JOURNAL
> > -		 * list so won't be flushed.
> > +		/*
> > +		 * Only do the copy if the currently-owning transaction still
> > +		 * needs it. If buffer isn't on BJ_Metadata list, the
> > +		 * committing transaction is past that stage (here we use the
> > +		 * fact that BH_Shadow is set under bh_state lock together with
> > +		 * refiling to BJ_Shadow list and at this point we know the
> > +		 * buffer doesn't have BH_Shadow set).
> >  		 *
> >  		 * Subtle point, though: if this is a get_undo_access,
> >  		 * then we will be relying on the frozen_data to contain
> >  		 * the new value of the committed_data record after the
> >  		 * transaction, so we HAVE to force the frozen_data copy
> > -		 * in that case. */
> > -
> > -		if (jh->b_jlist != BJ_Forget || force_copy) {
> > +		 * in that case.
> > +		 */
> > +		if (jh->b_jlist == BJ_Metadata || force_copy) {
> >  			JBUFFER_TRACE(jh, "generate frozen data");
> >  			if (!frozen_buffer) {
> >  				JBUFFER_TRACE(jh, "allocate memory for buffer");
> > diff --git a/include/linux/jbd.h b/include/linux/jbd.h
> > index c8f3297..1c36b0c 100644
> > --- a/include/linux/jbd.h
> > +++ b/include/linux/jbd.h
> > @@ -244,6 +244,31 @@ typedef struct journal_superblock_s
> >  
> >  #include <linux/fs.h>
> >  #include <linux/sched.h>
> > +
> > +enum jbd_state_bits {
> > +	BH_JBD			/* Has an attached ext3 journal_head */
> > +	  = BH_PrivateStart,
> > +	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
> > +	BH_Freed,		/* Has been freed (truncated) */
> > +	BH_Revoked,		/* Has been revoked from the log */
> > +	BH_RevokeValid,		/* Revoked flag is valid */
> > +	BH_JBDDirty,		/* Is dirty but journaled */
> > +	BH_State,		/* Pins most journal_head state */
> > +	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
> > +	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
> > +	BH_JBDPrivateStart,	/* First bit available for private use by FS */
> > +};
> > +
> > +BUFFER_FNS(JBD, jbd)
> > +BUFFER_FNS(JWrite, jwrite)
> > +BUFFER_FNS(JBDDirty, jbddirty)
> > +TAS_BUFFER_FNS(JBDDirty, jbddirty)
> > +BUFFER_FNS(Revoked, revoked)
> > +TAS_BUFFER_FNS(Revoked, revoked)
> > +BUFFER_FNS(RevokeValid, revokevalid)
> > +TAS_BUFFER_FNS(RevokeValid, revokevalid)
> > +BUFFER_FNS(Freed, freed)
> > +
> >  #include <linux/jbd_common.h>
> >  
> >  #define J_ASSERT(assert)	BUG_ON(!(assert))
> > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> > index 4584518..be5115f 100644
> > --- a/include/linux/jbd2.h
> > +++ b/include/linux/jbd2.h
> > @@ -302,6 +302,34 @@ typedef struct journal_superblock_s
> >  
> >  #include <linux/fs.h>
> >  #include <linux/sched.h>
> > +
> > +enum jbd_state_bits {
> > +	BH_JBD			/* Has an attached ext3 journal_head */
> > +	  = BH_PrivateStart,
> > +	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
> > +	BH_Freed,		/* Has been freed (truncated) */
> > +	BH_Revoked,		/* Has been revoked from the log */
> > +	BH_RevokeValid,		/* Revoked flag is valid */
> > +	BH_JBDDirty,		/* Is dirty but journaled */
> > +	BH_State,		/* Pins most journal_head state */
> > +	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
> > +	BH_Shadow,		/* IO on shadow buffer is running */
> > +	BH_Verified,		/* Metadata block has been verified ok */
> > +	BH_JBDPrivateStart,	/* First bit available for private use by FS */
> > +};
> > +
> > +BUFFER_FNS(JBD, jbd)
> > +BUFFER_FNS(JWrite, jwrite)
> > +BUFFER_FNS(JBDDirty, jbddirty)
> > +TAS_BUFFER_FNS(JBDDirty, jbddirty)
> > +BUFFER_FNS(Revoked, revoked)
> > +TAS_BUFFER_FNS(Revoked, revoked)
> > +BUFFER_FNS(RevokeValid, revokevalid)
> > +TAS_BUFFER_FNS(RevokeValid, revokevalid)
> > +BUFFER_FNS(Freed, freed)
> > +BUFFER_FNS(Shadow, shadow)
> > +BUFFER_FNS(Verified, verified)
> > +
> >  #include <linux/jbd_common.h>
> >  
> >  #define J_ASSERT(assert)	BUG_ON(!(assert))
> > diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
> > index 6133679..b1f7089 100644
> > --- a/include/linux/jbd_common.h
> > +++ b/include/linux/jbd_common.h
> > @@ -1,32 +1,6 @@
> >  #ifndef _LINUX_JBD_STATE_H
> >  #define _LINUX_JBD_STATE_H
> >  
> > -enum jbd_state_bits {
> > -	BH_JBD			/* Has an attached ext3 journal_head */
> > -	  = BH_PrivateStart,
> > -	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
> > -	BH_Freed,		/* Has been freed (truncated) */
> > -	BH_Revoked,		/* Has been revoked from the log */
> > -	BH_RevokeValid,		/* Revoked flag is valid */
> > -	BH_JBDDirty,		/* Is dirty but journaled */
> > -	BH_State,		/* Pins most journal_head state */
> > -	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
> > -	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
> > -	BH_Verified,		/* Metadata block has been verified ok */
> > -	BH_JBDPrivateStart,	/* First bit available for private use by FS */
> > -};
> > -
> > -BUFFER_FNS(JBD, jbd)
> > -BUFFER_FNS(JWrite, jwrite)
> > -BUFFER_FNS(JBDDirty, jbddirty)
> > -TAS_BUFFER_FNS(JBDDirty, jbddirty)
> > -BUFFER_FNS(Revoked, revoked)
> > -TAS_BUFFER_FNS(Revoked, revoked)
> > -BUFFER_FNS(RevokeValid, revokevalid)
> > -TAS_BUFFER_FNS(RevokeValid, revokevalid)
> > -BUFFER_FNS(Freed, freed)
> > -BUFFER_FNS(Verified, verified)
> > -
> >  static inline struct buffer_head *jh2bh(struct journal_head *jh)
> >  {
> >  	return jh->b_bh;
> > -- 
> > 1.7.1
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1a03762..4863f5b 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@ 
 #include <trace/events/jbd2.h>
 
 /*
- * Default IO end handler for temporary BJ_IO buffer_heads.
+ * IO end handler for temporary buffer_heads handling writes to the journal.
  */
 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 {
+	struct buffer_head *orig_bh = bh->b_private;
+
 	BUFFER_TRACE(bh, "");
 	if (uptodate)
 		set_buffer_uptodate(bh);
 	else
 		clear_buffer_uptodate(bh);
+	if (orig_bh) {
+		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+		smp_mb__after_clear_bit();
+		wake_up_bit(&orig_bh->b_state, BH_Shadow);
+	}
 	unlock_buffer(bh);
 }
 
@@ -818,7 +825,7 @@  start_journal_io:
 		jbd2_unfile_log_bh(bh);
 
 		/*
-		 * The list contains temporary buffer heads created by
+		 * The list contains temporary buffer heas created by
 		 * jbd2_journal_write_metadata_buffer().
 		 */
 		BUFFER_TRACE(bh, "dumping temporary bh");
@@ -831,6 +838,7 @@  start_journal_io:
 		bh = jh2bh(jh);
 		clear_buffer_jwrite(bh);
 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
+		J_ASSERT_BH(bh, !buffer_shadow(bh));
 
 		/* The metadata is now released for reuse, but we need
                    to remember it against this transaction so that when
@@ -838,14 +846,6 @@  start_journal_io:
                    required. */
 		JBUFFER_TRACE(jh, "file as BJ_Forget");
 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
-		/*
-		 * Wake up any transactions which were waiting for this IO to
-		 * complete. The barrier must be here so that changes by
-		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
-		 * does the waitqueue check.
-		 */
-		smp_mb();
-		wake_up_bit(&bh->b_state, BH_Unshadow);
 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
 		__brelse(bh);
 	}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e03aae0..e9a9cdb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -453,6 +453,7 @@  repeat:
 	new_bh->b_size = bh_in->b_size;
 	new_bh->b_bdev = journal->j_dev;
 	new_bh->b_blocknr = blocknr;
+	new_bh->b_private = bh_in;
 	set_buffer_mapped(new_bh);
 	set_buffer_dirty(new_bh);
 
@@ -467,6 +468,7 @@  repeat:
 	spin_lock(&journal->j_list_lock);
 	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
 	spin_unlock(&journal->j_list_lock);
+	set_buffer_shadow(bh_in);
 	jbd_unlock_bh_state(bh_in);
 
 	return do_escape | (done_copy_out << 1);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bc35899..81df09c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -620,6 +620,12 @@  static void warn_dirty_buffer(struct buffer_head *bh)
 	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
+static int sleep_on_shadow_bh(void *word)
+{
+	io_schedule();
+	return 0;
+}
+
 /*
  * If the buffer is already part of the current transaction, then there
  * is nothing we need to do.  If it is already part of a prior
@@ -747,41 +753,29 @@  repeat:
 		 * journaled.  If the primary copy is already going to
 		 * disk then we cannot do copy-out here. */
 
-		if (jh->b_jlist == BJ_Shadow) {
-			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
-			wait_queue_head_t *wqh;
-
-			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
+		if (buffer_shadow(bh)) {
 			JBUFFER_TRACE(jh, "on shadow: sleep");
 			jbd_unlock_bh_state(bh);
-			/* commit wakes up all shadow buffers after IO */
-			for ( ; ; ) {
-				prepare_to_wait(wqh, &wait.wait,
-						TASK_UNINTERRUPTIBLE);
-				if (jh->b_jlist != BJ_Shadow)
-					break;
-				schedule();
-			}
-			finish_wait(wqh, &wait.wait);
+			wait_on_bit(&bh->b_state, BH_Shadow,
+				    sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
 			goto repeat;
 		}
 
-		/* Only do the copy if the currently-owning transaction
-		 * still needs it.  If it is on the Forget list, the
-		 * committing transaction is past that stage.  The
-		 * buffer had better remain locked during the kmalloc,
-		 * but that should be true --- we hold the journal lock
-		 * still and the buffer is already on the BUF_JOURNAL
-		 * list so won't be flushed.
+		/*
+		 * Only do the copy if the currently-owning transaction still
+		 * needs it. If buffer isn't on BJ_Metadata list, the
+		 * committing transaction is past that stage (here we use the
+		 * fact that BH_Shadow is set under bh_state lock together with
+		 * refiling to BJ_Shadow list and at this point we know the
+		 * buffer doesn't have BH_Shadow set).
 		 *
 		 * Subtle point, though: if this is a get_undo_access,
 		 * then we will be relying on the frozen_data to contain
 		 * the new value of the committed_data record after the
 		 * transaction, so we HAVE to force the frozen_data copy
-		 * in that case. */
-
-		if (jh->b_jlist != BJ_Forget || force_copy) {
+		 * in that case.
+		 */
+		if (jh->b_jlist == BJ_Metadata || force_copy) {
 			JBUFFER_TRACE(jh, "generate frozen data");
 			if (!frozen_buffer) {
 				JBUFFER_TRACE(jh, "allocate memory for buffer");
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index c8f3297..1c36b0c 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -244,6 +244,31 @@  typedef struct journal_superblock_s
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+
+enum jbd_state_bits {
+	BH_JBD			/* Has an attached ext3 journal_head */
+	  = BH_PrivateStart,
+	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
+	BH_Freed,		/* Has been freed (truncated) */
+	BH_Revoked,		/* Has been revoked from the log */
+	BH_RevokeValid,		/* Revoked flag is valid */
+	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
+	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
+	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
+	BH_JBDPrivateStart,	/* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+
 #include <linux/jbd_common.h>
 
 #define J_ASSERT(assert)	BUG_ON(!(assert))
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 4584518..be5115f 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -302,6 +302,34 @@  typedef struct journal_superblock_s
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+
+enum jbd_state_bits {
+	BH_JBD			/* Has an attached ext3 journal_head */
+	  = BH_PrivateStart,
+	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
+	BH_Freed,		/* Has been freed (truncated) */
+	BH_Revoked,		/* Has been revoked from the log */
+	BH_RevokeValid,		/* Revoked flag is valid */
+	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
+	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
+	BH_Shadow,		/* IO on shadow buffer is running */
+	BH_Verified,		/* Metadata block has been verified ok */
+	BH_JBDPrivateStart,	/* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+BUFFER_FNS(Shadow, shadow)
+BUFFER_FNS(Verified, verified)
+
 #include <linux/jbd_common.h>
 
 #define J_ASSERT(assert)	BUG_ON(!(assert))
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
index 6133679..b1f7089 100644
--- a/include/linux/jbd_common.h
+++ b/include/linux/jbd_common.h
@@ -1,32 +1,6 @@ 
 #ifndef _LINUX_JBD_STATE_H
 #define _LINUX_JBD_STATE_H
 
-enum jbd_state_bits {
-	BH_JBD			/* Has an attached ext3 journal_head */
-	  = BH_PrivateStart,
-	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
-	BH_Freed,		/* Has been freed (truncated) */
-	BH_Revoked,		/* Has been revoked from the log */
-	BH_RevokeValid,		/* Revoked flag is valid */
-	BH_JBDDirty,		/* Is dirty but journaled */
-	BH_State,		/* Pins most journal_head state */
-	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
-	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
-	BH_Verified,		/* Metadata block has been verified ok */
-	BH_JBDPrivateStart,	/* First bit available for private use by FS */
-};
-
-BUFFER_FNS(JBD, jbd)
-BUFFER_FNS(JWrite, jwrite)
-BUFFER_FNS(JBDDirty, jbddirty)
-TAS_BUFFER_FNS(JBDDirty, jbddirty)
-BUFFER_FNS(Revoked, revoked)
-TAS_BUFFER_FNS(Revoked, revoked)
-BUFFER_FNS(RevokeValid, revokevalid)
-TAS_BUFFER_FNS(RevokeValid, revokevalid)
-BUFFER_FNS(Freed, freed)
-BUFFER_FNS(Verified, verified)
-
 static inline struct buffer_head *jh2bh(struct journal_head *jh)
 {
 	return jh->b_bh;