diff mbox series

[v2,05/12] jbd2: fast-commit commit path new APIs

Message ID 20190809034552.148629-6-harshadshirwadkar@gmail.com
State Superseded
Headers show
Series ext4: add support fast commit | expand

Commit Message

harshad shirwadkar Aug. 9, 2019, 3:45 a.m. UTC
This patch adds new helper APIs that ext4 needs for fast commits.
Subsequent patches in this series use these APIs to implement fast
commits. The following new APIs are added:

/*
 * Returns when either a full commit or a fast commit
 * completes
 */
int jbd2_fc_complete_commit(journal_t *journal, tid_t tid,
			    tid_t subtid)

/* Send all the data buffers related to an inode */
int jbd2_submit_inode_data(journal_t *journal,
			   struct jbd2_inode *jinode)

/* Map one fast commit buffer for use by the file system */
int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out)

/* Wait on fast commit buffers to complete IO */
int jbd2_wait_on_fc_bufs(journal_t *journal, int num_blks)

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>

---

Changelog:

V2: 1) Fixed error reported by kbuild test robot. Removed duplicate
       EXPORT_SYMBOL() call. Also, added EXPORT_SYMBOL() for the new
       APIs introduced.
    2) Changed jbd2_submit_fc_bufs() to jbd2_wait_on_fc_bufs(). This
       lets the client file system submit the JBD2 buffers at its own
       convenience.
---
 fs/jbd2/commit.c     | 32 +++++++++++++++
 fs/jbd2/journal.c    | 98 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/jbd2.h |  6 +++
 3 files changed, 136 insertions(+)

Comments

Andreas Dilger Aug. 9, 2019, 8:38 p.m. UTC | #1
On Aug 8, 2019, at 9:45 PM, Harshad Shirwadkar <harshadshirwadkar@gmail.com> wrote:
> 
> This patch adds new helper APIs that ext4 needs for fast
> commits. These new fast commit APIs are used by subsequent fast commit
> patches to implement fast commits. Following new APIs are added:
> 
> /*
> * Returns when either a full commit or a fast commit
> * completes
> */
> int jbd2_fc_complete_commit(journal_tc *journal, tid_t tid,
> 			    tid_t tid, tid_t subtid)
> 
> /* Send all the data buffers related to an inode */
> int journal_submit_inode_data(journal_t *journal,
> 			      struct jbd2_inode *jinode)
> 
> /* Map one fast commit buffer for use by the file system */
> int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out)
> 
> /* Wait on fast commit buffers to complete IO */
> jbd2_wait_on_fc_bufs(journal_t *journal, int num_bufs)
> 
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>

Reviewed-by: Andreas Dilger <adilger@dilger.ca>

> ---
> 
> Changelog:
> 
> V2: 1) Fixed error reported by kbuild test robot. Removed duplicate
>       EXPORT_SYMBOL() call. Also, added EXPORT_SYMBOL() for the new
>       APIs introduced.
>    2) Changed jbd2_submit_fc_bufs() to jbd2_wait_on_fc_bufs(). This
>       gives client file system to submit JBD2 buffers according to
>       its own convenience.
> ---
> fs/jbd2/commit.c     | 32 +++++++++++++++
> fs/jbd2/journal.c    | 98 ++++++++++++++++++++++++++++++++++++++++++++
> include/linux/jbd2.h |  6 +++
> 3 files changed, 136 insertions(+)
> 
> diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> index 9281814606e7..db62a53436e3 100644
> --- a/fs/jbd2/commit.c
> +++ b/fs/jbd2/commit.c
> @@ -202,6 +202,38 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping,
> 	return ret;
> }
> 
> +int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
> +{
> +	struct address_space *mapping;
> +	loff_t dirty_start = jinode->i_dirty_start;
> +	loff_t dirty_end = jinode->i_dirty_end;
> +	int ret;
> +
> +	if (!jinode)
> +		return 0;
> +
> +	if (!(jinode->i_flags & JI_WRITE_DATA))
> +		return 0;
> +
> +	dirty_start = jinode->i_dirty_start;
> +	dirty_end = jinode->i_dirty_end;
> +
> +	mapping = jinode->i_vfs_inode->i_mapping;
> +	jinode->i_flags |= JI_COMMIT_RUNNING;
> +
> +	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
> +	ret = journal_submit_inode_data_buffers(mapping, dirty_start,
> +						dirty_end);
> +
> +	jinode->i_flags &= ~JI_COMMIT_RUNNING;
> +	/* Protect JI_COMMIT_RUNNING flag */
> +	smp_mb();
> +	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(jbd2_submit_inode_data);
> +
> /*
>  * Submit all the data buffers of inode associated with the transaction to
>  * disk.
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index ab05e47ed2d4..1e15804b2c3c 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -811,6 +811,33 @@ int jbd2_complete_transaction(journal_t *journal, tid_t tid)
> }
> EXPORT_SYMBOL(jbd2_complete_transaction);
> 
> +int jbd2_fc_complete_commit(journal_t *journal, tid_t tid, tid_t subtid)
> +{
> +	int	need_to_wait = 1;
> +
> +	read_lock(&journal->j_state_lock);
> +	if (journal->j_running_transaction &&
> +	    journal->j_running_transaction->t_tid == tid) {
> +		/* Check if fast commit was already done */
> +		if (journal->j_subtid > subtid)
> +			need_to_wait = 0;
> +		if (journal->j_commit_request != tid) {
> +			/* transaction not yet started, so request it */
> +			read_unlock(&journal->j_state_lock);
> +			jbd2_log_start_commit(journal, tid, false);
> +			goto wait_commit;
> +		}
> +	} else if (!(journal->j_committing_transaction &&
> +		     journal->j_committing_transaction->t_tid == tid))
> +		need_to_wait = 0;
> +	read_unlock(&journal->j_state_lock);
> +	if (!need_to_wait)
> +		return 0;
> +wait_commit:
> +	return __jbd2_log_wait_commit(journal, tid, subtid);
> +}
> +EXPORT_SYMBOL(jbd2_fc_complete_commit);
> +
> /*
>  * Log buffer allocation routines:
>  */
> @@ -831,6 +858,77 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
> 	return jbd2_journal_bmap(journal, blocknr, retp);
> }
> 
> +int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out)
> +{
> +	unsigned long long pblock;
> +	unsigned long blocknr;
> +	int ret = 0;
> +	struct buffer_head *bh;
> +	int fc_off;
> +	journal_header_t *jhdr;
> +
> +	write_lock(&journal->j_state_lock);
> +
> +	if (journal->j_fc_off + journal->j_first_fc < journal->j_last_fc) {
> +		fc_off = journal->j_fc_off;
> +		blocknr = journal->j_first_fc + fc_off;
> +		journal->j_fc_off++;
> +	} else {
> +		ret = -EINVAL;
> +	}
> +	write_unlock(&journal->j_state_lock);
> +
> +	if (ret)
> +		return ret;
> +
> +	ret = jbd2_journal_bmap(journal, blocknr, &pblock);
> +	if (ret)
> +		return ret;
> +
> +	bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
> +	if (!bh)
> +		return -ENOMEM;
> +
> +	lock_buffer(bh);
> +	jhdr = (journal_header_t *)bh->b_data;
> +	jhdr->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
> +	jhdr->h_blocktype = cpu_to_be32(JBD2_FC_BLOCK);
> +	jhdr->h_sequence = cpu_to_be32(journal->j_running_transaction->t_tid);
> +
> +	set_buffer_uptodate(bh);
> +	unlock_buffer(bh);
> +	journal->j_fc_wbuf[fc_off] = bh;
> +
> +	*bh_out = bh;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(jbd2_map_fc_buf);
> +
> +int jbd2_wait_on_fc_bufs(journal_t *journal, int num_blks)
> +{
> +	struct buffer_head *bh;
> +	int i, j_fc_off;
> +
> +	read_lock(&journal->j_state_lock);
> +	j_fc_off = journal->j_fc_off;
> +	read_unlock(&journal->j_state_lock);
> +
> +	/*
> +	 * Wait in reverse order to minimize chances of us being woken up before
> +	 * all IOs have completed
> +	 */
> +	for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) {
> +		bh = journal->j_fc_wbuf[i];
> +		wait_on_buffer(bh);
> +		if (unlikely(!buffer_uptodate(bh)))
> +			return -EIO;
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(jbd2_wait_on_fc_bufs);
> +
> /*
>  * Conversion of logical to physical block numbers for the journal
>  *
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index 535f88dff653..5362777d06f8 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -124,6 +124,7 @@ typedef struct journal_s	journal_t;	/* Journal control structure */
> #define JBD2_SUPERBLOCK_V1	3
> #define JBD2_SUPERBLOCK_V2	4
> #define JBD2_REVOKE_BLOCK	5
> +#define JBD2_FC_BLOCK		6
> 
> /*
>  * Standard header for all descriptor blocks:
> @@ -1582,6 +1583,7 @@ int jbd2_transaction_committed(journal_t *journal, tid_t tid);
> int jbd2_complete_transaction(journal_t *journal, tid_t tid);
> int jbd2_log_do_checkpoint(journal_t *journal);
> int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
> +int jbd2_fc_complete_commit(journal_t *journal, tid_t tid, tid_t subtid);
> 
> void __jbd2_log_wait_for_space(journal_t *journal);
> extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
> @@ -1732,6 +1734,10 @@ static inline tid_t  jbd2_get_latest_transaction(journal_t *journal)
> 	return tid;
> }
> 
> +int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out);
> +int jbd2_wait_on_fc_bufs(journal_t *journal, int num_blks);
> +int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode);
> +
> #ifdef __KERNEL__
> 
> #define buffer_trace_init(bh)	do {} while (0)
> --
> 2.23.0.rc1.153.gdeed80330f-goog
> 


Cheers, Andreas
Andreas Dilger Aug. 9, 2019, 9:11 p.m. UTC | #2
> On Aug 9, 2019, at 2:38 PM, Andreas Dilger <adilger@dilger.ca> wrote:
> 
> On Aug 8, 2019, at 9:45 PM, Harshad Shirwadkar <harshadshirwadkar@gmail.com> wrote:
>> 
>> This patch adds new helper APIs that ext4 needs for fast
>> commits. These new fast commit APIs are used by subsequent fast commit
>> patches to implement fast commits. Following new APIs are added:
>> 
>> /*
>> * Returns when either a full commit or a fast commit
>> * completes
>> */
>> int jbd2_fc_complete_commit(journal_tc *journal, tid_t tid,
>> 			    tid_t tid, tid_t subtid)
>> 
>> /* Send all the data buffers related to an inode */
>> int journal_submit_inode_data(journal_t *journal,
>> 			      struct jbd2_inode *jinode)
>> 
>> /* Map one fast commit buffer for use by the file system */
>> int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out)
>> 
>> /* Wait on fast commit buffers to complete IO */
>> jbd2_wait_on_fc_bufs(journal_t *journal, int num_bufs)
>> 
>> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
>> 
>> +int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out)
>> +{
>> +	unsigned long long pblock;
>> +	unsigned long blocknr;
>> +	int ret = 0;
>> +	struct buffer_head *bh;
>> +	int fc_off;
>> +	journal_header_t *jhdr;
>> +
>> +	write_lock(&journal->j_state_lock);
>> +
>> +	if (journal->j_fc_off + journal->j_first_fc < journal->j_last_fc) {
>> +		fc_off = journal->j_fc_off;
>> +		blocknr = journal->j_first_fc + fc_off;
>> +		journal->j_fc_off++;
>> +	} else {
>> +		ret = -EINVAL;
>> +	}
>> +	write_unlock(&journal->j_state_lock);
>> +
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = jbd2_journal_bmap(journal, blocknr, &pblock);
>> +	if (ret)
>> +		return ret;
>> +
>> +	bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
>> +	if (!bh)
>> +		return -ENOMEM;
>> +
>> +	lock_buffer(bh);
>> +	jhdr = (journal_header_t *)bh->b_data;
>> +	jhdr->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
>> +	jhdr->h_blocktype = cpu_to_be32(JBD2_FC_BLOCK);
>> +	jhdr->h_sequence = cpu_to_be32(journal->j_running_transaction->t_tid);
>> +
>> +	set_buffer_uptodate(bh);
>> +	unlock_buffer(bh);
>> +	journal->j_fc_wbuf[fc_off] = bh;
>> +
>> +	*bh_out = bh;
>> +
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL(jbd2_map_fc_buf);

One question about this function.  It seems that it is called for every
commit by ext4_journal_fc_commit_cb().  Why does it need to map the fast
journal commit blocks on every call?  It would make more sense to map the
blocks once at initialization time and then just re-use them on each call.

Cheers, Andreas
harshad shirwadkar Aug. 9, 2019, 9:20 p.m. UTC | #3
On Fri, Aug 9, 2019 at 2:11 PM Andreas Dilger <adilger@dilger.ca> wrote:
>
>
> > On Aug 9, 2019, at 2:38 PM, Andreas Dilger <adilger@dilger.ca> wrote:
> >
> > On Aug 8, 2019, at 9:45 PM, Harshad Shirwadkar <harshadshirwadkar@gmail.com> wrote:
> >>
> >> This patch adds new helper APIs that ext4 needs for fast
> >> commits. These new fast commit APIs are used by subsequent fast commit
> >> patches to implement fast commits. Following new APIs are added:
> >>
> >> /*
> >> * Returns when either a full commit or a fast commit
> >> * completes
> >> */
> >> int jbd2_fc_complete_commit(journal_tc *journal, tid_t tid,
> >>                          tid_t tid, tid_t subtid)
> >>
> >> /* Send all the data buffers related to an inode */
> >> int journal_submit_inode_data(journal_t *journal,
> >>                            struct jbd2_inode *jinode)
> >>
> >> /* Map one fast commit buffer for use by the file system */
> >> int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out)
> >>
> >> /* Wait on fast commit buffers to complete IO */
> >> jbd2_wait_on_fc_bufs(journal_t *journal, int num_bufs)
> >>
> >> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> >>
> >> +int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out)
> >> +{
> >> +    unsigned long long pblock;
> >> +    unsigned long blocknr;
> >> +    int ret = 0;
> >> +    struct buffer_head *bh;
> >> +    int fc_off;
> >> +    journal_header_t *jhdr;
> >> +
> >> +    write_lock(&journal->j_state_lock);
> >> +
> >> +    if (journal->j_fc_off + journal->j_first_fc < journal->j_last_fc) {
> >> +            fc_off = journal->j_fc_off;
> >> +            blocknr = journal->j_first_fc + fc_off;
> >> +            journal->j_fc_off++;
> >> +    } else {
> >> +            ret = -EINVAL;
> >> +    }
> >> +    write_unlock(&journal->j_state_lock);
> >> +
> >> +    if (ret)
> >> +            return ret;
> >> +
> >> +    ret = jbd2_journal_bmap(journal, blocknr, &pblock);
> >> +    if (ret)
> >> +            return ret;
> >> +
> >> +    bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
> >> +    if (!bh)
> >> +            return -ENOMEM;
> >> +
> >> +    lock_buffer(bh);
> >> +    jhdr = (journal_header_t *)bh->b_data;
> >> +    jhdr->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
> >> +    jhdr->h_blocktype = cpu_to_be32(JBD2_FC_BLOCK);
> >> +    jhdr->h_sequence = cpu_to_be32(journal->j_running_transaction->t_tid);
> >> +
> >> +    set_buffer_uptodate(bh);
> >> +    unlock_buffer(bh);
> >> +    journal->j_fc_wbuf[fc_off] = bh;
> >> +
> >> +    *bh_out = bh;
> >> +
> >> +    return 0;
> >> +}
> >> +EXPORT_SYMBOL(jbd2_map_fc_buf);
>
> One question about this function.  It seems that it is called for every
> commit by ext4_journal_fc_commit_cb().  Why does it need to map the fast
> journal commit blocks on every call?  It would make more sense to map the
> blocks once at initialization time and then just re-use them on each call.
>

The only reason I did it this way is that it gives JBD2 an
opportunity to set up the journal header, which contains the TID
information, at the beginning of the block. But I guess we could have
a separate call for setting the journal header, and ext4 could call
that routine instead of mapping buffers on every commit call. Thanks
for pointing this out. I'll fix this in V3.

> Cheers, Andreas
>
>
>
>
>
Theodore Ts'o Aug. 12, 2019, 4:04 p.m. UTC | #4
On Thu, Aug 08, 2019 at 08:45:45PM -0700, Harshad Shirwadkar wrote:
> This patch adds new helper APIs that ext4 needs for fast
> commits. These new fast commit APIs are used by subsequent fast commit
> patches to implement fast commits. Following new APIs are added:
> 
> /*
>  * Returns when either a full commit or a fast commit
>  * completes
>  */
> int jbd2_fc_complete_commit(journal_tc *journal, tid_t tid,
> 			    tid_t tid, tid_t subtid)

I think there is an opportunity to do something more efficient.

Right now, the ext4_fsync() calls this function, and the file system
can only do a "fast commit" if all of the modifications made to the
file system to date are "fast commit eligible".  Otherwise, we have to
fall back to a normal, slow commit.

We can make this decision on a much more granular level.  Suppose that
so far during the life of the current transaction, inodes A, B, and C
have been modified.  The modification to inode A is not fast commit
eligible (maybe the inode is deleted, or it is involved in a directory
rename, etc.).  The modification to inode B is fast commit eligible,
but an fsync was not requested for it.  And the modification to inode
C *is* fast commit eligible, *and* fsync() has been requested for it.

We only need to write the information for inode C to the fast commit
area.  The fact that inode A is not fast commit eligible isn't a
problem.  It will get committed when the normal transaction closes,
perhaps when the 5 second commit transaction timer expires.  And inode
B, even though its changes might be fast commit eligible, might
require writing a large number of data blocks if it were included in
the fast commit.  So excluding inodes A and B from the fast commit,
and only writing the logical changes corresponding to those made
to inode C, will allow a fast commit to take place.

In order to do that, though, ext4's fast commit machinery needs to
know which inode we actually need to do the fast commit for.  And so
for that reason, it's actually probably better not to run the changes
through the commit thread.  That makes it harder to plumb the file
system specific information through, and it also requires waking up
the commit thread and waiting for it to get scheduled.

Instead, ext4_fsync() could just call the fast commit machinery, and
the only thing we need to expose is a way for the fast commit
machinery to attempt to grab a mutex preventing the normal commit
thread from starting a normal commit.  If it loses the race, and the
normal commit takes place before we manage to do the fast commit, then
we don't need to do anything more.  Otherwise the fast commit
machinery can do its thing, writing inode changes to the journal, and
once it is done, it can release the mutex and ext4 fsync can return.

Does that make sense?

					- Ted
harshad shirwadkar Aug. 12, 2019, 5:41 p.m. UTC | #5
Thanks Andreas and Ted for the review.

Yeah, this makes sense.

On Mon, Aug 12, 2019 at 9:04 AM Theodore Y. Ts'o <tytso@mit.edu> wrote:
>
> On Thu, Aug 08, 2019 at 08:45:45PM -0700, Harshad Shirwadkar wrote:
> > This patch adds new helper APIs that ext4 needs for fast
> > commits. These new fast commit APIs are used by subsequent fast commit
> > patches to implement fast commits. Following new APIs are added:
> >
> > /*
> >  * Returns when either a full commit or a fast commit
> >  * completes
> >  */
> > int jbd2_fc_complete_commit(journal_tc *journal, tid_t tid,
> >                           tid_t tid, tid_t subtid)
>
> I think there is an opportunity to do something more efficient.
>
> Right now, the ext4_fsync() calls this function, and the file system
> can only do a "fast commit" if all of the modifications made to the
> file system to date are "fast commit eligible".  Otherwise, we have to
> fall back to a normal, slow commit.
>
> We can make this decision on a much more granular level.  Suppose that
> so far during the life of the current transaction, inodes A, B, and C
> have been modified.  The modification to inode A is not fast commit
> eligible (maybe the inode is deleted, or it is involved in a directory
> rename, etc.).  The modification to inode B is fast commit eligible,
> but an fsync was not requested for it.  And the modification to inode
> C *is* fast commit eligble, *and* fsync() has been requested for it.
>
> We only need to write the information for inode C to the fast commit
> area.  The fact that inode A is not fast commit eligible isn't a
> problem.  It will get committed when the normal transaction closes,
> perhaps when the 5 second commit transaction timer expires.  And inode
> B, even though its changes might be fast commit eligible, might
> require writing a large number of data blocks if it were included in
> the fast commit.  So excluding inodes A and B from the fast commit,
> and only writing the logical changes corresponding to the those made
> to inode C, will allow a fast commit to take place.
>
> In order to do that, though, the ext4's fast commit machinery needs to
> know which inode we actually need to do the fast commit for.  And so
> for that reason, it's actually probably better not to run the changes
> through the commit thread.  That makes it harder to plumb the file
> system specific information through, and it also requires waking up
> the commit thread and waiting for it to get scheduled.
I see, so you mean each fsync() call will result in exactly one inode
being committed (the inode on which fsync was called), right? I agree
this doesn't need to go through JBD2, but we need a mechanism to inform
JBD2 about this fast commit, since JBD2 maintains the sub-transaction ID.
JBD2 will in turn need to make sure that a subtid is allocated for
such a fast commit, and that it is incremented once the fast commit
succeeds.
>
> Instead, ext4_fsync() could just call the fast commit machinery, and
> the only thing we need to expose is a way for the fast commit
> machinery to attempt to grab a mutex preventing the normal commit
> thread from starting a normal commit.  If it loses the race, and the
> normal commit takes place before we manage to do the fast commit; then
> we don't need to do any thing more.  Otherwise the fast commit
> machinery can do its thing, writing inode changes to the journal, and
> once it is done, it can release the mutex and ext4 fsync can return.
>
> Does that make sense?
Thanks for the suggestion, I will implement this in V3.
>
>                                         - Ted
Theodore Ts'o Aug. 12, 2019, 6:01 p.m. UTC | #6
On Mon, Aug 12, 2019 at 10:41:48AM -0700, harshad shirwadkar wrote:
> I see, so you mean each fsync() call will result in exactly one inode
> to be committed (the inode on which fsync was called), right? I agree
> this doesn't need to go through JBD2 but we need a mechanism to inform
> JBD2 about this fast commit since JBD2 maintains sub-transaction ID.
> JBD2 will in turn need to make sure that a subtid was allocated for
> such a fast commit and it was incremented once the fast commit was
> successful as well.

Why does JBD2 need to maintain the sub-transaction ID?  We can only
have a single fast commit happening at a time, and while a fast commit
is happening we can't allow a full commit to happen (or vice
versa).  So we need a mutex which enforces this; the transaction id
can just be a field in the transaction structure.

Cheers,

					- Ted
diff mbox series

Patch

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 9281814606e7..db62a53436e3 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -202,6 +202,38 @@  static int journal_submit_inode_data_buffers(struct address_space *mapping,
 	return ret;
 }
 
+int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
+{
+	struct address_space *mapping;
+	loff_t dirty_start = jinode->i_dirty_start;
+	loff_t dirty_end = jinode->i_dirty_end;
+	int ret;
+
+	if (!jinode)
+		return 0;
+
+	if (!(jinode->i_flags & JI_WRITE_DATA))
+		return 0;
+
+	dirty_start = jinode->i_dirty_start;
+	dirty_end = jinode->i_dirty_end;
+
+	mapping = jinode->i_vfs_inode->i_mapping;
+	jinode->i_flags |= JI_COMMIT_RUNNING;
+
+	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
+	ret = journal_submit_inode_data_buffers(mapping, dirty_start,
+						dirty_end);
+
+	jinode->i_flags &= ~JI_COMMIT_RUNNING;
+	/* Protect JI_COMMIT_RUNNING flag */
+	smp_mb();
+	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_submit_inode_data);
+
 /*
  * Submit all the data buffers of inode associated with the transaction to
  * disk.
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ab05e47ed2d4..1e15804b2c3c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -811,6 +811,33 @@  int jbd2_complete_transaction(journal_t *journal, tid_t tid)
 }
 EXPORT_SYMBOL(jbd2_complete_transaction);
 
+int jbd2_fc_complete_commit(journal_t *journal, tid_t tid, tid_t subtid)
+{
+	int	need_to_wait = 1;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == tid) {
+		/* Check if fast commit was already done */
+		if (journal->j_subtid > subtid)
+			need_to_wait = 0;
+		if (journal->j_commit_request != tid) {
+			/* transaction not yet started, so request it */
+			read_unlock(&journal->j_state_lock);
+			jbd2_log_start_commit(journal, tid, false);
+			goto wait_commit;
+		}
+	} else if (!(journal->j_committing_transaction &&
+		     journal->j_committing_transaction->t_tid == tid))
+		need_to_wait = 0;
+	read_unlock(&journal->j_state_lock);
+	if (!need_to_wait)
+		return 0;
+wait_commit:
+	return __jbd2_log_wait_commit(journal, tid, subtid);
+}
+EXPORT_SYMBOL(jbd2_fc_complete_commit);
+
 /*
  * Log buffer allocation routines:
  */
@@ -831,6 +858,77 @@  int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 	return jbd2_journal_bmap(journal, blocknr, retp);
 }
 
+int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out)
+{
+	unsigned long long pblock;
+	unsigned long blocknr;
+	int ret = 0;
+	struct buffer_head *bh;
+	int fc_off;
+	journal_header_t *jhdr;
+
+	write_lock(&journal->j_state_lock);
+
+	if (journal->j_fc_off + journal->j_first_fc < journal->j_last_fc) {
+		fc_off = journal->j_fc_off;
+		blocknr = journal->j_first_fc + fc_off;
+		journal->j_fc_off++;
+	} else {
+		ret = -EINVAL;
+	}
+	write_unlock(&journal->j_state_lock);
+
+	if (ret)
+		return ret;
+
+	ret = jbd2_journal_bmap(journal, blocknr, &pblock);
+	if (ret)
+		return ret;
+
+	bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
+	if (!bh)
+		return -ENOMEM;
+
+	lock_buffer(bh);
+	jhdr = (journal_header_t *)bh->b_data;
+	jhdr->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+	jhdr->h_blocktype = cpu_to_be32(JBD2_FC_BLOCK);
+	jhdr->h_sequence = cpu_to_be32(journal->j_running_transaction->t_tid);
+
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	journal->j_fc_wbuf[fc_off] = bh;
+
+	*bh_out = bh;
+
+	return 0;
+}
+EXPORT_SYMBOL(jbd2_map_fc_buf);
+
+int jbd2_wait_on_fc_bufs(journal_t *journal, int num_blks)
+{
+	struct buffer_head *bh;
+	int i, j_fc_off;
+
+	read_lock(&journal->j_state_lock);
+	j_fc_off = journal->j_fc_off;
+	read_unlock(&journal->j_state_lock);
+
+	/*
+	 * Wait in reverse order to minimize chances of us being woken up before
+	 * all IOs have completed
+	 */
+	for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) {
+		bh = journal->j_fc_wbuf[i];
+		wait_on_buffer(bh);
+		if (unlikely(!buffer_uptodate(bh)))
+			return -EIO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(jbd2_wait_on_fc_bufs);
+
 /*
  * Conversion of logical to physical block numbers for the journal
  *
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 535f88dff653..5362777d06f8 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -124,6 +124,7 @@  typedef struct journal_s	journal_t;	/* Journal control structure */
 #define JBD2_SUPERBLOCK_V1	3
 #define JBD2_SUPERBLOCK_V2	4
 #define JBD2_REVOKE_BLOCK	5
+#define JBD2_FC_BLOCK		6
 
 /*
  * Standard header for all descriptor blocks:
@@ -1582,6 +1583,7 @@  int jbd2_transaction_committed(journal_t *journal, tid_t tid);
 int jbd2_complete_transaction(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
 int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
+int jbd2_fc_complete_commit(journal_t *journal, tid_t tid, tid_t subtid);
 
 void __jbd2_log_wait_for_space(journal_t *journal);
 extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
@@ -1732,6 +1734,10 @@  static inline tid_t  jbd2_get_latest_transaction(journal_t *journal)
 	return tid;
 }
 
+int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out);
+int jbd2_wait_on_fc_bufs(journal_t *journal, int num_blks);
+int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode);
+
 #ifdef __KERNEL__
 
 #define buffer_trace_init(bh)	do {} while (0)