diff mbox series

[RFC,v2,3/5] ext4: data=journal: write-protect pages on submit inode data buffers callback

Message ID 20200810010210.3305322-4-mfo@canonical.com
State Superseded
Headers show
Series ext4/jbd2: data=journal: write-protect pages on transaction commit | expand

Commit Message

Mauricio Faria de Oliveira Aug. 10, 2020, 1:02 a.m. UTC
This implements the journal's j_submit_inode_data_buffers() callback
to write-protect the inode's pages with write_cache_pages(), and use
a writepage callback to redirty pages with buffers that are not part
of the committing transaction or the next transaction.

And set a no-op function as j_finish_inode_data_buffers() callback
(nothing needed other than the write-protect above.)

Currently, the inode is added to the transaction's inode list in the
__ext4_journalled_writepage() function.
---
 fs/ext4/inode.c |  4 +++
 fs/ext4/super.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

Comments

Jan Kara Aug. 19, 2020, 8:44 a.m. UTC | #1
On Sun 09-08-20 22:02:06, Mauricio Faria de Oliveira wrote:
> This implements the journal's j_submit_inode_data_buffers() callback
> to write-protect the inode's pages with write_cache_pages(), and use
> a writepage callback to redirty pages with buffers that are not part
> of the committing transaction or the next transaction.
> 
> And set a no-op function as j_finish_inode_data_buffers() callback
> (nothing needed other than the write-protect above.)
> 
> Currently, the inode is added to the transaction's inode list in the
> __ext4_journalled_writepage() function.
> ---
>  fs/ext4/inode.c |  4 +++
>  fs/ext4/super.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 69 insertions(+)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 10dd470876b3..978ccde8454f 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1911,6 +1911,10 @@ static int __ext4_journalled_writepage(struct page *page,
>  		err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
>  					     write_end_fn);
>  	}
> +	if (ret == 0)
> +		ret = err;
> +	// XXX: is this correct for inline data inodes?

Inodes with inline data should never get here. The thing is that when a
file with inline data is faulted for writing, ext4_page_mkwrite() converts
the file to normal data format. And ordinary write(2) will directly update
the inline data and clear the page dirty bit so writepage isn't called for
it.

> +	err = ext4_jbd2_inode_add_write(handle, inode, 0, len);
>  	if (ret == 0)
>  		ret = err;
>  	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 330957ed1f05..38aaac6572ea 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -472,6 +472,66 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
>  	spin_unlock(&sbi->s_md_lock);
>  }
>  
> +/*
> + * This writepage callback for write_cache_pages()
> + * takes care of a few cases after page cleaning.
> + *
> + * write_cache_pages() already checks for dirty pages
> + * and calls clear_page_dirty_for_io(), which we want,
> + * to write protect the pages.
> + *
> + * However, we have to redirty a page in two cases:
> + * 1) some buffer is not part of the committing transaction
> + * 2) some buffer already has b_next_transaction set
> + */
> +
> +static int ext4_journalled_writepage_callback(struct page *page,
> +					      struct writeback_control *wbc,
> +					      void *data)
> +{
> +	transaction_t *transaction = (transaction_t *) data;
> +	struct buffer_head *bh, *head;
> +	struct journal_head *jh;
> +
> +	// XXX: any chance of !bh here?

No. In ext4, whenever a page is dirty, it should have buffers attached.

> +	bh = head = page_buffers(page);
> +	do {
> +		jh = bh2jh(bh);
> +		if (!jh || jh->b_transaction != transaction ||
> +		    jh->b_next_transaction) {
> +			redirty_page_for_writepage(wbc, page);
> +			goto out;
> +		}
> +	} while ((bh = bh->b_this_page) != head);
> +
> +out:
> +	return AOP_WRITEPAGE_ACTIVATE;
> +}
> +
> +static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
> +{
> +	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
> +	transaction_t *transaction = jinode->i_transaction;
> +	loff_t dirty_start = jinode->i_dirty_start;
> +	loff_t dirty_end = jinode->i_dirty_end;
> +
> +	struct writeback_control wbc = {
> +		.sync_mode =  WB_SYNC_ALL,
> +		.nr_to_write = mapping->nrpages * 2,

For WB_SYNC_ALL writeback, .nr_to_write is mostly ignored so just set it to
~0ULL.

> +		.range_start = dirty_start,
> +		.range_end = dirty_end,
> +        };
> +
> +	return write_cache_pages(mapping, &wbc,
> +				 ext4_journalled_writepage_callback,
> +				 transaction);

I was thinking about this and we may need to do this somewhat differently.
I've realized that there's the slight trouble that we now use page dirty
bit for two purposes in data=journal mode - to track pages that need write
protection during commit and also to track pages which have buffers that
need checkpointing. And this mixing is making things complex. So I was
thinking that we could simply leave PageDirty bit for checkpointing
purposes and always make sure buffers are appropriately attached to a
transaction as dirty in ext4_page_mkwrite(). This will make mmap writes in
data=journal mode somewhat less efficient (all the pages written through
mmap while transaction T was running will be written to the journal during
transaction T commit while currently, we write only pages that also went
through __ext4_journalled_writepage() while T was running which usually
happens less frequently). But the code should be simpler and we don't care
about mmap write performance for data=journal mode much. Furthermore I
don't think that the tricks with PageChecked logic we play in data=journal
mode are really needed as well which should bring further simplifications.
I'll try to code this cleanup.

Then in ext4_journalled_submit_inode_data_buffers() we would need to walk
all the pages in the range describe by jinode and call page_mkclean() for
each page which has buffer attached to the committing transaction.

> +}
> +
> +static int ext4_journalled_finish_inode_data_buffers(struct jbd2_inode *jinode)
> +{
> +	return 0;
> +}
> +
>  static bool system_going_down(void)
>  {
>  	return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
> @@ -4599,6 +4659,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		ext4_msg(sb, KERN_ERR, "can't mount with "
>  			"journal_async_commit in data=ordered mode");
>  		goto failed_mount_wq;
> +	} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
> +		sbi->s_journal->j_submit_inode_data_buffers =
> +			ext4_journalled_submit_inode_data_buffers;
> +		sbi->s_journal->j_finish_inode_data_buffers =
> +			ext4_journalled_finish_inode_data_buffers;

We can journal data only for individual inodes (when inode has journal_data
flag set). So you have to set the callback unconditionally here and only in
the callback decide what to do with the particular inode based on what
ext4_should_journal_data() tells about the inode.

								Honza
Jan Kara Aug. 19, 2020, 10:41 a.m. UTC | #2
On Wed 19-08-20 10:44:21, Jan Kara wrote:
> I was thinking about this and we may need to do this somewhat differently.
> I've realized that there's the slight trouble that we now use page dirty
> bit for two purposes in data=journal mode - to track pages that need write
> protection during commit and also to track pages which have buffers that
> need checkpointing. And this mixing is making things complex. So I was
> thinking that we could simply leave PageDirty bit for checkpointing
> purposes and always make sure buffers are appropriately attached to a
> transaction as dirty in ext4_page_mkwrite(). This will make mmap writes in
> data=journal mode somewhat less efficient (all the pages written through
> mmap while transaction T was running will be written to the journal during
> transaction T commit while currently, we write only pages that also went
> through __ext4_journalled_writepage() while T was running which usually
> happens less frequently). But the code should be simpler and we don't care
> about mmap write performance for data=journal mode much. Furthermore I
> don't think that the tricks with PageChecked logic we play in data=journal
> mode are really needed as well which should bring further simplifications.
> I'll try to code this cleanup.

I was looking more into this but it isn't as simple as I thought because
get_user_pages() users can still modify data and call set_page_dirty() when
the page is no longer writeably mapped. And by the time set_page_dirty() is
called page buffers are not necessarily part of any transaction so we need
to do effectively what's in ext4_journalled_writepage(). To handle this
corner case I didn't find anything considerably simpler than the current
code.

So let's stay with what we have in
ext4_journalled_submit_inode_data_buffers(), we just have to also redirty
the page if we find any dirty buffers.

								Honza
Mauricio Faria de Oliveira Aug. 20, 2020, 10:55 p.m. UTC | #3
Hi Jan,

Thanks a bunch for the detailed comments, including in the cover letter.
Your attention and patience for explanations are really appreciated.

I _think_ I got most of it for the next iteration -- a few follow up questions:

On Wed, Aug 19, 2020 at 7:41 AM Jan Kara <jack@suse.cz> wrote:
>
> On Wed 19-08-20 10:44:21, Jan Kara wrote:
> > I was thinking about this and we may need to do this somewhat differently.
> > I've realized that there's the slight trouble that we now use page dirty
> > bit for two purposes in data=journal mode - to track pages that need write
> > protection during commit and also to track pages which have buffers that
> > need checkpointing. And this mixing is making things complex. So I was
> > thinking that we could simply leave PageDirty bit for checkpointing
> > purposes and always make sure buffers are appropriately attached to a
> > transaction as dirty in ext4_page_mkwrite(). [snip]
> > [snip] Furthermore I
> > don't think that the tricks with PageChecked logic we play in data=journal
> > mode are really needed as well which should bring further simplifications.
> > I'll try to code this cleanup.
>
> I was looking more into this but it isn't as simple as I thought because
> get_user_pages() users can still modify data and call set_page_dirty() when
> the page is no longer writeably mapped. And by the time set_page_dirty() is
> called page buffers are not necessarily part of any transaction so we need
> to do effectively what's in ext4_journalled_writepage(). To handle this
> corner case I didn't find anything considerably simpler than the current
> code.
>
> So let's stay with what we have in
> ext4_journalled_submit_inode_data_buffers(), we just have to also redirty
> the page if we find any dirty buffers.
>

Could you please clarify/comment whether the dirty buffers "flags" are
different between
the suggestions for ext4_page_mkwrite() and
ext4_journalled_submit_inode_data_buffers() ?

I'm asking because..

In ext4_page_mkwrite() the suggestion is to attach buffers as dirty to
a transaction, which I guess can be done with
ext4_walk_page_buffers(..., write_end_fn) after
ext4_walk_page_buffers(..., do_journal_get_write_access) -- just as
done in ext4_journalled_writepage() -- and that sets the buffer as
*jbd* dirty (BH_JBDDirty.)

In ext4_journalled_submit_inode_data_buffers() the suggestion is to
check for dirty buffers to redirty the page
(for the case of buffers that need checkpointing) and I think this is
the non-jbd/just dirty (BH_Dirty.)

If I actually understood your explanation/suggest, the dirty buffer
flags should be different,
as otherwise we'd be unconditionally setting buffers dirty on
ext4_page_mkwrite() to later
check for (known to be) dirty buffers in
ext4_journalled_submit_inode_data_buffers().

...

And as you mentioned no cleanup / keeping ext4_journalled_writepage()
and the PageChecked bit,
I would like to revisit two questions from the cover letter that would
have no impact with the cleanup,
so to confirm my understanding for the next steps.

    > 3) When checking to redirty the page in the writepage callback,
    >    does a buffer without a journal head means we should redirty
    >    the page? (for the reason it's not part of the committing txn)

Per your explanation about the page dirty bit for buffers that need
checkpointing, I see we cannot redirty
the page just because a buffer isn't part of the transaction -- the
buffer has to be dirty -- so I think it falls
down to your suggestion of 'also redirty if we find any dirty buffers'
(regardless of a buffer w/out txns.) right?

    > 4) Should we clear the PageChecked bit?
    ...
    > Should we try to prevent that [ext4_journalled_writepage()
running later] by, say, clearing the pagechecked bit
    > in case we don't have to redirty the page (in the writepage callback) ?

And I think the answer is no, per your explanation about page dirty
being set elsewhere outside of our control,
and thus ext4_journalled_page() still needs to run, and thus the  page
checked bit still needs to remain set; correct?

Thanks again,




>                                                                 Honza
>
> --
> Jan Kara <jack@suse.com>
> SUSE Labs, CR



--
Mauricio Faria de Oliveira
Jan Kara Aug. 21, 2020, 10:26 a.m. UTC | #4
On Thu 20-08-20 19:55:05, Mauricio Faria de Oliveira wrote:
> On Wed, Aug 19, 2020 at 7:41 AM Jan Kara <jack@suse.cz> wrote:
> >
> > On Wed 19-08-20 10:44:21, Jan Kara wrote:
> > > I was thinking about this and we may need to do this somewhat differently.
> > > I've realized that there's the slight trouble that we now use page dirty
> > > bit for two purposes in data=journal mode - to track pages that need write
> > > protection during commit and also to track pages which have buffers that
> > > need checkpointing. And this mixing is making things complex. So I was
> > > thinking that we could simply leave PageDirty bit for checkpointing
> > > purposes and always make sure buffers are appropriately attached to a
> > > transaction as dirty in ext4_page_mkwrite(). [snip]
> > > [snip] Furthermore I
> > > don't think that the tricks with PageChecked logic we play in data=journal
> > > mode are really needed as well which should bring further simplifications.
> > > I'll try to code this cleanup.
> >
> > I was looking more into this but it isn't as simple as I thought because
> > get_user_pages() users can still modify data and call set_page_dirty() when
> > the page is no longer writeably mapped. And by the time set_page_dirty() is
> > called page buffers are not necessarily part of any transaction so we need
> > to do effectively what's in ext4_journalled_writepage(). To handle this
> > corner case I didn't find anything considerably simpler than the current
> > code.
> >
> > So let's stay with what we have in
> > ext4_journalled_submit_inode_data_buffers(), we just have to also redirty
> > the page if we find any dirty buffers.
> >
> 
> Could you please clarify/comment whether the dirty buffers "flags" are
> different between the suggestions for ext4_page_mkwrite() and
> ext4_journalled_submit_inode_data_buffers() ?
> 
> I'm asking because..
> 
> In ext4_page_mkwrite() the suggestion is to attach buffers as dirty to
> a transaction, which I guess can be done with
> ext4_walk_page_buffers(..., write_end_fn) after
> ext4_walk_page_buffers(..., do_journal_get_write_access) -- just as
> done in ext4_journalled_writepage() -- and that sets the buffer as
> *jbd* dirty (BH_JBDDirty.)

Correct.

> In ext4_journalled_submit_inode_data_buffers() the suggestion is to
> check for dirty buffers to redirty the page
> (for the case of buffers that need checkpointing) and I think this is
> the non-jbd/just dirty (BH_Dirty.)

Again correct :).

> If I actually understood your explanation/suggest, the dirty buffer
> flags should be different,
> as otherwise we'd be unconditionally setting buffers dirty on
> ext4_page_mkwrite() to later
> check for (known to be) dirty buffers in
> ext4_journalled_submit_inode_data_buffers().
> 
> ...
> 
> And as you mentioned no cleanup / keeping ext4_journalled_writepage()
> and the PageChecked bit,
> I would like to revisit two questions from the cover letter that would
> have no impact with the cleanup,
> so to confirm my understanding for the next steps.
> 
>     > 3) When checking to redirty the page in the writepage callback,
>     >    does a buffer without a journal head means we should redirty
>     >    the page? (for the reason it's not part of the committing txn)
> 
> Per your explanation about the page dirty bit for buffers that need
> checkpointing, I see we cannot redirty
> the page just because a buffer isn't part of the transaction -- the
> buffer has to be dirty -- so I think it falls
> down to your suggestion of 'also redirty if we find any dirty buffers'
> (regardless of a buffer w/out txns.) right?

Correct. It should be:

		if (buffer_dirty(bh) || (jh && ...))
			redirty
 
>     > 4) Should we clear the PageChecked bit?
>     ...
>     > Should we try to prevent that [ext4_journalled_writepage()
> running later] by, say, clearing the pagechecked bit
>     > in case we don't have to redirty the page (in the writepage callback) ?
> 
> And I think the answer is no, per your explanation about page dirty
> being set elsewhere outside of our control,
> and thus ext4_journalled_page() still needs to run, and thus the  page
> checked bit still needs to remain set; correct?

Correct.

								Honza
diff mbox series

Patch

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 10dd470876b3..978ccde8454f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1911,6 +1911,10 @@  static int __ext4_journalled_writepage(struct page *page,
 		err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
 					     write_end_fn);
 	}
+	if (ret == 0)
+		ret = err;
+	// XXX: is this correct for inline data inodes?
+	err = ext4_jbd2_inode_add_write(handle, inode, 0, len);
 	if (ret == 0)
 		ret = err;
 	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 330957ed1f05..38aaac6572ea 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -472,6 +472,66 @@  static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 	spin_unlock(&sbi->s_md_lock);
 }
 
+/*
+ * This writepage callback for write_cache_pages()
+ * takes care of a few cases after page cleaning.
+ *
+ * write_cache_pages() already checks for dirty pages
+ * and calls clear_page_dirty_for_io(), which we want,
+ * to write protect the pages.
+ *
+ * However, we have to redirty a page in two cases:
+ * 1) some buffer is not part of the committing transaction
+ * 2) some buffer already has b_next_transaction set
+ */
+
+static int ext4_journalled_writepage_callback(struct page *page,
+					      struct writeback_control *wbc,
+					      void *data)
+{
+	transaction_t *transaction = (transaction_t *) data;
+	struct buffer_head *bh, *head;
+	struct journal_head *jh;
+
+	// XXX: any chance of !bh here?
+	bh = head = page_buffers(page);
+	do {
+		jh = bh2jh(bh);
+		if (!jh || jh->b_transaction != transaction ||
+		    jh->b_next_transaction) {
+			redirty_page_for_writepage(wbc, page);
+			goto out;
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+out:
+	return AOP_WRITEPAGE_ACTIVATE;
+}
+
+static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+	transaction_t *transaction = jinode->i_transaction;
+	loff_t dirty_start = jinode->i_dirty_start;
+	loff_t dirty_end = jinode->i_dirty_end;
+
+	struct writeback_control wbc = {
+		.sync_mode =  WB_SYNC_ALL,
+		.nr_to_write = mapping->nrpages * 2,
+		.range_start = dirty_start,
+		.range_end = dirty_end,
+        };
+
+	return write_cache_pages(mapping, &wbc,
+				 ext4_journalled_writepage_callback,
+				 transaction);
+}
+
+static int ext4_journalled_finish_inode_data_buffers(struct jbd2_inode *jinode)
+{
+	return 0;
+}
+
 static bool system_going_down(void)
 {
 	return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
@@ -4599,6 +4659,11 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_msg(sb, KERN_ERR, "can't mount with "
 			"journal_async_commit in data=ordered mode");
 		goto failed_mount_wq;
+	} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+		sbi->s_journal->j_submit_inode_data_buffers =
+			ext4_journalled_submit_inode_data_buffers;
+		sbi->s_journal->j_finish_inode_data_buffers =
+			ext4_journalled_finish_inode_data_buffers;
 	}
 
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);