diff mbox

ext4: revert i_data_sum locking cleanups for dioread_nolock

Message ID 20160212182506.GB1592@localhost.localdomain
State Superseded, archived
Headers show

Commit Message

Eric Whitney Feb. 12, 2016, 6:25 p.m. UTC
Commit 2bcba4781fa3 ("ext4: get rid of EXT4_GET_BLOCKS_NO_LOCK flag")
can cause a kernel panic when xfstest generic/300 is run on a file
system mounted with dioread_nolock.  The panic is typically triggered
from check_irqs_on() (fs/buffer.c: 1272), and happens because
ext4_end_io_dio() is being called in an interrupt context rather than
from a workqueue for AIO.  This suggests that buffer_defer_completion
may not be set properly when creating an unwritten extent for async
direct I/O.

Revert the locking changes until this problem can be resolved.  Patch
applied to 4.5-rc3 and tested with a full xfstest-bld auto group run.

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
---
 fs/ext4/ext4.h              |  6 ++++--
 fs/ext4/inode.c             | 43 +++++++++++++++++++++++--------------------
 include/trace/events/ext4.h |  1 +
 3 files changed, 28 insertions(+), 22 deletions(-)

Comments

Theodore Ts'o Feb. 16, 2016, 5:08 a.m. UTC | #1
On Fri, Feb 12, 2016 at 01:25:06PM -0500, Eric Whitney wrote:
> Commit 2bcba4781fa3 ("ext4: get rid of EXT4_GET_BLOCKS_NO_LOCK flag")
> can cause a kernel panic when xfstest generic/300 is run on a file
> system mounted with dioread_nolock.  The panic is typically triggered
> from check_irqs_on() (fs/buffer.c: 1272), and happens because
> ext4_end_io_dio() is being called in an interrupt context rather than
> from a workqueue for AIO.  This suggests that buffer_defer_completion
> may not be set properly when creating an unwritten extent for async
> direct I/O.
> 
> Revert the locking changes until this problem can be resolved.  Patch
> applied to 4.5-rc3 and tested with a full xfstest-bld auto group run.
> 
> Signed-off-by: Eric Whitney <enwlinux@gmail.com>

Applied, thanks.

I was able to reliably reproduce the problem without this revert patch
using a 32-bit x86 kvm-xfstests test appliance:

   ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/kvm-xfstests/testing/root_fs.img.i386

Using the command: "kvm-xfstests -c dioread_nolock generic/300"

With this patch, the problem goes away, so reverting the patch is
clearly the right thing for now.  There is something screwy going on,
since the original commit shouldn't have made a difference, but let's
revert it first, and figure it out when we have time to investigate
more deeply.

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0662b28..36fcf2a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -563,10 +563,12 @@  enum {
 #define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 	/* Request will not result in inode size update (user for fallocate) */
 #define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080
+	/* Do not take i_data_sem locking in ext4_map_blocks */
+#define EXT4_GET_BLOCKS_NO_LOCK			0x0100
 	/* Convert written extents to unwritten */
-#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0100
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0200
 	/* Write zeros to newly created written extents */
-#define EXT4_GET_BLOCKS_ZERO			0x0200
+#define EXT4_GET_BLOCKS_ZERO			0x0300
 #define EXT4_GET_BLOCKS_CREATE_ZERO		(EXT4_GET_BLOCKS_CREATE |\
 					EXT4_GET_BLOCKS_ZERO)
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 83bc8bf..f72dc04 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -418,7 +418,8 @@  static void ext4_map_blocks_es_recheck(handle_t *handle,
 	 * out taking i_data_sem.  So at the time the unwritten extent
 	 * could be converted.
 	 */
-	down_read(&EXT4_I(inode)->i_data_sem);
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		down_read(&EXT4_I(inode)->i_data_sem);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -426,7 +427,8 @@  static void ext4_map_blocks_es_recheck(handle_t *handle,
 		retval = ext4_ind_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
-	up_read((&EXT4_I(inode)->i_data_sem));
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		up_read((&EXT4_I(inode)->i_data_sem));
 
 	/*
 	 * We don't check m_len because extent will be collpased in status
@@ -522,7 +524,8 @@  int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
 	 */
-	down_read(&EXT4_I(inode)->i_data_sem);
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		down_read(&EXT4_I(inode)->i_data_sem);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -553,7 +556,8 @@  int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		if (ret < 0)
 			retval = ret;
 	}
-	up_read((&EXT4_I(inode)->i_data_sem));
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		up_read((&EXT4_I(inode)->i_data_sem));
 
 found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -703,7 +707,7 @@  static int _ext4_get_block(struct inode *inode, sector_t iblock,
 	map.m_lblk = iblock;
 	map.m_len = bh->b_size >> inode->i_blkbits;
 
-	if (flags && !handle) {
+	if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
 		/* Direct IO write... */
 		if (map.m_len > DIO_MAX_BLOCKS)
 			map.m_len = DIO_MAX_BLOCKS;
@@ -898,6 +902,9 @@  int do_journal_get_write_access(handle_t *handle,
 	return ret;
 }
 
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create);
+
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 				  get_block_t *get_block)
@@ -3070,21 +3077,13 @@  int ext4_get_block_write(struct inode *inode, sector_t iblock,
 			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
-static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
-	int ret;
-
-	ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
+	ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
-	ret = _ext4_get_block(inode, iblock, bh_result, 0);
-	/*
-	 * Blocks should have been preallocated! ext4_file_write_iter() checks
-	 * that.
-	 */
-	WARN_ON_ONCE(!buffer_mapped(bh_result));
-
-	return ret;
+	return _ext4_get_block(inode, iblock, bh_result,
+			       EXT4_GET_BLOCKS_NO_LOCK);
 }
 
 #ifdef CONFIG_FS_DAX
@@ -3230,8 +3229,10 @@  static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	/* If we do a overwrite dio, i_mutex locking can be released */
 	overwrite = *((int *)iocb->private);
 
-	if (overwrite)
+	if (overwrite) {
+		down_read(&EXT4_I(inode)->i_data_sem);
 		inode_unlock(inode);
+	}
 
 	/*
 	 * We could direct write to holes and fallocate.
@@ -3274,7 +3275,7 @@  static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	}
 
 	if (overwrite) {
-		get_block_func = ext4_get_block_overwrite;
+		get_block_func = ext4_get_block_write_nolock;
 	} else {
 		get_block_func = ext4_get_block_write;
 		dio_flags = DIO_LOCKING;
@@ -3330,8 +3331,10 @@  retake_lock:
 	if (iov_iter_rw(iter) == WRITE)
 		inode_dio_end(inode);
 	/* take i_mutex locking again if we do a ovewrite dio */
-	if (overwrite)
+	if (overwrite) {
+		up_read(&EXT4_I(inode)->i_data_sem);
 		inode_lock(inode);
+	}
 
 	return ret;
 }
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 4e4b2fa..6599e75 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -43,6 +43,7 @@  struct extent_status;
 	{ EXT4_GET_BLOCKS_METADATA_NOFAIL,	"METADATA_NOFAIL" },	\
 	{ EXT4_GET_BLOCKS_NO_NORMALIZE,		"NO_NORMALIZE" },	\
 	{ EXT4_GET_BLOCKS_KEEP_SIZE,		"KEEP_SIZE" },		\
+	{ EXT4_GET_BLOCKS_NO_LOCK,		"NO_LOCK" },		\
 	{ EXT4_GET_BLOCKS_ZERO,			"ZERO" })
 
 #define show_mflags(flags) __print_flags(flags, "",	\