Patchwork [1/2] ext4: introduce new i_write_mutex to protect fallocate

login
register
mail settings
Submitter Namjae Jeon
Date May 13, 2014, 12:19 a.m.
Message ID <001701cf6e40$fab98be0$f02ca3a0$@samsung.com>
Download mbox | patch
Permalink /patch/348179/
State Awaiting Upstream
Headers show

Comments

Namjae Jeon - May 13, 2014, 12:19 a.m.
Introduce new i_write_mutex to protect new writes from coming while doing
fallocate operations. Also, get rid of aio_mutex as it is covered by
i_write_mutex.

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
---
 fs/ext4/ext4.h    |  6 +++---
 fs/ext4/extents.c | 18 +++++++++++++++---
 fs/ext4/file.c    | 18 +++++++++++-------
 fs/ext4/inode.c   |  7 ++++++-
 fs/ext4/super.c   |  3 +--
 5 files changed, 36 insertions(+), 16 deletions(-)
Theodore Ts'o - May 26, 2014, 4:29 p.m.
On Tue, May 13, 2014 at 09:19:17AM +0900, Namjae Jeon wrote:
> Introduce new i_write_mutex to protect new writes from coming while doing
> fallocate operations. Also, get rid of aio_mutex as it is covered by
> i_write_mutex.
> 
> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>

Thanks, applied.

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Czerner - May 29, 2014, 12:42 p.m.
On Tue, 13 May 2014, Namjae Jeon wrote:

> Date: Tue, 13 May 2014 09:19:17 +0900
> From: Namjae Jeon <namjae.jeon@samsung.com>
> To: Theodore Ts'o <tytso@mit.edu>
> Cc: linux-ext4 <linux-ext4@vger.kernel.org>,
>     Lukáš Czerner <lczerner@redhat.com>,
>     Ashish Sangwan <a.sangwan@samsung.com>
> Subject: [PATCH 1/2] ext4: introduce new i_write_mutex to protect fallocate
> 
> Introduce new i_write_mutex to protect new writes from coming while doing
> fallocate operations. Also, get rid of aio_mutex as it is covered by
> i_write_mutex.

I wonder what the performance impact of this change is. Especially
since we're no longer taking the lock only in the unaligned aio/dio
case, but in all cases?

Also, against what tree is this patch ?

The description is quite sparse and I would like to see the reasoning for
this change, because it's completely missing!

Thanks!
-Lukas

> 
> Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
> Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
> ---
>  fs/ext4/ext4.h    |  6 +++---
>  fs/ext4/extents.c | 18 +++++++++++++++---
>  fs/ext4/file.c    | 18 +++++++++++-------
>  fs/ext4/inode.c   |  7 ++++++-
>  fs/ext4/super.c   |  3 +--
>  5 files changed, 36 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 6b45afa..77e5705 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -943,6 +943,9 @@ struct ext4_inode_info {
>  
>  	/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
>  	__u32 i_csum_seed;
> +
> +	/* protects fallocate operations racing with new writes */
> +	struct mutex i_write_mutex;
>  };
>  
>  /*
> @@ -2827,10 +2830,7 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
>  #define EXT4_WQ_HASH_SZ		37
>  #define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
>  					    EXT4_WQ_HASH_SZ])
> -#define ext4_aio_mutex(v)  (&ext4__aio_mutex[((unsigned long)(v)) %\
> -					     EXT4_WQ_HASH_SZ])
>  extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
> -extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
>  
>  #define EXT4_RESIZING	0
>  extern int ext4_resize_begin(struct super_block *sb);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 086baa9..5262750 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4741,6 +4741,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>  	if (!S_ISREG(inode->i_mode))
>  		return -EINVAL;
>  
> +	mutex_lock(&EXT4_I(inode)->i_write_mutex);
> +
>  	/*
>  	 * Write out all dirty pages to avoid race conditions
>  	 * Then release them.
> @@ -4748,8 +4750,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>  	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
>  		ret = filemap_write_and_wait_range(mapping, offset,
>  						   offset + len - 1);
> -		if (ret)
> +		if (ret) {
> +			mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>  			return ret;
> +		}
>  	}
>  
>  	/*
> @@ -4761,8 +4765,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>  	start = round_up(offset, 1 << blkbits);
>  	end = round_down((offset + len), 1 << blkbits);
>  
> -	if (start < offset || end > offset + len)
> +	if (start < offset || end > offset + len) {
> +		mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>  		return -EINVAL;
> +	}
>  	partial = (offset + len) & ((1 << blkbits) - 1);
>  
>  	lblk = start >> blkbits;
> @@ -4859,6 +4865,7 @@ out_dio:
>  	ext4_inode_resume_unlocked_dio(inode);
>  out_mutex:
>  	mutex_unlock(&inode->i_mutex);
> +	mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>  	return ret;
>  }
>  
> @@ -5428,11 +5435,15 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
>  	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
>  	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
>  
> +	mutex_lock(&EXT4_I(inode)->i_write_mutex);
> +
>  	/* Call ext4_force_commit to flush all data in case of data=journal. */
>  	if (ext4_should_journal_data(inode)) {
>  		ret = ext4_force_commit(inode->i_sb);
> -		if (ret)
> +		if (ret) {
> +			mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>  			return ret;
> +		}
>  	}
>  
>  	/*
> @@ -5518,5 +5529,6 @@ out_dio:
>  	ext4_inode_resume_unlocked_dio(inode);
>  out_mutex:
>  	mutex_unlock(&inode->i_mutex);
> +	mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>  	return ret;
>  }
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 708aad7..557b4ac 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -93,7 +93,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file_inode(iocb->ki_filp);
> -	struct mutex *aio_mutex = NULL;
> +	bool unaligned_direct_aio = false;
>  	struct blk_plug plug;
>  	int o_direct = file->f_flags & O_DIRECT;
>  	int overwrite = 0;
> @@ -101,6 +101,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	ssize_t ret;
>  	loff_t pos = iocb->ki_pos;
>  
> +	mutex_lock(&EXT4_I(inode)->i_write_mutex);
> +
>  	/*
>  	 * Unaligned direct AIO must be serialized; see comment above
>  	 * In the case of O_APPEND, assume that we must always serialize
> @@ -110,8 +112,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	    !is_sync_kiocb(iocb) &&
>  	    (file->f_flags & O_APPEND ||
>  	     ext4_unaligned_aio(inode, from, pos))) {
> -		aio_mutex = ext4_aio_mutex(inode);
> -		mutex_lock(aio_mutex);
> +		unaligned_direct_aio = true;
>  		ext4_unwritten_wait(inode);
>  	}
>  
> @@ -143,8 +144,9 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		iocb->private = &overwrite;
>  
>  		/* check whether we do a DIO overwrite or not */
> -		if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
> -		    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
> +		if (ext4_should_dioread_nolock(inode) &&
> +		    !unaligned_direct_aio && !file->f_mapping->nrpages &&
> +		    pos + length <= i_size_read(inode)) {
>  			struct ext4_map_blocks map;
>  			unsigned int blkbits = inode->i_blkbits;
>  			int err, len;
> @@ -174,6 +176,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  
>  	ret = __generic_file_write_iter(iocb, from);
>  	mutex_unlock(&inode->i_mutex);
> +	if (!unaligned_direct_aio)
> +		mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>  
>  	if (ret > 0) {
>  		ssize_t err;
> @@ -186,8 +190,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		blk_finish_plug(&plug);
>  
>  errout:
> -	if (aio_mutex)
> -		mutex_unlock(aio_mutex);
> +	if (unaligned_direct_aio)
> +		mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>  	return ret;
>  }
>  
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index b1dc334..d804120 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3528,6 +3528,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
>  
>  	trace_ext4_punch_hole(inode, offset, length, 0);
>  
> +	mutex_lock(&EXT4_I(inode)->i_write_mutex);
> +
>  	/*
>  	 * Write out all dirty pages to avoid race conditions
>  	 * Then release them.
> @@ -3535,8 +3537,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
>  	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
>  		ret = filemap_write_and_wait_range(mapping, offset,
>  						   offset + length - 1);
> -		if (ret)
> +		if (ret) {
> +			mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>  			return ret;
> +		}
>  	}
>  
>  	mutex_lock(&inode->i_mutex);
> @@ -3637,6 +3641,7 @@ out_dio:
>  	ext4_inode_resume_unlocked_dio(inode);
>  out_mutex:
>  	mutex_unlock(&inode->i_mutex);
> +	mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>  	return ret;
>  }
>  
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 1f8cb18..e236c85 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -904,6 +904,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
>  	atomic_set(&ei->i_ioend_count, 0);
>  	atomic_set(&ei->i_unwritten, 0);
>  	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
> +	mutex_init(&ei->i_write_mutex);
>  
>  	return &ei->vfs_inode;
>  }
> @@ -5505,7 +5506,6 @@ static void ext4_exit_feat_adverts(void)
>  
>  /* Shared across all ext4 file systems */
>  wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
> -struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
>  
>  static int __init ext4_init_fs(void)
>  {
> @@ -5518,7 +5518,6 @@ static int __init ext4_init_fs(void)
>  	ext4_check_flag_values();
>  
>  	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
> -		mutex_init(&ext4__aio_mutex[i]);
>  		init_waitqueue_head(&ext4__ioend_wq[i]);
>  	}
>  
>
Theodore Ts'o - May 29, 2014, 4:28 p.m.
On Thu, May 29, 2014 at 02:42:04PM +0200, Lukáš Czerner wrote:
> 
> I wonder what is the performance impact of this change ? Especially
> since we're not longer taking the lock only in unaligned aio/dio
> case but in all cases ?

Thinking about this some more, this is also going to break parallel
writes, which would be unfortunate.  We might want to change this to
using a rw mutex, where writes take a shared lock, and require
fallocate to take an exclusive lock....

				- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Namjae Jeon - May 31, 2014, 6:45 a.m.
> 
> On Thu, May 29, 2014 at 02:42:04PM +0200, Lukáš Czerner wrote:
> >
> > I wonder what is the performance impact of this change ? Especially
> > since we're not longer taking the lock only in unaligned aio/dio
> > case but in all cases ?
> 
> Thinking about this some more, this is also going to break parallel
> writes, which would be unfortunate.  We might want to change this to
> using a rw mutex, where writes take a shared lock, and require
> fallocate to take an exclusive lock....
ext4 file write is already serialized with the inode mutex.
So I think the impact of adding another lock will be very small.
When I ran a parallel write test with fio to verify this, I could not see any difference with/without i_write_mutex.

[job1]
ioengine=sync
buffered=1
rw=write
numjobs=100
filename=file1
rw_sequencer=sequential
size=10485760000
filesize=104857600
nrfiles=1
openfiles=100

Without i_write_mutex =>
Run status group 0 (all jobs):
  WRITE: io=10000MB, aggrb=276869KB/s, minb=2768KB/s, maxb=3530KB/s, mint=29007msec, maxt=36985msec
Run status group 0 (all jobs):
  WRITE: io=10000MB, aggrb=273862KB/s, minb=2738KB/s, maxb=3584KB/s, mint=28566msec, maxt=37391msec
Run status group 0 (all jobs):
  WRITE: io=10000MB, aggrb=271373KB/s, minb=2713KB/s, maxb=3650KB/s, mint=28048msec, maxt=37734msec
Run status group 0 (all jobs):
  WRITE: io=10000MB, aggrb=274906KB/s, minb=2749KB/s, maxb=3554KB/s, mint=28808msec, maxt=37249msec

With i_write_mutex patch applied =>
Run status group 0 (all jobs):
  WRITE: io=10000MB, aggrb=273672KB/s, minb=2736KB/s, maxb=3498KB/s, mint=29269msec, maxt=37417msec
Run status group 0 (all jobs):
  WRITE: io=10000MB, aggrb=271877KB/s, minb=2718KB/s, maxb=3401KB/s, mint=30101msec, maxt=37664msec
Run status group 0 (all jobs):
  WRITE: io=10000MB, aggrb=272753KB/s, minb=2727KB/s, maxb=3412KB/s, mint=30008msec, maxt=37543msec
Run status group 0 (all jobs):
  WRITE: io=10000MB, aggrb=274508KB/s, minb=2745KB/s, maxb=3267KB/s, mint=31335msec, maxt=37303msec

Yes, Right. We can use shared lock to remove a little bit lock contention in ext4 file write.
I will share rwsem lock patch.. Could you please revert i_write_mutex patch ?

Thanks!

> 
> 				- Ted

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - June 2, 2014, 2:38 p.m.
On Sat, May 31, 2014 at 03:45:36PM +0900, Namjae Jeon wrote:
> ext4 file write is already serialized with inode mutex.

Right, I had forgotten about that.  The case where we really care
about parallel writes is in the direct I/O case, and eventually I'd
like for us to be able to support non-overwriting/non-isize-extending
writes in parallel but we're not there yet.

> So I think the impact of adding another lock will be very very less..
> When I run parallel write test of fio to prove it, I can not see the difference on w/wo i_write_mutex.

If there is an impact, it won't show up there.  Where it will show up
will be in high scalability workloads.  For people who don't have the
half-million dollars (and up) expensive RAID arrays, a fairly good
facsimile is to use a > 16 core system, preferably a system at least 4
sockets, and say 32 or 64 gigs of memory, of which you can dedicate
half to a ramdisk.  Then run the fio scalability benchmark in that
scenario.  That way, things like cache line bounces and lock
contentions will be much more visible when the system is no longer
bottlenecked by the HDD.

> Yes, Right. We can use shared lock to remove a little bit lock contention in ext4 file write.
> I will share rwsem lock patch.. Could you please revert i_write_mutex patch ?

So the shared lock will help somewhat (since writes will be far more
common than fallocate calls) but I suspect, not all that much.  And if
I revert the i_write_mutex call, now, we won't have time to replace it
with a different patch since the merge window is already open.

And since this patch is needed to fix a xfstests failure (although
it's for collapse range in data journalling mode, so not a common
case), if we can't really see a performance loss in the much more
common server configurations, I'm inclined to leave it in for now, and
we can try to improve performance in the next kernel revision.

What do other people think?

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Namjae Jeon - June 3, 2014, 6:04 a.m.
> 
> On Sat, May 31, 2014 at 03:45:36PM +0900, Namjae Jeon wrote:
> > ext4 file write is already serialized with inode mutex.
> 
> Right, I had forgotten about that.  The case where we really care
> about parallel writes is in the direct I/O case, and eventually I'd
> like for us to be able to support non-overwriting/non-isize-extending
> writes in parallel but we're not there yet.
Okay.
> 
> > So I think the impact of adding another lock will be very very less..
> > When I run parallel write test of fio to prove it, I can not see the difference on w/wo
> i_write_mutex.
> 
> If there is an impact, it won't show up there.  Where it will show up
> will be in high scalability workloads.  For people who don't have the
> half-million dollars (and up) expensive RAID arrays, a fairly good
> facsimile is to use a > 16 core system, preferably a system at least 4
> sockets, and say 32 or 64 gigs of memory, of which you can dedicate
> half to a ramdisk.  Then run the fio scalability benchmark in that
> scenario.  That way, things like cache line bounces and lock
> contentions will be much more visible when the system is no longer
> bottleneck by the HDD.
Yes, Right. I agree that result should be measured on high-end server
as you pointed again. Unfortunately I don't have such equipment yet..
> 
> > Yes, Right. We can use shared lock to remove a little bit lock contention in ext4 file write.
> > I will share rwsem lock patch.. Could you please revert i_write_mutex patch ?
> 
> So the shared lock will help somewhat (since writes will be far more
> common than fallocate calls) but I suspect, not all that much.  And if
> I revert the i_write_mutex call, now, we won't have time to replace it
> with a different patch since the merge window is already open.
> 
> And since this patch is needed to fix a xfstests failure (although
> it's for collapse range in data journalling mode, so not a common
> case), if we can't really see a performance loss in the much more
> common server configurations, I'm inclined to leave it in for now, and
> we can try to improve performance in the next kernel revision.
IMHO, If our goal is to solve the problem of xfstests, we can use only
"ext4: fix ZERO_RANGE test failure in data journalling" patch without
i_write_mutex patch. And we can add lock for fallocate on next kernel
after checking with sufficient time.

Thanks!

> 
> What do other people think?
> 
> 						- Ted

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Czerner - June 3, 2014, 10:49 a.m.
On Tue, 3 Jun 2014, Namjae Jeon wrote:

> Date: Tue, 03 Jun 2014 15:04:32 +0900
> From: Namjae Jeon <namjae.jeon@samsung.com>
> To: 'Theodore Ts'o' <tytso@mit.edu>
> Cc: 'Lukáš Czerner' <lczerner@redhat.com>,
>     'linux-ext4' <linux-ext4@vger.kernel.org>,
>     'Ashish Sangwan' <a.sangwan@samsung.com>
> Subject: RE: [PATCH 1/2] ext4: introduce new i_write_mutex to protect
>     fallocate
> 
> > 
> > On Sat, May 31, 2014 at 03:45:36PM +0900, Namjae Jeon wrote:
> > > ext4 file write is already serialized with inode mutex.
> > 
> > Right, I had forgotten about that.  The case where we really care
> > about parallel writes is in the direct I/O case, and eventually I'd
> > like for us to be able to support non-overwriting/non-isize-extending
> > writes in parallel but we're not there yet.
> Okay.
> > 
> > > So I think the impact of adding another lock will be very very less..
> > > When I run parallel write test of fio to prove it, I can not see the difference on w/wo
> > i_write_mutex.
> > 
> > If there is an impact, it won't show up there.  Where it will show up
> > will be in high scalability workloads.  For people who don't have the
> > half-million dollars (and up) expensive RAID arrays, a fairly good
> > facsimile is to use a > 16 core system, preferably a system at least 4
> > sockets, and say 32 or 64 gigs of memory, of which you can dedicate
> > half to a ramdisk.  Then run the fio scalability benchmark in that
> > scenario.  That way, things like cache line bounces and lock
> > contentions will be much more visible when the system is no longer
> > bottleneck by the HDD.
> Yes, Right. I agree that result should be measured on high-end server
> as you pointed again. Unfortunately I don't have such equipment yet..
> > 
> > > Yes, Right. We can use shared lock to remove a little bit lock contention in ext4 file write.
> > > I will share rwsem lock patch.. Could you please revert i_write_mutex patch ?
> > 
> > So the shared lock will help somewhat (since writes will be far more
> > common than fallocate calls) but I suspect, not all that much.  And if
> > I revert the i_write_mutex call, now, we won't have time to replace it
> > with a different patch since the merge window is already open.
> > 
> > And since this patch is needed to fix a xfstests failure (although
> > it's for collapse range in data journalling mode, so not a common
> > case), if we can't really see a performance loss in the much more
> > common server configurations, I'm inclined to leave it in for now, and
> > we can try to improve performance in the next kernel revision.
> IMHO, If our goal is to solve the problem of xfstests, we can use only
> "ext4: fix ZERO_RANGE test failure in data journalling" patch without
> i_write_mutex patch. And we can add lock for fallocate on next kernel
> after checking with sufficient time.

I would rather go with this solution. The race is not terribly
critical and this way we would have more time to come up with a
proper locking also with proper locking for AIO/DIO because from my
measurement I can see only about 50% of performance that xfs can
achieve. I believe that the reason is that we're actually using the
stock VFS locking, but we should be able to do something smarter
than that.

Thanks!
-Lukas

> 
> Thanks!
> 
> > 
> > What do other people think?
> > 
> > 						- Ted
> 
>
Theodore Ts'o - June 3, 2014, 3:19 p.m.
On Tue, Jun 03, 2014 at 03:04:32PM +0900, Namjae Jeon wrote:
> IMHO, If our goal is to solve the problem of xfstests, we can use only
> "ext4: fix ZERO_RANGE test failure in data journalling" patch without
> i_write_mutex patch. And we can add lock for fallocate on next kernel
> after checking with sufficient time.

I thought this patch required i_write_mutex to avoid a race where
another thread modifies an inode while filemap_write_and_wait_range()
is running?

I agree that we could drop the i_write_mutex and add a call to
ext4_force_commit() which should make the xfstest failure rarer, but
the race would still be there, yes?

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Namjae Jeon - June 4, 2014, 5:58 a.m.
> 
> On Tue, Jun 03, 2014 at 03:04:32PM +0900, Namjae Jeon wrote:
> > IMHO, If our goal is to solve the problem of xfstests, we can use only
> > "ext4: fix ZERO_RANGE test failure in data journalling" patch without
> > i_write_mutex patch. And we can add lock for fallocate on next kernel
> > after checking with sufficient time.
> 
> I thought this patch required i_write_mutex to avoid a race where
> another thread modifies an inode while filemap_write_and_wait_range()
> is running?
Yes, Right.
> 
> I agree that we could drop the i_write_mutex and add a call to
> ext4_force_commit() which should make the xfstest failure rarer, but
> the race would still be there, yes?
Yes, it is there, but as Lukas said, it is less critical than the
possible locking overhead. So, IMHO this is not something which needs
urgent attention, and it can be tackled properly after performance
measurements on a high-end server clarify the impact.

Thanks.
> 
> 						- Ted

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - June 8, 2014, 2:48 a.m.
On Wed, Jun 04, 2014 at 02:58:06PM +0900, Namjae Jeon wrote:
> Yes, It is there but as Lukas said it is not critical than a possible
> locking overhead. So, IMHO this is not something which needs urgent
> attention and can be tackled properly after checking unclear
> performance measurement on high-end server.

Fair enough, I've done my own testing and with the following modified
version of that patch it does seem to mostly avoid the xfstest failure
(even if there still is the possibility of a race).  So I'll drop the
i_write_mutex patch for now and make the necessary changes to the "fix
ZERO_RANGE test failure in data journalling" to avoid taking the
now-removed i_write_mutex.

Cheers,

						- Ted


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6b45afa..77e5705 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -943,6 +943,9 @@  struct ext4_inode_info {
 
 	/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
 	__u32 i_csum_seed;
+
+	/* protects fallocate operations racing with new writes */
+	struct mutex i_write_mutex;
 };
 
 /*
@@ -2827,10 +2830,7 @@  static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
 #define EXT4_WQ_HASH_SZ		37
 #define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
 					    EXT4_WQ_HASH_SZ])
-#define ext4_aio_mutex(v)  (&ext4__aio_mutex[((unsigned long)(v)) %\
-					     EXT4_WQ_HASH_SZ])
 extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
 
 #define EXT4_RESIZING	0
 extern int ext4_resize_begin(struct super_block *sb);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 086baa9..5262750 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4741,6 +4741,8 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
 
+	mutex_lock(&EXT4_I(inode)->i_write_mutex);
+
 	/*
 	 * Write out all dirty pages to avoid race conditions
 	 * Then release them.
@@ -4748,8 +4750,10 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 		ret = filemap_write_and_wait_range(mapping, offset,
 						   offset + len - 1);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&EXT4_I(inode)->i_write_mutex);
 			return ret;
+		}
 	}
 
 	/*
@@ -4761,8 +4765,10 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 	start = round_up(offset, 1 << blkbits);
 	end = round_down((offset + len), 1 << blkbits);
 
-	if (start < offset || end > offset + len)
+	if (start < offset || end > offset + len) {
+		mutex_unlock(&EXT4_I(inode)->i_write_mutex);
 		return -EINVAL;
+	}
 	partial = (offset + len) & ((1 << blkbits) - 1);
 
 	lblk = start >> blkbits;
@@ -4859,6 +4865,7 @@  out_dio:
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
+	mutex_unlock(&EXT4_I(inode)->i_write_mutex);
 	return ret;
 }
 
@@ -5428,11 +5435,15 @@  int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
 	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
 
+	mutex_lock(&EXT4_I(inode)->i_write_mutex);
+
 	/* Call ext4_force_commit to flush all data in case of data=journal. */
 	if (ext4_should_journal_data(inode)) {
 		ret = ext4_force_commit(inode->i_sb);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&EXT4_I(inode)->i_write_mutex);
 			return ret;
+		}
 	}
 
 	/*
@@ -5518,5 +5529,6 @@  out_dio:
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
+	mutex_unlock(&EXT4_I(inode)->i_write_mutex);
 	return ret;
 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 708aad7..557b4ac 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -93,7 +93,7 @@  ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(iocb->ki_filp);
-	struct mutex *aio_mutex = NULL;
+	bool unaligned_direct_aio = false;
 	struct blk_plug plug;
 	int o_direct = file->f_flags & O_DIRECT;
 	int overwrite = 0;
@@ -101,6 +101,8 @@  ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t ret;
 	loff_t pos = iocb->ki_pos;
 
+	mutex_lock(&EXT4_I(inode)->i_write_mutex);
+
 	/*
 	 * Unaligned direct AIO must be serialized; see comment above
 	 * In the case of O_APPEND, assume that we must always serialize
@@ -110,8 +112,7 @@  ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	    !is_sync_kiocb(iocb) &&
 	    (file->f_flags & O_APPEND ||
 	     ext4_unaligned_aio(inode, from, pos))) {
-		aio_mutex = ext4_aio_mutex(inode);
-		mutex_lock(aio_mutex);
+		unaligned_direct_aio = true;
 		ext4_unwritten_wait(inode);
 	}
 
@@ -143,8 +144,9 @@  ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		iocb->private = &overwrite;
 
 		/* check whether we do a DIO overwrite or not */
-		if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
-		    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+		if (ext4_should_dioread_nolock(inode) &&
+		    !unaligned_direct_aio && !file->f_mapping->nrpages &&
+		    pos + length <= i_size_read(inode)) {
 			struct ext4_map_blocks map;
 			unsigned int blkbits = inode->i_blkbits;
 			int err, len;
@@ -174,6 +176,8 @@  ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	ret = __generic_file_write_iter(iocb, from);
 	mutex_unlock(&inode->i_mutex);
+	if (!unaligned_direct_aio)
+		mutex_unlock(&EXT4_I(inode)->i_write_mutex);
 
 	if (ret > 0) {
 		ssize_t err;
@@ -186,8 +190,8 @@  ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		blk_finish_plug(&plug);
 
 errout:
-	if (aio_mutex)
-		mutex_unlock(aio_mutex);
+	if (unaligned_direct_aio)
+		mutex_unlock(&EXT4_I(inode)->i_write_mutex);
 	return ret;
 }
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b1dc334..d804120 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3528,6 +3528,8 @@  int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 
 	trace_ext4_punch_hole(inode, offset, length, 0);
 
+	mutex_lock(&EXT4_I(inode)->i_write_mutex);
+
 	/*
 	 * Write out all dirty pages to avoid race conditions
 	 * Then release them.
@@ -3535,8 +3537,10 @@  int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 		ret = filemap_write_and_wait_range(mapping, offset,
 						   offset + length - 1);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&EXT4_I(inode)->i_write_mutex);
 			return ret;
+		}
 	}
 
 	mutex_lock(&inode->i_mutex);
@@ -3637,6 +3641,7 @@  out_dio:
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
+	mutex_unlock(&EXT4_I(inode)->i_write_mutex);
 	return ret;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1f8cb18..e236c85 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -904,6 +904,7 @@  static struct inode *ext4_alloc_inode(struct super_block *sb)
 	atomic_set(&ei->i_ioend_count, 0);
 	atomic_set(&ei->i_unwritten, 0);
 	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+	mutex_init(&ei->i_write_mutex);
 
 	return &ei->vfs_inode;
 }
@@ -5505,7 +5506,6 @@  static void ext4_exit_feat_adverts(void)
 
 /* Shared across all ext4 file systems */
 wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
 
 static int __init ext4_init_fs(void)
 {
@@ -5518,7 +5518,6 @@  static int __init ext4_init_fs(void)
 	ext4_check_flag_values();
 
 	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
-		mutex_init(&ext4__aio_mutex[i]);
 		init_waitqueue_head(&ext4__ioend_wq[i]);
 	}