diff mbox series

ext4: Fix i_disksize exceeding i_size problem in paritally written case

Message ID 20230317013553.1009553-1-chengzhihao1@huawei.com
State New
Headers show
Series ext4: Fix i_disksize exceeding i_size problem in paritally written case | expand

Commit Message

Zhihao Cheng March 17, 2023, 1:35 a.m. UTC
Following process makes i_disksize exceed i_size:

generic_perform_write
 copied = iov_iter_copy_from_user_atomic(len) // copied < len
 ext4_da_write_end
 | ext4_update_i_disksize
 |  new_i_size = pos + copied;
 |  WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize) // update i_disksize
 | generic_write_end
 |  copied = block_write_end(copied, len) // copied = 0
 |   if (unlikely(copied < len))
 |    if (!PageUptodate(page))
 |     copied = 0;
 |  if (pos + copied > inode->i_size) // return false
 if (unlikely(copied == 0))
  goto again;
 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
  status = -EFAULT;
  break;
 }

We get i_disksize greater than i_size here, which could trigger WARNING
check 'i_size_read(inode) < EXT4_I(inode)->i_disksize' while doing dio:

ext4_dio_write_iter
 iomap_dio_rw
  __iomap_dio_rw // return err, length is not aligned to 512
 ext4_handle_inode_extension
  WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize) // Oops

 WARNING: CPU: 2 PID: 2609 at fs/ext4/file.c:319
 CPU: 2 PID: 2609 Comm: aa Not tainted 6.3.0-rc2
 RIP: 0010:ext4_file_write_iter+0xbc7
 Call Trace:
  vfs_write+0x3b1
  ksys_write+0x77
  do_syscall_64+0x39

Fix it by putting block_write_end() before i_disksize updating just
like ext4_write_end() does.

Fetch a reproducer in [Link].

Link: https://bugzilla.kernel.org/show_bug.cgi?id=217209
Fixes: 64769240bd07f ("ext4: Add delayed allocation support in data=writeback mode")
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
---
 fs/ext4/inode.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

Comments

Jan Kara March 17, 2023, 12:45 p.m. UTC | #1
On Fri 17-03-23 09:35:53, Zhihao Cheng wrote:
> Following process makes i_disksize exceed i_size:
> 
> generic_perform_write
>  copied = iov_iter_copy_from_user_atomic(len) // copied < len
>  ext4_da_write_end
>  | ext4_update_i_disksize
>  |  new_i_size = pos + copied;
>  |  WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize) // update i_disksize
>  | generic_write_end
>  |  copied = block_write_end(copied, len) // copied = 0
>  |   if (unlikely(copied < len))
>  |    if (!PageUptodate(page))
>  |     copied = 0;
>  |  if (pos + copied > inode->i_size) // return false
>  if (unlikely(copied == 0))
>   goto again;
>  if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
>   status = -EFAULT;
>   break;
>  }
> 
> We get i_disksize greater than i_size here, which could trigger WARNING
> check 'i_size_read(inode) < EXT4_I(inode)->i_disksize' while doing dio:
> 
> ext4_dio_write_iter
>  iomap_dio_rw
>   __iomap_dio_rw // return err, length is not aligned to 512
>  ext4_handle_inode_extension
>   WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize) // Oops
> 
>  WARNING: CPU: 2 PID: 2609 at fs/ext4/file.c:319
>  CPU: 2 PID: 2609 Comm: aa Not tainted 6.3.0-rc2
>  RIP: 0010:ext4_file_write_iter+0xbc7
>  Call Trace:
>   vfs_write+0x3b1
>   ksys_write+0x77
>   do_syscall_64+0x39
> 
> Fix it by putting block_write_end() before i_disksize updating just
> like ext4_write_end() does.
> 
> Fetch a reproducer in [Link].
> 
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=217209
> Fixes: 64769240bd07f ("ext4: Add delayed allocation support in data=writeback mode")
> Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>

Good catch (although practically this will hardly have any negative
effect). But rather than opencoding generic_write_end() I'd do:

        if (unlikely(copied < len) && !PageUptodate(page))
                copied = 0;

at the beginning of ext4_da_write_end() and that should solve these
problems as well?

								Honza

> ---
>  fs/ext4/inode.c | 32 +++++++++++++++++++++++++-------
>  1 file changed, 25 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index bf0b7dea4900..577dc23f3b78 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3136,6 +3136,8 @@ static int ext4_da_write_end(struct file *file,
>  	loff_t new_i_size;
>  	unsigned long start, end;
>  	int write_mode = (int)(unsigned long)fsdata;
> +	bool i_size_changed = false;
> +	loff_t old_size = inode->i_size;
>  
>  	if (write_mode == FALL_BACK_TO_NONDELALLOC)
>  		return ext4_write_end(file, mapping, pos,
> @@ -3148,6 +3150,8 @@ static int ext4_da_write_end(struct file *file,
>  	    ext4_has_inline_data(inode))
>  		return ext4_write_inline_data_end(inode, pos, len, copied, page);
>  
> +	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
> +
>  	start = pos & (PAGE_SIZE - 1);
>  	end = start + copied - 1;
>  
> @@ -3162,16 +3166,30 @@ static int ext4_da_write_end(struct file *file,
>  	 * check), we need to update i_disksize here as neither
>  	 * ext4_writepage() nor certain ext4_writepages() paths not
>  	 * allocating blocks update i_disksize.
> -	 *
> -	 * Note that we defer inode dirtying to generic_write_end() /
> -	 * ext4_da_write_inline_data_end().
>  	 */
>  	new_i_size = pos + copied;
> -	if (copied && new_i_size > inode->i_size &&
> -	    ext4_da_should_update_i_disksize(page, end))
> -		ext4_update_i_disksize(inode, new_i_size);
> +	if (new_i_size > inode->i_size) {
> +		i_size_write(inode, new_i_size);
> +		i_size_changed = true;
> +		if (copied && ext4_da_should_update_i_disksize(page, end))
> +			ext4_update_i_disksize(inode, new_i_size);
> +	}
> +
> +	unlock_page(page);
> +	put_page(page);
> +
> +	if (old_size < pos)
> +		pagecache_isize_extended(inode, old_size, pos);
> +	/*
> +	 * Don't mark the inode dirty under page lock. First, it unnecessarily
> +	 * makes the holding time of page lock longer. Second, it forces lock
> +	 * ordering of page lock and transaction start for journaling
> +	 * filesystems.
> +	 */
> +	if (i_size_changed)
> +		mark_inode_dirty(inode);
>  
> -	return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
> +	return copied;
>  }
>  
>  /*
> -- 
> 2.31.1
>
Zhihao Cheng March 18, 2023, 7:03 a.m. UTC | #2
Hi, Jan
> On Fri 17-03-23 09:35:53, Zhihao Cheng wrote:
>> Following process makes i_disksize exceed i_size:
>>
>> generic_perform_write
>>   copied = iov_iter_copy_from_user_atomic(len) // copied < len
>>   ext4_da_write_end
>>   | ext4_update_i_disksize
>>   |  new_i_size = pos + copied;
>>   |  WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize) // update i_disksize
>>   | generic_write_end
>>   |  copied = block_write_end(copied, len) // copied = 0
>>   |   if (unlikely(copied < len))
>>   |    if (!PageUptodate(page))
>>   |     copied = 0;
>>   |  if (pos + copied > inode->i_size) // return false
>>   if (unlikely(copied == 0))
>>    goto again;
>>   if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
>>    status = -EFAULT;
>>    break;
>>   }
>>
>> We get i_disksize greater than i_size here, which could trigger WARNING
>> check 'i_size_read(inode) < EXT4_I(inode)->i_disksize' while doing dio:
>>
>> ext4_dio_write_iter
>>   iomap_dio_rw
>>    __iomap_dio_rw // return err, length is not aligned to 512
>>   ext4_handle_inode_extension
>>    WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize) // Oops
>>
>>   WARNING: CPU: 2 PID: 2609 at fs/ext4/file.c:319
>>   CPU: 2 PID: 2609 Comm: aa Not tainted 6.3.0-rc2
>>   RIP: 0010:ext4_file_write_iter+0xbc7
>>   Call Trace:
>>    vfs_write+0x3b1
>>    ksys_write+0x77
>>    do_syscall_64+0x39
>>
>> Fix it by putting block_write_end() before i_disksize updating just
>> like ext4_write_end() does.
>>
>> Fetch a reproducer in [Link].
>>
>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=217209
>> Fixes: 64769240bd07f ("ext4: Add delayed allocation support in data=writeback mode")
>> Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
> 
> Good catch (although practically this will hardly have any negative
> effect). But rather than opencoding generic_write_end() I'd do:
> 
>          if (unlikely(copied < len) && !PageUptodate(page))
>                  copied = 0;
> 
> at the beginning of ext4_da_write_end() and that should solve these
> problems as well?
> 

Yes, your suggestion looks good, and I think we can put the checking 
just after ext4_write_inline_data_end(Line 3150)? On the one hand, we 
can pass original 'copied' value in trace_ext4_da_write_end(), one the 
other hand, ext4_write_inline_data_end() already has this checking.
Zhihao Cheng March 18, 2023, 8:51 a.m. UTC | #3
> Hi, Jan
>> On Fri 17-03-23 09:35:53, Zhihao Cheng wrote:
>>> Following process makes i_disksize exceed i_size:
>>>
>>> generic_perform_write
>>>   copied = iov_iter_copy_from_user_atomic(len) // copied < len
>>>   ext4_da_write_end
>>>   | ext4_update_i_disksize
>>>   |  new_i_size = pos + copied;
>>>   |  WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize) // update i_disksize
>>>   | generic_write_end
>>>   |  copied = block_write_end(copied, len) // copied = 0
>>>   |   if (unlikely(copied < len))
>>>   |    if (!PageUptodate(page))
>>>   |     copied = 0;
>>>   |  if (pos + copied > inode->i_size) // return false
>>>   if (unlikely(copied == 0))
>>>    goto again;
>>>   if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
>>>    status = -EFAULT;
>>>    break;
>>>   }
>>>
>>> We get i_disksize greater than i_size here, which could trigger WARNING
>>> check 'i_size_read(inode) < EXT4_I(inode)->i_disksize' while doing dio:
>>>
>>> ext4_dio_write_iter
>>>   iomap_dio_rw
>>>    __iomap_dio_rw // return err, length is not aligned to 512
>>>   ext4_handle_inode_extension
>>>    WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize) // Oops
>>>
>>>   WARNING: CPU: 2 PID: 2609 at fs/ext4/file.c:319
>>>   CPU: 2 PID: 2609 Comm: aa Not tainted 6.3.0-rc2
>>>   RIP: 0010:ext4_file_write_iter+0xbc7
>>>   Call Trace:
>>>    vfs_write+0x3b1
>>>    ksys_write+0x77
>>>    do_syscall_64+0x39
>>>
>>> Fix it by putting block_write_end() before i_disksize updating just
>>> like ext4_write_end() does.
>>>
>>> Fetch a reproducer in [Link].
>>>
>>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=217209
>>> Fixes: 64769240bd07f ("ext4: Add delayed allocation support in 
>>> data=writeback mode")
>>> Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
>>
>> Good catch (although practically this will hardly have any negative
>> effect). But rather than opencoding generic_write_end() I'd do:
>>
>>          if (unlikely(copied < len) && !PageUptodate(page))
>>                  copied = 0;
>>
>> at the beginning of ext4_da_write_end() and that should solve these
>> problems as well?
>>
> 
> Yes, your suggestion looks good, and I think we can put the checking 
> just after ext4_write_inline_data_end(Line 3150)? On the one hand, we 
> can pass original 'copied' value in trace_ext4_da_write_end(), one the 
> other hand, ext4_write_inline_data_end() already has this checking.
> .

BTW, I want send another patch as follows:
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bf0b7dea4900..570a687ae847 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3149,7 +3149,7 @@ static int ext4_da_write_end(struct file *file,
                 return ext4_write_inline_data_end(inode, pos, len, 
copied, page);

         start = pos & (PAGE_SIZE - 1);
-       end = start + copied - 1;
+       end = start + (copied ? copied - 1 : copied);

         /*
          * Since we are holding inode lock, we are sure i_disksize <=
@@ -3167,7 +3167,7 @@ static int ext4_da_write_end(struct file *file,
          * ext4_da_write_inline_data_end().
          */
         new_i_size = pos + copied;
-       if (copied && new_i_size > inode->i_size &&
+       if (new_i_size > inode->i_size &&
             ext4_da_should_update_i_disksize(page, end))
                 ext4_update_i_disksize(inode, new_i_size);

This modification handle unconsistent i_size and i_disksize imported by 
ea51d132dbf9 ("ext4: avoid hangs in ext4_da_should_update_i_disksize()").

Paritially written may display a fake inode size for user, for example: 

 

i_disksize=1 

generic_perform_write 

  copied = iov_iter_copy_from_user_atomic(len) // copied = 0 

  ext4_da_write_end // skip updating i_disksize 

  generic_write_end 

   if (pos + copied > inode->i_size) {  // 10 + 0 > 1, true 

    i_size_write(inode, pos + copied);  // i_size = 10 

   } 

 

    0 1      10                4096 

    |_|_______|_________..._____| 

      |       | 

     i_size  pos 

 

Now, user see the i_size is 10 (i_disksize is still 1). After inode 

destroyed, user will get the i_size is 1 read from disk.
Jan Kara March 20, 2023, 11:20 a.m. UTC | #4
On Sat 18-03-23 16:51:00, Zhihao Cheng wrote:
> > Hi, Jan
> > > On Fri 17-03-23 09:35:53, Zhihao Cheng wrote:
> > > > Following process makes i_disksize exceed i_size:
> > > > 
> > > > generic_perform_write
> > > >   copied = iov_iter_copy_from_user_atomic(len) // copied < len
> > > >   ext4_da_write_end
> > > >   | ext4_update_i_disksize
> > > >   |  new_i_size = pos + copied;
> > > >   |  WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize) // update i_disksize
> > > >   | generic_write_end
> > > >   |  copied = block_write_end(copied, len) // copied = 0
> > > >   |   if (unlikely(copied < len))
> > > >   |    if (!PageUptodate(page))
> > > >   |     copied = 0;
> > > >   |  if (pos + copied > inode->i_size) // return false
> > > >   if (unlikely(copied == 0))
> > > >    goto again;
> > > >   if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
> > > >    status = -EFAULT;
> > > >    break;
> > > >   }
> > > > 
> > > > We get i_disksize greater than i_size here, which could trigger WARNING
> > > > check 'i_size_read(inode) < EXT4_I(inode)->i_disksize' while doing dio:
> > > > 
> > > > ext4_dio_write_iter
> > > >   iomap_dio_rw
> > > >    __iomap_dio_rw // return err, length is not aligned to 512
> > > >   ext4_handle_inode_extension
> > > >    WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize) // Oops
> > > > 
> > > >   WARNING: CPU: 2 PID: 2609 at fs/ext4/file.c:319
> > > >   CPU: 2 PID: 2609 Comm: aa Not tainted 6.3.0-rc2
> > > >   RIP: 0010:ext4_file_write_iter+0xbc7
> > > >   Call Trace:
> > > >    vfs_write+0x3b1
> > > >    ksys_write+0x77
> > > >    do_syscall_64+0x39
> > > > 
> > > > Fix it by putting block_write_end() before i_disksize updating just
> > > > like ext4_write_end() does.
> > > > 
> > > > Fetch a reproducer in [Link].
> > > > 
> > > > Link: https://bugzilla.kernel.org/show_bug.cgi?id=217209
> > > > Fixes: 64769240bd07f ("ext4: Add delayed allocation support in
> > > > data=writeback mode")
> > > > Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
> > > 
> > > Good catch (although practically this will hardly have any negative
> > > effect). But rather than opencoding generic_write_end() I'd do:
> > > 
> > >          if (unlikely(copied < len) && !PageUptodate(page))
> > >                  copied = 0;
> > > 
> > > at the beginning of ext4_da_write_end() and that should solve these
> > > problems as well?
> > > 
> > 
> > Yes, your suggestion looks good, and I think we can put the checking
> > just after ext4_write_inline_data_end(Line 3150)? On the one hand, we
> > can pass original 'copied' value in trace_ext4_da_write_end(), one the
> > other hand, ext4_write_inline_data_end() already has this checking.
> > .
> 
> BTW, I want send another patch as follows:
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index bf0b7dea4900..570a687ae847 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3149,7 +3149,7 @@ static int ext4_da_write_end(struct file *file,
>                 return ext4_write_inline_data_end(inode, pos, len, copied,
> page);
> 
>         start = pos & (PAGE_SIZE - 1);
> -       end = start + copied - 1;
> +       end = start + (copied ? copied - 1 : copied);
> 
>         /*
>          * Since we are holding inode lock, we are sure i_disksize <=
> @@ -3167,7 +3167,7 @@ static int ext4_da_write_end(struct file *file,
>          * ext4_da_write_inline_data_end().
>          */
>         new_i_size = pos + copied;
> -       if (copied && new_i_size > inode->i_size &&
> +       if (new_i_size > inode->i_size &&
>             ext4_da_should_update_i_disksize(page, end))
>                 ext4_update_i_disksize(inode, new_i_size);
> 
> This modification handle unconsistent i_size and i_disksize imported by
> ea51d132dbf9 ("ext4: avoid hangs in ext4_da_should_update_i_disksize()").
> 
> Paritially written may display a fake inode size for user, for example:
> 
> 
> 
> i_disksize=1
> 
> generic_perform_write
> 
>  copied = iov_iter_copy_from_user_atomic(len) // copied = 0
> 
>  ext4_da_write_end // skip updating i_disksize
> 
>  generic_write_end
> 
>   if (pos + copied > inode->i_size) {  // 10 + 0 > 1, true
> 
>    i_size_write(inode, pos + copied);  // i_size = 10
> 
>   }
> 
> 
> 
>    0 1      10                4096
> 
>    |_|_______|_________..._____|
> 
>      |       |
> 
>     i_size  pos
> 
> 
> 
> Now, user see the i_size is 10 (i_disksize is still 1). After inode
> 
> destroyed, user will get the i_size is 1 read from disk.

OK, but shouldn't we rather change generic_write_end() to not increase
i_size if no write happened? Because that is what seems somewhat
problematic...

								Honza
Zhihao Cheng March 20, 2023, 12:49 p.m. UTC | #5
[...]

>> BTW, I want send another patch as follows:
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index bf0b7dea4900..570a687ae847 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -3149,7 +3149,7 @@ static int ext4_da_write_end(struct file *file,
>>                  return ext4_write_inline_data_end(inode, pos, len, copied,
>> page);
>>
>>          start = pos & (PAGE_SIZE - 1);
>> -       end = start + copied - 1;
>> +       end = start + (copied ? copied - 1 : copied);
>>
>>          /*
>>           * Since we are holding inode lock, we are sure i_disksize <=
>> @@ -3167,7 +3167,7 @@ static int ext4_da_write_end(struct file *file,
>>           * ext4_da_write_inline_data_end().
>>           */
>>          new_i_size = pos + copied;
>> -       if (copied && new_i_size > inode->i_size &&
>> +       if (new_i_size > inode->i_size &&
>>              ext4_da_should_update_i_disksize(page, end))
>>                  ext4_update_i_disksize(inode, new_i_size);
>>
>> This modification handle unconsistent i_size and i_disksize imported by
>> ea51d132dbf9 ("ext4: avoid hangs in ext4_da_should_update_i_disksize()").
>>
>> Paritially written may display a fake inode size for user, for example:
>>
>>
>>
>> i_disksize=1
>>
>> generic_perform_write
>>
>>   copied = iov_iter_copy_from_user_atomic(len) // copied = 0
>>
>>   ext4_da_write_end // skip updating i_disksize
>>
>>   generic_write_end
>>
>>    if (pos + copied > inode->i_size) {  // 10 + 0 > 1, true
>>
>>     i_size_write(inode, pos + copied);  // i_size = 10
>>
>>    }
>>
>>
>>
>>     0 1      10                4096
>>
>>     |_|_______|_________..._____|
>>
>>       |       |
>>
>>      i_size  pos
>>
>>
>>
>> Now, user see the i_size is 10 (i_disksize is still 1). After inode
>>
>> destroyed, user will get the i_size is 1 read from disk.
> 
> OK, but shouldn't we rather change generic_write_end() to not increase
> i_size if no write happened? Because that is what seems somewhat
> problematic...
> 
> 								Honza
> 

After looking through some code, I find some other places have similar 
problem:
1. In ext4_write_end(), i_size is updated by ext4 not generic_write_end().
2. The iommap framework, i_size is updated even copied is zero.
3. ubifs_write_end, i_size is updated even copied is zero.

It seems that fixing all places is not an easy work.
Jan Kara March 20, 2023, 4:24 p.m. UTC | #6
On Mon 20-03-23 20:49:07, Zhihao Cheng wrote:
> [...]
> 
> > > BTW, I want send another patch as follows:
> > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > index bf0b7dea4900..570a687ae847 100644
> > > --- a/fs/ext4/inode.c
> > > +++ b/fs/ext4/inode.c
> > > @@ -3149,7 +3149,7 @@ static int ext4_da_write_end(struct file *file,
> > >                  return ext4_write_inline_data_end(inode, pos, len, copied,
> > > page);
> > > 
> > >          start = pos & (PAGE_SIZE - 1);
> > > -       end = start + copied - 1;
> > > +       end = start + (copied ? copied - 1 : copied);
> > > 
> > >          /*
> > >           * Since we are holding inode lock, we are sure i_disksize <=
> > > @@ -3167,7 +3167,7 @@ static int ext4_da_write_end(struct file *file,
> > >           * ext4_da_write_inline_data_end().
> > >           */
> > >          new_i_size = pos + copied;
> > > -       if (copied && new_i_size > inode->i_size &&
> > > +       if (new_i_size > inode->i_size &&
> > >              ext4_da_should_update_i_disksize(page, end))
> > >                  ext4_update_i_disksize(inode, new_i_size);
> > > 
> > > This modification handle unconsistent i_size and i_disksize imported by
> > > ea51d132dbf9 ("ext4: avoid hangs in ext4_da_should_update_i_disksize()").
> > > 
> > > Paritially written may display a fake inode size for user, for example:
> > > 
> > > 
> > > 
> > > i_disksize=1
> > > 
> > > generic_perform_write
> > > 
> > >   copied = iov_iter_copy_from_user_atomic(len) // copied = 0
> > > 
> > >   ext4_da_write_end // skip updating i_disksize
> > > 
> > >   generic_write_end
> > > 
> > >    if (pos + copied > inode->i_size) {  // 10 + 0 > 1, true
> > > 
> > >     i_size_write(inode, pos + copied);  // i_size = 10
> > > 
> > >    }
> > > 
> > > 
> > > 
> > >     0 1      10                4096
> > > 
> > >     |_|_______|_________..._____|
> > > 
> > >       |       |
> > > 
> > >      i_size  pos
> > > 
> > > 
> > > 
> > > Now, user see the i_size is 10 (i_disksize is still 1). After inode
> > > 
> > > destroyed, user will get the i_size is 1 read from disk.
> > 
> > OK, but shouldn't we rather change generic_write_end() to not increase
> > i_size if no write happened? Because that is what seems somewhat
> > problematic...
> > 
> > 								Honza
> > 
> 
> After looking through some code, I find some other places have similar
> problem:
> 1. In ext4_write_end(), i_size is updated by ext4 not generic_write_end().
> 2. The iommap framework, i_size is updated even copied is zero.
> 3. ubifs_write_end, i_size is updated even copied is zero.
> 
> It seems that fixing all places is not an easy work.

Well, yeah, probably not trivial but still desirable ;). Will you look into
that?

								Honza
Zhihao Cheng March 21, 2023, 1:29 a.m. UTC | #7
> On Mon 20-03-23 20:49:07, Zhihao Cheng wrote:
>> [...]
>>
>>>> BTW, I want send another patch as follows:
>>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>>> index bf0b7dea4900..570a687ae847 100644
>>>> --- a/fs/ext4/inode.c
>>>> +++ b/fs/ext4/inode.c
>>>> @@ -3149,7 +3149,7 @@ static int ext4_da_write_end(struct file *file,
>>>>                   return ext4_write_inline_data_end(inode, pos, len, copied,
>>>> page);
>>>>
>>>>           start = pos & (PAGE_SIZE - 1);
>>>> -       end = start + copied - 1;
>>>> +       end = start + (copied ? copied - 1 : copied);
>>>>
>>>>           /*
>>>>            * Since we are holding inode lock, we are sure i_disksize <=
>>>> @@ -3167,7 +3167,7 @@ static int ext4_da_write_end(struct file *file,
>>>>            * ext4_da_write_inline_data_end().
>>>>            */
>>>>           new_i_size = pos + copied;
>>>> -       if (copied && new_i_size > inode->i_size &&
>>>> +       if (new_i_size > inode->i_size &&
>>>>               ext4_da_should_update_i_disksize(page, end))
>>>>                   ext4_update_i_disksize(inode, new_i_size);
>>>>
>>>> This modification handle unconsistent i_size and i_disksize imported by
>>>> ea51d132dbf9 ("ext4: avoid hangs in ext4_da_should_update_i_disksize()").
>>>>
>>>> Paritially written may display a fake inode size for user, for example:
>>>>
>>>>
>>>>
>>>> i_disksize=1
>>>>
>>>> generic_perform_write
>>>>
>>>>    copied = iov_iter_copy_from_user_atomic(len) // copied = 0
>>>>
>>>>    ext4_da_write_end // skip updating i_disksize
>>>>
>>>>    generic_write_end
>>>>
>>>>     if (pos + copied > inode->i_size) {  // 10 + 0 > 1, true
>>>>
>>>>      i_size_write(inode, pos + copied);  // i_size = 10
>>>>
>>>>     }
>>>>
>>>>
>>>>
>>>>      0 1      10                4096
>>>>
>>>>      |_|_______|_________..._____|
>>>>
>>>>        |       |
>>>>
>>>>       i_size  pos
>>>>
>>>>
>>>>
>>>> Now, user see the i_size is 10 (i_disksize is still 1). After inode
>>>>
>>>> destroyed, user will get the i_size is 1 read from disk.
>>>
>>> OK, but shouldn't we rather change generic_write_end() to not increase
>>> i_size if no write happened? Because that is what seems somewhat
>>> problematic...
>>>
>>> 								Honza
>>>
>>
>> After looking through some code, I find some other places have similar
>> problem:
>> 1. In ext4_write_end(), i_size is updated by ext4 not generic_write_end().
>> 2. The iommap framework, i_size is updated even copied is zero.
>> 3. ubifs_write_end, i_size is updated even copied is zero.
>>
>> It seems that fixing all places is not an easy work.
> 
> Well, yeah, probably not trivial but still desirable ;). Will you look into
> that?
> 
> 								Honza
> 


I'am happy to investigate it, maybe it will take some time, and I'm also 
glad to help review code if somebody come up a solution firstly.
diff mbox series

Patch

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bf0b7dea4900..577dc23f3b78 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3136,6 +3136,8 @@  static int ext4_da_write_end(struct file *file,
 	loff_t new_i_size;
 	unsigned long start, end;
 	int write_mode = (int)(unsigned long)fsdata;
+	bool i_size_changed = false;
+	loff_t old_size = inode->i_size;
 
 	if (write_mode == FALL_BACK_TO_NONDELALLOC)
 		return ext4_write_end(file, mapping, pos,
@@ -3148,6 +3150,8 @@  static int ext4_da_write_end(struct file *file,
 	    ext4_has_inline_data(inode))
 		return ext4_write_inline_data_end(inode, pos, len, copied, page);
 
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
 	start = pos & (PAGE_SIZE - 1);
 	end = start + copied - 1;
 
@@ -3162,16 +3166,30 @@  static int ext4_da_write_end(struct file *file,
 	 * check), we need to update i_disksize here as neither
 	 * ext4_writepage() nor certain ext4_writepages() paths not
 	 * allocating blocks update i_disksize.
-	 *
-	 * Note that we defer inode dirtying to generic_write_end() /
-	 * ext4_da_write_inline_data_end().
 	 */
 	new_i_size = pos + copied;
-	if (copied && new_i_size > inode->i_size &&
-	    ext4_da_should_update_i_disksize(page, end))
-		ext4_update_i_disksize(inode, new_i_size);
+	if (new_i_size > inode->i_size) {
+		i_size_write(inode, new_i_size);
+		i_size_changed = true;
+		if (copied && ext4_da_should_update_i_disksize(page, end))
+			ext4_update_i_disksize(inode, new_i_size);
+	}
+
+	unlock_page(page);
+	put_page(page);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		mark_inode_dirty(inode);
 
-	return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	return copied;
 }
 
 /*