Patchwork [RFC] vfs: always protect diretory file->fpos with inode mutex

login
register
mail settings
Submitter Zefan Li
Date Feb. 19, 2013, 1:22 a.m.
Message ID <5122D3E0.6070800@huawei.com>
Download mbox | patch
Permalink /patch/221597/
State New
Headers show

Comments

Zefan Li - Feb. 19, 2013, 1:22 a.m.
There's a long long-standing bug...As long as I don't know when it dates
from.

I've written and attached a simple program to reproduce this bug, and it can
immediately trigger the bug in my box. It uses two threads, one keeps calling
read(), and the other calling readdir(), both on the same directory fd.

When I ran it on ext3 (can be replaced with ext2/ext4) which has _dir_index_
feature disabled, I got this:

EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
...

If we configured errors=remount-ro, the filesystem will become read-only.

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	...
		loff_t pos = file_pos_read(file);
		ret = vfs_read(file, buf, count, &pos);
		file_pos_write(file, pos);
		fput_light(file, fput_needed);
	...
}

While readdir() is protected with i_mutex, f_pos can be changed without any locking
in various read()/write() syscalls, which leads to this bug.

What makes things worse is Andi removed i_mutex from generic_file_llseek, so you
can trigger the same bug by replacing read() with lseek() in the test program.

commit ef3d0fd27e90f67e35da516dafc1482c82939a60
Author: Andi Kleen <ak@linux.intel.com>
Date:   Thu Sep 15 16:06:48 2011 -0700

    vfs: do (nearly) lockless generic_file_llseek

I've tested ext3 with dir_index enabled and btrfs, nothing bad happened, but there
should be some other vulnerabilities. For example, running the test program on /sys
for a few minutes triggered this warning:

[  917.994600] ------------[ cut here ]------------
[  917.994614] WARNING: at fs/sysfs/sysfs.h:195 sysfs_readdir+0x24c/0x260()
[  917.994621] Hardware name: Tecal RH2285
...
[  917.994725] Pid: 8754, comm: a.out Not tainted 3.8.0-rc2-tj-0.7-default+ #69
[  917.994731] Call Trace:
[  917.994736]  [<ffffffff81205c6c>] ? sysfs_readdir+0x24c/0x260
[  917.994743]  [<ffffffff81205c6c>] ? sysfs_readdir+0x24c/0x260
[  917.994752]  [<ffffffff81041fff>] warn_slowpath_common+0x7f/0xc0
[  917.994759]  [<ffffffff8104205a>] warn_slowpath_null+0x1a/0x20
[  917.994766]  [<ffffffff81205c6c>] sysfs_readdir+0x24c/0x260
[  917.994774]  [<ffffffff8119cbd0>] ? sys_ioctl+0x90/0x90
[  917.994780]  [<ffffffff8119cbd0>] ? sys_ioctl+0x90/0x90
[  917.994787]  [<ffffffff8119cfc1>] vfs_readdir+0xb1/0xd0
[  917.994794]  [<ffffffff8119d07b>] sys_getdents64+0x9b/0x110
[  917.994803]  [<ffffffff814a45d9>] system_call_fastpath+0x16/0x1b
[  917.994809] ---[ end trace 6efe15a65b89022a ]---
[  917.994816] ida_remove called for id=13073 which is not allocated.


We can fix this bug in each filesystem, but can't we just make sure i_mutex is
acquired in lseek(), read(), write() and readdir() for directory file operations?

(the patch is for demonstration only)
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(int argc, char *argv[])
{
	int fd;
	int ret;
	DIR *dir;
	struct dirent *ptr;

	if (argc != 2)
		errx(1, "Please specify a directory");

	dir = opendir(argv[1]);
	if (!dir)
		err(1, "Failed to open directory %s", argv[1]);

	fd = dirfd(dir);
	if (fd < 0)
		err(1, "Failed to get dirfd");

	ret = fork();
	if (ret == 0) {
		char buf[100];

		while (1)
			read(fd, buf, 100);
	} else {
		int ret2;

		while (1) {
			ret2 = lseek(fd, 0, SEEK_SET);
			if (ret2 < -1)
				err(1, "seek failed");

			while (ptr = readdir(dir))
				;
		}
	}	

	closedir(dir);
	return 0;
}
Miao Xie - Feb. 19, 2013, 4:06 a.m.
On tue, 19 Feb 2013 09:22:40 +0800, Li Zefan wrote:
> There's a long long-standing bug...As long as I don't know when it dates
> from.
> 
> I've written and attached a simple program to reproduce this bug, and it can
> immediately trigger the bug in my box. It uses two threads, one keeps calling
> read(), and the other calling readdir(), both on the same directory fd.
> 
> When I ran it on ext3 (can be replaced with ext2/ext4) which has _dir_index_
> feature disabled, I got this:
> 
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
> ...
> 
> If we configured errors=remount-ro, the filesystem will become read-only.
> 
> SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
> {
> 	...
> 		loff_t pos = file_pos_read(file);
> 		ret = vfs_read(file, buf, count, &pos);
> 		file_pos_write(file, pos);
> 		fput_light(file, fput_needed);
> 	...
> }
> 
> While readdir() is protected with i_mutex, f_pos can be changed without any locking
> in various read()/write() syscalls, which leads to this bug.
> 
> What makes things worse is Andi removed i_mutex from generic_file_llseek, so you
> can trigger the same bug by replacing read() with lseek() in the test program.
> 
> commit ef3d0fd27e90f67e35da516dafc1482c82939a60
> Author: Andi Kleen <ak@linux.intel.com>
> Date:   Thu Sep 15 16:06:48 2011 -0700
> 
>     vfs: do (nearly) lockless generic_file_llseek
> 
> I've tested ext3 with dir_index enabled and btrfs, nothing bad happened, but there
> should be some other vulnerabilities. For example, running the test program on /sys
> for a few minutes triggered this warning:
> 
> [  917.994600] ------------[ cut here ]------------
> [  917.994614] WARNING: at fs/sysfs/sysfs.h:195 sysfs_readdir+0x24c/0x260()
> [  917.994621] Hardware name: Tecal RH2285
> ...
> [  917.994725] Pid: 8754, comm: a.out Not tainted 3.8.0-rc2-tj-0.7-default+ #69
> [  917.994731] Call Trace:
> [  917.994736]  [<ffffffff81205c6c>] ? sysfs_readdir+0x24c/0x260
> [  917.994743]  [<ffffffff81205c6c>] ? sysfs_readdir+0x24c/0x260
> [  917.994752]  [<ffffffff81041fff>] warn_slowpath_common+0x7f/0xc0
> [  917.994759]  [<ffffffff8104205a>] warn_slowpath_null+0x1a/0x20
> [  917.994766]  [<ffffffff81205c6c>] sysfs_readdir+0x24c/0x260
> [  917.994774]  [<ffffffff8119cbd0>] ? sys_ioctl+0x90/0x90
> [  917.994780]  [<ffffffff8119cbd0>] ? sys_ioctl+0x90/0x90
> [  917.994787]  [<ffffffff8119cfc1>] vfs_readdir+0xb1/0xd0
> [  917.994794]  [<ffffffff8119d07b>] sys_getdents64+0x9b/0x110
> [  917.994803]  [<ffffffff814a45d9>] system_call_fastpath+0x16/0x1b
> [  917.994809] ---[ end trace 6efe15a65b89022a ]---
> [  917.994816] ida_remove called for id=13073 which is not allocated.
> 
> 
> We can fix this bug in each filesystem, but can't we just make sure i_mutex is
> acquired in lseek(), read(), write() and readdir() for directory file operations?

I think it is unnecessary to acquire i_mutex in lseek(), read() and write(), because
we can be aware of the change of f_pos, and then get and tune the value in readdir(),
just like ext3 with dir_index enabled.

Thanks
Miao
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Feb. 19, 2013, 9:19 a.m.
On Tue 19-02-13 09:22:40, Li Zefan wrote:
> There's a long long-standing bug...As long as I don't know when it dates
> from.
> 
> I've written and attached a simple program to reproduce this bug, and it can
> immediately trigger the bug in my box. It uses two threads, one keeps calling
> read(), and the other calling readdir(), both on the same directory fd.
  So the fact that read() or even write() to fd opened O_RDONLY has *any*
effect on f_pos looks really unexpected to me. I think we really should
have there:
	if (ret >= 0)
		file_pos_write(...);
  That would solve problems with read() and write() on directories for
pretty much every filesystem since the first usually returns -EISDIR and
the second -EBADF.

> When I ran it on ext3 (can be replaced with ext2/ext4) which has _dir_index_
> feature disabled, I got this:
> 
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
> ...
> 
> If we configured errors=remount-ro, the filesystem will become read-only.
> 
> SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
> {
> 	...
> 		loff_t pos = file_pos_read(file);
> 		ret = vfs_read(file, buf, count, &pos);
> 		file_pos_write(file, pos);
> 		fput_light(file, fput_needed);
> 	...
> }
> 
> While readdir() is protected with i_mutex, f_pos can be changed without
> any locking in various read()/write() syscalls, which leads to this bug.
> 
> What makes things worse is Andi removed i_mutex from generic_file_llseek,
> so you can trigger the same bug by replacing read() with lseek() in the
> test program.
  Yes, and here I'd say it's a filesystem issue. If filesystem needs f_pos
changed only under i_mutex, it should use default_llseek() or get the mutex
itself. That's what the callback is for. We shouldn't unnecessarily impose
the i_mutex restriction on llseek on a directory for every filesystem.

								Honza
Zefan Li - Feb. 19, 2013, 11:47 a.m.
On 2013/2/19 17:19, Jan Kara wrote:
> On Tue 19-02-13 09:22:40, Li Zefan wrote:
>> There's a long long-standing bug...As long as I don't know when it dates
>> from.
>>
>> I've written and attached a simple program to reproduce this bug, and it can
>> immediately trigger the bug in my box. It uses two threads, one keeps calling
>> read(), and the other calling readdir(), both on the same directory fd.
>   So the fact that read() or even write() to fd opened O_RDONLY has *any*
> effect on f_pos looks really unexpected to me. I think we really should
> have there:
> 	if (ret >= 0)
> 		file_pos_write(...);

I thought about this. The problem is then we have to check every fop->write()
to see if any of them can return -errno with file->f_pos changed and fix them,
though it's do-able.

>   That would solve problems with read() and write() on directories for
> pretty much every filesystem since the first usually returns -EISDIR and
> the second -EBADF.

Yeah, seems ceph is the only filesystem that allows read() on directories.

> 
>> When I ran it on ext3 (can be replaced with ext2/ext4) which has _dir_index_
>> feature disabled, I got this:
>>
>> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
>> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
>> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
>> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
>> ...
>>
>> If we configured errors=remount-ro, the filesystem will become read-only.
>>
>> SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
>> {
>> 	...
>> 		loff_t pos = file_pos_read(file);
>> 		ret = vfs_read(file, buf, count, &pos);
>> 		file_pos_write(file, pos);
>> 		fput_light(file, fput_needed);
>> 	...
>> }
>>
>> While readdir() is protected with i_mutex, f_pos can be changed without
>> any locking in various read()/write() syscalls, which leads to this bug.
>>
>> What makes things worse is Andi removed i_mutex from generic_file_llseek,
>> so you can trigger the same bug by replacing read() with lseek() in the
>> test program.
>   Yes, and here I'd say it's a filesystem issue. If filesystem needs f_pos
> changed only under i_mutex, it should use default_llseek() or get the mutex
> itself. That's what the callback is for. We shouldn't unnecessarily impose
> the i_mutex restriction on llseek on a directory for every filesystem.
> 

One of my concern is, concurrent lseek() and readdir() doesn't seem to be
well tested. I'll add a test case in xfstests.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zefan Li - Feb. 19, 2013, 11:48 a.m.
On 2013/2/19 17:19, Jan Kara wrote:
> On Tue 19-02-13 09:22:40, Li Zefan wrote:
>> There's a long long-standing bug...As long as I don't know when it dates
>> from.
>>
>> I've written and attached a simple program to reproduce this bug, and it can
>> immediately trigger the bug in my box. It uses two threads, one keeps calling
>> read(), and the other calling readdir(), both on the same directory fd.
>   So the fact that read() or even write() to fd opened O_RDONLY has *any*
> effect on f_pos looks really unexpected to me. I think we really should
> have there:
> 	if (ret >= 0)
> 		file_pos_write(...);

I thought about this. The problem is then we have to check every fop->write()
to see if any of them can return -errno with file->f_pos changed and fix them,
though it's do-able.

>   That would solve problems with read() and write() on directories for
> pretty much every filesystem since the first usually returns -EISDIR and
> the second -EBADF.

Yeah, seems ceph is the only filesystem that allows read() on directories.

> 
>> When I ran it on ext3 (can be replaced with ext2/ext4) which has _dir_index_
>> feature disabled, I got this:
>>
>> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
>> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
>> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
>> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
>> ...
>>
>> If we configured errors=remount-ro, the filesystem will become read-only.
>>
>> SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
>> {
>> 	...
>> 		loff_t pos = file_pos_read(file);
>> 		ret = vfs_read(file, buf, count, &pos);
>> 		file_pos_write(file, pos);
>> 		fput_light(file, fput_needed);
>> 	...
>> }
>>
>> While readdir() is protected with i_mutex, f_pos can be changed without
>> any locking in various read()/write() syscalls, which leads to this bug.
>>
>> What makes things worse is Andi removed i_mutex from generic_file_llseek,
>> so you can trigger the same bug by replacing read() with lseek() in the
>> test program.
>   Yes, and here I'd say it's a filesystem issue. If filesystem needs f_pos
> changed only under i_mutex, it should use default_llseek() or get the mutex
> itself. That's what the callback is for. We shouldn't unnecessarily impose
> the i_mutex restriction on llseek on a directory for every filesystem.
> 

One of my concern is, concurrent lseek() and readdir() doesn't seem to be
well tested. I'll add a test case in xfstests.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zheng Liu - Feb. 19, 2013, 12:33 p.m.
On Tue, Feb 19, 2013 at 09:22:40AM +0800, Li Zefan wrote:
> There's a long long-standing bug...As long as I don't know when it dates
> from.
> 
> I've written and attached a simple program to reproduce this bug, and it can
> immediately trigger the bug in my box. It uses two threads, one keeps calling
> read(), and the other calling readdir(), both on the same directory fd.

Hi Zefan,

Out of curiosity, why do you call read(2) on a directory fd?  I only
open(2) a directory in order to execute a flush operation to make sure
that a file is really created.

Regards,
                                                - Zheng

> 
> When I ran it on ext3 (can be replaced with ext2/ext4) which has _dir_index_
> feature disabled, I got this:
> 
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
> ...
> 
> If we configured errors=remount-ro, the filesystem will become read-only.
> 
> SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
> {
> 	...
> 		loff_t pos = file_pos_read(file);
> 		ret = vfs_read(file, buf, count, &pos);
> 		file_pos_write(file, pos);
> 		fput_light(file, fput_needed);
> 	...
> }
> 
> While readdir() is protected with i_mutex, f_pos can be changed without any locking
> in various read()/write() syscalls, which leads to this bug.
> 
> What makes things worse is Andi removed i_mutex from generic_file_llseek, so you
> can trigger the same bug by replacing read() with lseek() in the test program.
> 
> commit ef3d0fd27e90f67e35da516dafc1482c82939a60
> Author: Andi Kleen <ak@linux.intel.com>
> Date:   Thu Sep 15 16:06:48 2011 -0700
> 
>     vfs: do (nearly) lockless generic_file_llseek
> 
> I've tested ext3 with dir_index enabled and btrfs, nothing bad happened, but there
> should be some other vulnerabilities. For example, running the test program on /sys
> for a few minutes triggered this warning:
> 
> [  917.994600] ------------[ cut here ]------------
> [  917.994614] WARNING: at fs/sysfs/sysfs.h:195 sysfs_readdir+0x24c/0x260()
> [  917.994621] Hardware name: Tecal RH2285
> ...
> [  917.994725] Pid: 8754, comm: a.out Not tainted 3.8.0-rc2-tj-0.7-default+ #69
> [  917.994731] Call Trace:
> [  917.994736]  [<ffffffff81205c6c>] ? sysfs_readdir+0x24c/0x260
> [  917.994743]  [<ffffffff81205c6c>] ? sysfs_readdir+0x24c/0x260
> [  917.994752]  [<ffffffff81041fff>] warn_slowpath_common+0x7f/0xc0
> [  917.994759]  [<ffffffff8104205a>] warn_slowpath_null+0x1a/0x20
> [  917.994766]  [<ffffffff81205c6c>] sysfs_readdir+0x24c/0x260
> [  917.994774]  [<ffffffff8119cbd0>] ? sys_ioctl+0x90/0x90
> [  917.994780]  [<ffffffff8119cbd0>] ? sys_ioctl+0x90/0x90
> [  917.994787]  [<ffffffff8119cfc1>] vfs_readdir+0xb1/0xd0
> [  917.994794]  [<ffffffff8119d07b>] sys_getdents64+0x9b/0x110
> [  917.994803]  [<ffffffff814a45d9>] system_call_fastpath+0x16/0x1b
> [  917.994809] ---[ end trace 6efe15a65b89022a ]---
> [  917.994816] ida_remove called for id=13073 which is not allocated.
> 
> 
> We can fix this bug in each filesystem, but can't we just make sure i_mutex is
> acquired in lseek(), read(), write() and readdir() for directory file operations?
> 
> (the patch is for demonstration only)
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index bb34af3..41f76e5 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -218,14 +218,25 @@ EXPORT_SYMBOL(default_llseek);
>  
>  loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
>  {
> +	struct inode *inode = file->f_path.dentry->d_inode;
>  	loff_t (*fn)(struct file *, loff_t, int);
> +	int ret;
>  
>  	fn = no_llseek;
>  	if (file->f_mode & FMODE_LSEEK) {
>  		if (file->f_op && file->f_op->llseek)
>  			fn = file->f_op->llseek;
>  	}
> -	return fn(file, offset, whence);
> +
> +	if (S_ISDIR(inode->i_mode)) {
> +		mutex_lock(&inode->i_mutex);
> +		ret = fn(file, offset, whence);
> +		mutex_unlock(&inode->i_mutex);
> +	} else {
> +		ret = fn(file, offset, whence);
> +	}
> +
> +	return ret;
>  }
>  EXPORT_SYMBOL(vfs_llseek);
>  
> @@ -442,12 +453,32 @@ EXPORT_SYMBOL(vfs_write);
>  
>  static inline loff_t file_pos_read(struct file *file)
>  {
> -	return file->f_pos;
> +	struct inode *inode = file->f_path.dentry->d_inode;
> +	loff_t pos;
> +
> +	if (S_ISDIR(inode->i_mode)) {
> +		mutex_lock(&inode->i_mutex);
> +		pos = file->f_pos;
> +		mutex_unlock(&inode->i_mutex);
> +	} else {
> +		pos = file->f_pos;
> +	}
> +
> +	return pos;
>  }
>  
>  static inline void file_pos_write(struct file *file, loff_t pos)
>  {
> -	file->f_pos = pos;
> +	struct inode *inode = file->f_path.dentry->d_inode;
> +
> +	if (S_ISDIR(inode->i_mode)) {
> +		mutex_lock(&inode->i_mutex);
> +		file->f_pos = pos;
> +		file->f_version = 0;
> +		mutex_unlock(&inode->i_mutex);
> +	} else {
> +		file->f_pos = pos;
> +	}
>  }
>  
>  SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
> 
> 

> #include <unistd.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <dirent.h>
> #include <fcntl.h>
> #include <sys/stat.h>
> #include <sys/types.h>
> 
> int main(int argc, char *argv[])
> {
> 	int fd;
> 	int ret;
> 	DIR *dir;
> 	struct dirent *ptr;
> 
> 	if (argc != 2)
> 		errx(1, "Please specify a directory");
> 
> 	dir = opendir(argv[1]);
> 	if (!dir)
> 		err(1, "Failed to open directory %s", argv[1]);
> 
> 	fd = dirfd(dir);
> 	if (fd < 0)
> 		err(1, "Failed to get dirfd");
> 
> 	ret = fork();
> 	if (ret == 0) {
> 		char buf[100];
> 
> 		while (1)
> 			read(fd, buf, 100);
> 	} else {
> 		int ret2;
> 
> 		while (1) {
> 			ret2 = lseek(fd, 0, SEEK_SET);
> 			if (ret2 < -1)
> 				err(1, "seek failed");
> 
> 			while (ptr = readdir(dir))
> 				;
> 		}
> 	}	
> 
> 	closedir(dir);
> 	return 0;
> }

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zefan Li - Feb. 19, 2013, 12:43 p.m.
On 2013/2/19 20:33, Zheng Liu wrote:
> On Tue, Feb 19, 2013 at 09:22:40AM +0800, Li Zefan wrote:
>> There's a long long-standing bug...As long as I don't know when it dates
>> from.
>>
>> I've written and attached a simple program to reproduce this bug, and it can
>> immediately trigger the bug in my box. It uses two threads, one keeps calling
>> read(), and the other calling readdir(), both on the same directory fd.
> 
> Hi Zefan,
> 
> Out of curiosity, why do you call read(2) on a directory fd?  I only
> open(2) a directory in order to execute a flush operation to make sure
> that a file is really created.
> 

Because something wrong happened in userspace programs.

After a thread closed a socket, another thread is still reading data from
this socket, but the socket fd has been re-used for opening directory for
readdir()!

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Feb. 19, 2013, 12:59 p.m.
On Tue 19-02-13 19:47:30, Li Zefan wrote:
> On 2013/2/19 17:19, Jan Kara wrote:
> > On Tue 19-02-13 09:22:40, Li Zefan wrote:
> >> There's a long long-standing bug...As long as I don't know when it dates
> >> from.
> >>
> >> I've written and attached a simple program to reproduce this bug, and it can
> >> immediately trigger the bug in my box. It uses two threads, one keeps calling
> >> read(), and the other calling readdir(), both on the same directory fd.
> >   So the fact that read() or even write() to fd opened O_RDONLY has *any*
> > effect on f_pos looks really unexpected to me. I think we really should
> > have there:
> > 	if (ret >= 0)
> > 		file_pos_write(...);
> 
> I thought about this. The problem is then we have to check every fop->write()
> to see if any of them can return -errno with file->f_pos changed and fix them,
> though it's do-able.
  But returning error and advancing f_pos would be a bug - specification
says write() returns the number of bytes written or -1 and f_pos should be
advanced by the number of bytes written.

> >   That would solve problems with read() and write() on directories for
> > pretty much every filesystem since the first usually returns -EISDIR and
> > the second -EBADF.
> 
> Yeah, seems ceph is the only filesystem that allows read() on directories.
> 
> >> When I ran it on ext3 (can be replaced with ext2/ext4) which has _dir_index_
> >> feature disabled, I got this:
> >>
> >> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
> >> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
> >> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=993, inode=0, rec_len=0, name_len=0
> >> EXT3-fs error (device loop1): ext3_readdir: bad entry in directory #34817: rec_len is smaller than minimal - offset=1009, inode=0, rec_len=0, name_len=0
> >> ...
> >>
> >> If we configured errors=remount-ro, the filesystem will become read-only.
> >>
> >> SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
> >> {
> >> 	...
> >> 		loff_t pos = file_pos_read(file);
> >> 		ret = vfs_read(file, buf, count, &pos);
> >> 		file_pos_write(file, pos);
> >> 		fput_light(file, fput_needed);
> >> 	...
> >> }
> >>
> >> While readdir() is protected with i_mutex, f_pos can be changed without
> >> any locking in various read()/write() syscalls, which leads to this bug.
> >>
> >> What makes things worse is Andi removed i_mutex from generic_file_llseek,
> >> so you can trigger the same bug by replacing read() with lseek() in the
> >> test program.
> >   Yes, and here I'd say it's a filesystem issue. If filesystem needs f_pos
> > changed only under i_mutex, it should use default_llseek() or get the mutex
> > itself. That's what the callback is for. We shouldn't unnecessarily impose
> > the i_mutex restriction on llseek on a directory for every filesystem.
> 
> One of my concern is, concurrent lseek() and readdir() doesn't seem to be
> well tested. I'll add a test case in xfstests.
  Yes, that might be a useful test to add.

								Honza
Zefan Li - Feb. 20, 2013, 1:49 a.m.
On 2013/2/19 20:59, Jan Kara wrote:
> On Tue 19-02-13 19:47:30, Li Zefan wrote:
>> On 2013/2/19 17:19, Jan Kara wrote:
>>> On Tue 19-02-13 09:22:40, Li Zefan wrote:
>>>> There's a long long-standing bug...As long as I don't know when it dates
>>>> from.
>>>>
>>>> I've written and attached a simple program to reproduce this bug, and it can
>>>> immediately trigger the bug in my box. It uses two threads, one keeps calling
>>>> read(), and the other calling readdir(), both on the same directory fd.
>>>   So the fact that read() or even write() to fd opened O_RDONLY has *any*
>>> effect on f_pos looks really unexpected to me. I think we really should
>>> have there:
>>> 	if (ret >= 0)
>>> 		file_pos_write(...);
>>
>> I thought about this. The problem is then we have to check every fop->write()
>> to see if any of them can return -errno with file->f_pos changed and fix them,
>> though it's do-able.
>   But returning error and advancing f_pos would be a bug - specification
> says write() returns the number of bytes written or -1 and f_pos should be
> advanced by the number of bytes written.
> 

Oh, I had an illusion that vfs saves f_pos and calls write() and restore f_pos
if write() fails.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro - Feb. 23, 2013, 5:35 p.m.
On Tue, Feb 19, 2013 at 09:22:40AM +0800, Li Zefan wrote:
> There's a long long-standing bug...As long as I don't know when it dates
> from.
> 
> I've written and attached a simple program to reproduce this bug, and it can
> immediately trigger the bug in my box. It uses two threads, one keeps calling
> read(), and the other calling readdir(), both on the same directory fd.

> While readdir() is protected with i_mutex, f_pos can be changed without any locking
> in various read()/write() syscalls, which leads to this bug.

_What_ read/write syscalls (on a directory, that is)?

> We can fix this bug in each filesystem, but can't we just make sure i_mutex is
> acquired in lseek(), read(), write() and readdir() for directory file operations?
> 
> (the patch is for demonstration only)

No.  This is a very massive overkill.  If anything, we want to *reduce* the
amount of time we hold ->i_mutex in that area.

There are several bugs mixed here:
	* disappearing exclusion between readdir and lseek for directories.
Bad, since offset validation suddenly needs to be redone every time we look
at ->f_pos in ->readdir() instances *and* since ->readdir() itself updates
position, often by file->f_pos += something.
	* write(2) doing "get a copy of f_pos, try and fail ->write(),
put that copy back".  With no locking whatsoever.  What we get is a f_pos
value reverting to what it used to be at some random earlier point.  Makes
life really nasty for everything that updates ->f_pos, obviously.
	* read(2) doing the same, *and* some directories apparently having
->read() now.

->readdir() part of that would be the simplest one - we need to stop messing
with ->f_pos and just pass an address of its copy, like we do for ->read()
et.al.  Preserving the method prototype is not worth it and this particular
method has needed an overhaul of calling conventions for many reasons.

The issue with write(2) and friends is potentially nastier.  I'm looking
at the ->f_pos users right now, and while the majority are ->readdir()
and ->llseek() instances, there's some stuff beyond that.  Some of that is
done with struct file opened kernel-side and not accessible to userland;
those are safe (and often really ugly - see drivers/media/pci/cx25821/
hits for f_pos).  Some are simply wrong - e.g. dev_mem_read()
(in drivers/net/wireless/ti/wlcore/debugfs.c) ignores *ppos value and uses
file->f_pos instead; wrong behaviour for ->read() instance.  I'm about
20% through the list; so far everything seems to be possible to deal with
(especially if we add a couple of helpers for common lseek variants and
use existing generic_file_llseek_size()), so it might turn out to be
not a serious issue, but it's a potential minefield.  Hell knows...

As for ->readdir(), I'd like to resurrect an old proposal to change the ABI
of that sucker.  Quoting the thread from 4 years ago:
====
As for the locking...  I toyed with one idea for a while: instead of passing
a callback and one of many completely different structs, how about a common
*beginning* of a struct, with callback stored in it along with several
common fields?  Namely,
        * count of filldir calls already made
        * pointer to file in question
        * "are we done" flag
And instead of calling filldir(buf, ...) ->readdir() would call one of several
helpers:
        * readdir_emit(buf, ...) - obvious
        * readdir_relax(buf) - called in a spot convenient for dropping
and retaking lock; returns whether we need to do revalidation.
        * readdir_eof(buf) - end of directory
        * maybe readdir_dot() and readdir_dotdot() - those are certainly
common enough
That's the obvious flagday stuff, but since we need to give serious beating
to most of the instances anyway...  Might as well consider something in
that direction.
====

Back then it didn't go anywhere, but if we really go for change of calling
conventions (passing a pointer to copy of ->f_pos), it would make a lot of
sense, IMO.  Note that ->i_mutex contention could be seriously relieved that
way - e.g. ext* would just call readdir_relax() at the block boundaries,
since those locations are always valid there, etc.

Comments?
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zefan Li - Feb. 25, 2013, 6:09 a.m.
>> We can fix this bug in each filesystem, but can't we just make sure i_mutex is
>> acquired in lseek(), read(), write() and readdir() for directory file operations?
>>
>> (the patch is for demonstration only)
> 
> No.  This is a very massive overkill.  If anything, we want to *reduce* the
> amount of time we hold ->i_mutex in that area.
> 
> There are several bugs mixed here:
> 	* disappearing exclusion between readdir and lseek for directories.
> Bad, since offset validation suddenly needs to be redone every time we look
> at ->f_pos in ->readdir() instances *and* since ->readdir() itself updates
> position, often by file->f_pos += something.
> 	* write(2) doing "get a copy of f_pos, try and fail ->write(),
> put that copy back".  With no locking whatsoever.  What we get is a f_pos
> value reverting to what it used to be at some random earlier point.  Makes
> life really nasty for everything that updates ->f_pos, obviously.
> 	* read(2) doing the same, *and* some directories apparently having
> ->read() now.
> 
> ->readdir() part of that would be the simplest one - we need to stop messing
> with ->f_pos and just pass an address of its copy, like we do for ->read()
> et.al.  Preserving the method prototype is not worth it and this particular
> method has needed an overhaul of calling conventions for many reasons.
> 
> The issue with write(2) and friends is potentially nastier.  I'm looking
> at the ->f_pos users right now, and while the majority are ->readdir()
> and ->llseek() instances, there's some stuff beyond that.  Some of that is
> done with struct file opened kernel-side and not accessible to userland;
> those are safe (and often really ugly - see drivers/media/pci/cx25821/
> hits for f_pos).  Some are simply wrong - e.g. dev_mem_read()
> (in drivers/net/wireless/ti/wlcore/debugfs.c) ignores *ppos value and uses
> file->f_pos instead; wrong behaviour for ->read() instance.  I'm about
> 20% through the list; so far everything seems to be possible to deal with
> (especially if we add a couple of helpers for common lseek variants and
> use existing generic_file_llseek_size()), so it might turn out to be
> not a serious issue, but it's a potential minefield.  Hell knows...
> 
> As for ->readdir(), I'd like to resurrect an old proposal to change the ABI
> of that sucker.  Quoting the thread from 4 years ago:
> ====
> As for the locking...  I toyed with one idea for a while: instead of passing
> a callback and one of many completely different structs, how about a common
> *beginning* of a struct, with callback stored in it along with several
> common fields?  Namely,
>         * count of filldir calls already made
>         * pointer to file in question
>         * "are we done" flag
> And instead of calling filldir(buf, ...) ->readdir() would call one of several
> helpers:
>         * readdir_emit(buf, ...) - obvious
>         * readdir_relax(buf) - called in a spot convenient for dropping
> and retaking lock; returns whether we need to do revalidation.
>         * readdir_eof(buf) - end of directory
>         * maybe readdir_dot() and readdir_dotdot() - those are certainly
> common enough
> That's the obvious flagday stuff, but since we need to give serious beating
> to most of the instances anyway...  Might as well consider something in
> that direction.
> ====
> 
> Back then it didn't go anywhere, but if we really go for change of calling
> conventions (passing a pointer to copy of ->f_pos), it would make a lot of
> sense, IMO.  Note that ->i_mutex contention could be seriously relieved that
> way - e.g. ext* would just call readdir_relax() at the block boundaries,
> since those locations are always valid there, etc.
> 

So there will be no lock to protect f_pos in read/write/llseek in your proposal.
Do we need to care about reading/writing fpos in 32 bit machine is not atomic?

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zach Brown - Feb. 25, 2013, 6:25 p.m.
> As for ->readdir(), I'd like to resurrect an old proposal to change the ABI
> of that sucker.  Quoting the thread from 4 years ago:

I'd love to see the readdir() interface cleaned up, yes please.

> Comments?

Hmm.  Do we want to think about letting callers copy the name to
userspace in fragments?

I was looking at current readdir paths and saw that btrfs has an
annoying wrinkle that its directory entries might be in large blocks
that span disjoint pages.  So it copies them to a kmalloced()ed buffer
to pass to filldir().

Maybe it wouldn't be worth having to track incomplete entries in the
core, but it'd be nice to get rid of this copy in btrfs.

(btrfs could also be more clever and only do this dance when the name
really does span pages.. it's very rare. :/).

- z
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/read_write.c b/fs/read_write.c
index bb34af3..41f76e5 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -218,14 +218,25 @@  EXPORT_SYMBOL(default_llseek);
 
 loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	loff_t (*fn)(struct file *, loff_t, int);
+	int ret;
 
 	fn = no_llseek;
 	if (file->f_mode & FMODE_LSEEK) {
 		if (file->f_op && file->f_op->llseek)
 			fn = file->f_op->llseek;
 	}
-	return fn(file, offset, whence);
+
+	if (S_ISDIR(inode->i_mode)) {
+		mutex_lock(&inode->i_mutex);
+		ret = fn(file, offset, whence);
+		mutex_unlock(&inode->i_mutex);
+	} else {
+		ret = fn(file, offset, whence);
+	}
+
+	return ret;
 }
 EXPORT_SYMBOL(vfs_llseek);
 
@@ -442,12 +453,32 @@  EXPORT_SYMBOL(vfs_write);
 
 static inline loff_t file_pos_read(struct file *file)
 {
-	return file->f_pos;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	loff_t pos;
+
+	if (S_ISDIR(inode->i_mode)) {
+		mutex_lock(&inode->i_mutex);
+		pos = file->f_pos;
+		mutex_unlock(&inode->i_mutex);
+	} else {
+		pos = file->f_pos;
+	}
+
+	return pos;
 }
 
 static inline void file_pos_write(struct file *file, loff_t pos)
 {
-	file->f_pos = pos;
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	if (S_ISDIR(inode->i_mode)) {
+		mutex_lock(&inode->i_mutex);
+		file->f_pos = pos;
+		file->f_version = 0;
+		mutex_unlock(&inode->i_mutex);
+	} else {
+		file->f_pos = pos;
+	}
 }
 
 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)