jffs2: GC deadlock reading a page that is used in jffs2_write_begin()

Submitted by Kyeong Yoo on July 4, 2017, 4:22 a.m.

Details

Message ID 149914202384.24318.7331828698981799313.stgit@kyeongy-dl.atlnz.lc
State New
Delegated to: David Woodhouse
Headers show

Commit Message

Kyeong Yoo July 4, 2017, 4:22 a.m.
GC task can deadlock in read_cache_page() because it may attempt
to release a page that is actually allocated by another task in
jffs2_write_begin().
The reason is that in jffs2_write_begin() there is a small window
a cache page is allocated for use but not set Uptodate yet.

This ends up with a deadlock between two tasks:
1) A task (e.g. file copy)
   - jffs2_write_begin() locks a cache page
   - jffs2_write_end() tries to lock "alloc_sem" from
	 jffs2_reserve_space() <-- STUCK
2) GC task (jffs2_gcd_mtd3)
   - jffs2_garbage_collect_pass() locks "alloc_sem"
   - try to lock the same cache page in read_cache_page() <-- STUCK

So to avoid this deadlock, hold "alloc_sem" in jffs2_write_begin()
while reading data in a cache page.

Signed-off-by: Kyeong Yoo <kyeong.yoo@alliedtelesis.co.nz>
---

Note: I'm resending this patch to linux-mtd.

 fs/jffs2/file.c |   40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

Comments

Kyeong Yoo July 17, 2017, 9:54 p.m.
Hi,

Can someone review this and give me any feedback?

This patch is to prevent deadlock in jffs2 GC.

Thanks,
Kyeong


On 04/07/17 16:22, Kyeong Yoo wrote:
> GC task can deadlock in read_cache_page() because it may attempt
> to release a page that is actually allocated by another task in
> jffs2_write_begin().
> The reason is that in jffs2_write_begin() there is a small window
> a cache page is allocated for use but not set Uptodate yet.
> 
> This ends up with a deadlock between two tasks:
> 1) A task (e.g. file copy)
>     - jffs2_write_begin() locks a cache page
>     - jffs2_write_end() tries to lock "alloc_sem" from
> 	 jffs2_reserve_space() <-- STUCK
> 2) GC task (jffs2_gcd_mtd3)
>     - jffs2_garbage_collect_pass() locks "alloc_sem"
>     - try to lock the same cache page in read_cache_page() <-- STUCK
> 
> So to avoid this deadlock, hold "alloc_sem" in jffs2_write_begin()
> while reading data in a cache page.
> 
> Signed-off-by: Kyeong Yoo <kyeong.yoo@alliedtelesis.co.nz>
> ---
> 
> Note: I'm resending this patch to linux-mtd.
> 
>   fs/jffs2/file.c |   40 +++++++++++++++++++++++++---------------
>   1 file changed, 25 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
> index c12476e309c6..eb4e4d784d26 100644
> --- a/fs/jffs2/file.c
> +++ b/fs/jffs2/file.c
> @@ -135,20 +135,15 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   	struct page *pg;
>   	struct inode *inode = mapping->host;
>   	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
> +	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
>   	pgoff_t index = pos >> PAGE_SHIFT;
>   	uint32_t pageofs = index << PAGE_SHIFT;
>   	int ret = 0;
>   
> -	pg = grab_cache_page_write_begin(mapping, index, flags);
> -	if (!pg)
> -		return -ENOMEM;
> -	*pagep = pg;
> -
>   	jffs2_dbg(1, "%s()\n", __func__);
>   
>   	if (pageofs > inode->i_size) {
>   		/* Make new hole frag from old EOF to new page */
> -		struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
>   		struct jffs2_raw_inode ri;
>   		struct jffs2_full_dnode *fn;
>   		uint32_t alloc_len;
> @@ -159,7 +154,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
>   					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
>   		if (ret)
> -			goto out_page;
> +			goto out_err;
>   
>   		mutex_lock(&f->sem);
>   		memset(&ri, 0, sizeof(ri));
> @@ -189,7 +184,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   			ret = PTR_ERR(fn);
>   			jffs2_complete_reservation(c);
>   			mutex_unlock(&f->sem);
> -			goto out_page;
> +			goto out_err;
>   		}
>   		ret = jffs2_add_full_dnode_to_inode(c, f, fn);
>   		if (f->metadata) {
> @@ -204,7 +199,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   			jffs2_free_full_dnode(fn);
>   			jffs2_complete_reservation(c);
>   			mutex_unlock(&f->sem);
> -			goto out_page;
> +			goto out_err;
>   		}
>   		jffs2_complete_reservation(c);
>   		inode->i_size = pageofs;
> @@ -212,6 +207,19 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   	}
>   
>   	/*
> +	 * While getting a page and reading data in, lock c->alloc_sem until
> +	 * the page is Uptodate. Otherwise GC task may attempt to read the same
> +	 * page in read_cache_page(), which causes a deadlock.
> +	 */
> +	mutex_lock(&c->alloc_sem);
> +	pg = grab_cache_page_write_begin(mapping, index, flags);
> +	if (!pg) {
> +		ret = -ENOMEM;
> +		goto release_sem;
> +	}
> +	*pagep = pg;
> +
> +	/*
>   	 * Read in the page if it wasn't already present. Cannot optimize away
>   	 * the whole page write case until jffs2_write_end can handle the
>   	 * case of a short-copy.
> @@ -220,15 +228,17 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   		mutex_lock(&f->sem);
>   		ret = jffs2_do_readpage_nolock(inode, pg);
>   		mutex_unlock(&f->sem);
> -		if (ret)
> -			goto out_page;
> +		if (ret) {
> +			unlock_page(pg);
> +			put_page(pg);
> +			goto release_sem;
> +		}
>   	}
>   	jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
> -	return ret;
>   
> -out_page:
> -	unlock_page(pg);
> -	put_page(pg);
> +release_sem:
> +	mutex_unlock(&c->alloc_sem);
> +out_err:
>   	return ret;
>   }
>   
>
richard -rw- weinberger July 18, 2017, 7:17 a.m.
Kyeong Yoo,

On Mon, Jul 17, 2017 at 11:54 PM, Kyeong Yoo
<Kyeong.Yoo@alliedtelesis.co.nz> wrote:
> Hi,
>
> Can someone review this and give me any feedback?
>
> This patch is to prevent deadlock in jffs2 GC.
>
> Thanks,
> Kyeong
>
>
> On 04/07/17 16:22, Kyeong Yoo wrote:
>> GC task can deadlock in read_cache_page() because it may attempt
>> to release a page that is actually allocated by another task in
>> jffs2_write_begin().
>> The reason is that in jffs2_write_begin() there is a small window
>> a cache page is allocated for use but not set Uptodate yet.
>>
>> This ends up with a deadlock between two tasks:
>> 1) A task (e.g. file copy)
>>     - jffs2_write_begin() locks a cache page
>>     - jffs2_write_end() tries to lock "alloc_sem" from
>>        jffs2_reserve_space() <-- STUCK
>> 2) GC task (jffs2_gcd_mtd3)
>>     - jffs2_garbage_collect_pass() locks "alloc_sem"
>>     - try to lock the same cache page in read_cache_page() <-- STUCK

Your description does not match the code. Are you seeing this on a
recent mainline kernel?
Can t be that your kernel does not has this commit?
commit 157078f64b8a9cd7011b6b900b2f2498df850748
Author: Thomas Betker <thomas.betker@rohde-schwarz.com>
Date:   Tue Nov 10 22:18:15 2015 +0100

    Revert "jffs2: Fix lock acquisition order bug in jffs2_write_begin"

    This reverts commit 5ffd3412ae55
    ("jffs2: Fix lock acquisition order bug in jffs2_write_begin").
Kyeong Yoo July 18, 2017, 11 p.m.
Hi Richard,

This patch was on top of v4.12 and still applies cleanly to the latest 
mainline which has the following commit at the top:

commit 74cbd96bc2e00f5daa805e2ebf49e998f7045062 (origin/master, origin/HEAD)
Merge: bef85bd7db26 6409e84ec58f
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue Jul 18 11:51:08 2017 -0700

     Merge tag 'md/4.13-rc2' of 
git://git.kernel.org/pub/scm/linux/kernel/git/shli/md


And I can see the commit you mentioned exists in my repo.

  commit 157078f64b8a9cd7011b6b900b2f2498df850748
  Author: Thomas Betker <thomas.betker@rohde-schwarz.com>
  Date:   Tue Nov 10 22:18:15 2015 +0100

     Revert "jffs2: Fix lock acquisition order bug in jffs2_write_begin"


The logic of deadlock still exists with the above commit, that's what I 
described in the patch commit message. Actual kernel version I saw the 
issue is 4.4.6. But what I found is the logic regarding locking is same 
in the current mainline.
For you information, attached the kernel panic info below.

Thanks,
Kyeong

<3>INFO: task jffs2_gcd_mtd3:190 blocked for more than 120 seconds.
<3>      Tainted: P           O    4.4.6-at1 #1
<3>"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
<6>jffs2_gcd_mtd3  D 00000000     0   190      2 0x00000800
<6>Call Trace:
<6>[a712faf0] [a4003180] 0xa4003180 (unreliable)
<6>[a712fbb0] [8068ea54] __schedule+0x1f4/0x5f0
<6>[a712fbf0] [8068ee84] schedule+0x34/0xb0
<6>[a712fc00] [80691b48] schedule_timeout+0x188/0x1d0
<6>[a712fc50] [8068e824] io_schedule_timeout+0x84/0xc0
<6>[a712fc70] [8068f7c0] bit_wait_io+0x20/0x70
<6>[a712fc80] [8068f584] __wait_on_bit_lock+0xa4/0x140
<6>[a712fcb0] [800b51bc] __lock_page+0x9c/0xb0
<6>[a712fce0] [800b58c8] do_read_cache_page+0x168/0x200
<6>[a712fd20] [8022b670] jffs2_gc_fetch_page+0x30/0xd0
<6>[a712fd30] [80694934] jffs2_garbage_collect_live+0x958/0xe64
<6>[a712fde0] [80228934] jffs2_garbage_collect_pass+0x454/0x7e0
<6>[a712fe40] [8022a290] jffs2_garbage_collect_thread+0xc0/0x1d0
<6>[a712fef0] [8003fb38] kthread+0xc8/0xe0
<6>[a712ff40] [8000eed8] ret_from_kernel_thread+0x5c/0x64
<3>INFO: task syslog-ng:483 blocked for more than 120 seconds.
<3>      Tainted: P           O    4.4.6-at1 #1
<3>"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
<6>syslog-ng       D 0fc2101c     0   483      1 0x00000000
<6>Call Trace:
<6>[a663bb60] [a6346840] 0xa6346840 (unreliable)
<6>[a663bc20] [8068ea54] __schedule+0x1f4/0x5f0
<6>[a663bc60] [8068ee84] schedule+0x34/0xb0
<6>[a663bc70] [8068f230] schedule_preempt_disabled+0x10/0x20
<6>[a663bc80] [806908a0] __mutex_lock_slowpath+0xa0/0x140
<6>[a663bcc0] [80690994] mutex_lock+0x54/0x70
<6>[a663bcd0] [802237e0] jffs2_reserve_space+0x40/0x310
<6>[a663bd40] [802263b0] jffs2_write_inode_range+0xe0/0x2e0
<6>[a663bda0] [80220c04] jffs2_write_end+0xf4/0x2d0
<6>[a663bde0] [800b62b4] generic_perform_write+0x114/0x200
<6>[a663be40] [800b772c] __generic_file_write_iter+0x1bc/0x230
<6>[a663be80] [800b78b0] generic_file_write_iter+0x110/0x300
<6>[a663bea0] [80103cb0] __vfs_write+0xc0/0x130
<6>[a663bef0] [80104660] vfs_write+0xa0/0x1b0
<6>[a663bf10] [8010506c] SyS_write+0x4c/0xd0
<6>[a663bf40] [8000eda0] ret_from_syscall+0x0/0x3c
<6>--- interrupt: c01 at 0xfc2101c
<6>    LR = 0x10023e94
<3>INFO: task scp:15190 blocked for more than 120 seconds.
<3>      Tainted: P           O    4.4.6-at1 #1
<3>"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
<6>scp             D 1fdb3b4c     0 15190  15189 0x00000000
<6>Call Trace:
<6>[a468fb60] [0929effc] 0x929effc (unreliable)
<6>[a468fc20] [8068ea54] __schedule+0x1f4/0x5f0
<6>[a468fc60] [8068ee84] schedule+0x34/0xb0
<6>[a468fc70] [8068f230] schedule_preempt_disabled+0x10/0x20
<6>[a468fc80] [806908a0] __mutex_lock_slowpath+0xa0/0x140
<6>[a468fcc0] [80690994] mutex_lock+0x54/0x70
<6>[a468fcd0] [802237e0] jffs2_reserve_space+0x40/0x310
<6>[a468fd40] [802263b0] jffs2_write_inode_range+0xe0/0x2e0
<6>[a468fda0] [80220c04] jffs2_write_end+0xf4/0x2d0
<6>[a468fde0] [800b62b4] generic_perform_write+0x114/0x200
<6>[a468fe40] [800b772c] __generic_file_write_iter+0x1bc/0x230
<6>[a468fe80] [800b78b0] generic_file_write_iter+0x110/0x300
<6>[a468fea0] [80103cb0] __vfs_write+0xc0/0x130
<6>[a468fef0] [80104660] vfs_write+0xa0/0x1b0
<6>[a468ff10] [8010506c] SyS_write+0x4c/0xd0
<6>[a468ff40] [8000eda0] ret_from_syscall+0x0/0x3c
<6>--- interrupt: c01 at 0x1fdb3b4c
<6>    LR = 0x2021adcc

On 18/07/17 19:17, Richard Weinberger wrote:
> Kyeong Yoo,
> 
> On Mon, Jul 17, 2017 at 11:54 PM, Kyeong Yoo
> <Kyeong.Yoo@alliedtelesis.co.nz> wrote:
>> Hi,
>>
>> Can someone review this and give me any feedback?
>>
>> This patch is to prevent deadlock in jffs2 GC.
>>
>> Thanks,
>> Kyeong
>>
>>
>> On 04/07/17 16:22, Kyeong Yoo wrote:
>>> GC task can deadlock in read_cache_page() because it may attempt
>>> to release a page that is actually allocated by another task in
>>> jffs2_write_begin().
>>> The reason is that in jffs2_write_begin() there is a small window
>>> a cache page is allocated for use but not set Uptodate yet.
>>>
>>> This ends up with a deadlock between two tasks:
>>> 1) A task (e.g. file copy)
>>>      - jffs2_write_begin() locks a cache page
>>>      - jffs2_write_end() tries to lock "alloc_sem" from
>>>         jffs2_reserve_space() <-- STUCK
>>> 2) GC task (jffs2_gcd_mtd3)
>>>      - jffs2_garbage_collect_pass() locks "alloc_sem"
>>>      - try to lock the same cache page in read_cache_page() <-- STUCK
> 
> Your description does not match the code. Are you seeing this on a
> recent mainline kernel?
> Can t be that your kernel does not has this commit?
> commit 157078f64b8a9cd7011b6b900b2f2498df850748
> Author: Thomas Betker <thomas.betker@rohde-schwarz.com>
> Date:   Tue Nov 10 22:18:15 2015 +0100
> 
>      Revert "jffs2: Fix lock acquisition order bug in jffs2_write_begin"
> 
>      This reverts commit 5ffd3412ae55
>      ("jffs2: Fix lock acquisition order bug in jffs2_write_begin").
> 
>

Patch hide | download patch | download mbox

diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index c12476e309c6..eb4e4d784d26 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -135,20 +135,15 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	struct page *pg;
 	struct inode *inode = mapping->host;
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	pgoff_t index = pos >> PAGE_SHIFT;
 	uint32_t pageofs = index << PAGE_SHIFT;
 	int ret = 0;
 
-	pg = grab_cache_page_write_begin(mapping, index, flags);
-	if (!pg)
-		return -ENOMEM;
-	*pagep = pg;
-
 	jffs2_dbg(1, "%s()\n", __func__);
 
 	if (pageofs > inode->i_size) {
 		/* Make new hole frag from old EOF to new page */
-		struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 		struct jffs2_raw_inode ri;
 		struct jffs2_full_dnode *fn;
 		uint32_t alloc_len;
@@ -159,7 +154,7 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
 					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 		if (ret)
-			goto out_page;
+			goto out_err;
 
 		mutex_lock(&f->sem);
 		memset(&ri, 0, sizeof(ri));
@@ -189,7 +184,7 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 			ret = PTR_ERR(fn);
 			jffs2_complete_reservation(c);
 			mutex_unlock(&f->sem);
-			goto out_page;
+			goto out_err;
 		}
 		ret = jffs2_add_full_dnode_to_inode(c, f, fn);
 		if (f->metadata) {
@@ -204,7 +199,7 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 			jffs2_free_full_dnode(fn);
 			jffs2_complete_reservation(c);
 			mutex_unlock(&f->sem);
-			goto out_page;
+			goto out_err;
 		}
 		jffs2_complete_reservation(c);
 		inode->i_size = pageofs;
@@ -212,6 +207,19 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	}
 
 	/*
+	 * While getting a page and reading data in, lock c->alloc_sem until
+	 * the page is Uptodate. Otherwise GC task may attempt to read the same
+	 * page in read_cache_page(), which causes a deadlock.
+	 */
+	mutex_lock(&c->alloc_sem);
+	pg = grab_cache_page_write_begin(mapping, index, flags);
+	if (!pg) {
+		ret = -ENOMEM;
+		goto release_sem;
+	}
+	*pagep = pg;
+
+	/*
 	 * Read in the page if it wasn't already present. Cannot optimize away
 	 * the whole page write case until jffs2_write_end can handle the
 	 * case of a short-copy.
@@ -220,15 +228,17 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 		mutex_lock(&f->sem);
 		ret = jffs2_do_readpage_nolock(inode, pg);
 		mutex_unlock(&f->sem);
-		if (ret)
-			goto out_page;
+		if (ret) {
+			unlock_page(pg);
+			put_page(pg);
+			goto release_sem;
+		}
 	}
 	jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
-	return ret;
 
-out_page:
-	unlock_page(pg);
-	put_page(pg);
+release_sem:
+	mutex_unlock(&c->alloc_sem);
+out_err:
 	return ret;
 }