jffs2: GC deadlock reading a page that is used in jffs2_write_begin()

Message ID 149914202384.24318.7331828698981799313.stgit@kyeongy-dl.atlnz.lc
State New
Delegated to: David Woodhouse
Headers show

Commit Message

Kyeong Yoo July 4, 2017, 4:22 a.m.
GC task can deadlock in read_cache_page() because it may attempt
to release a page that is actually allocated by another task in
jffs2_write_begin().
The reason is that in jffs2_write_begin() there is a small window
a cache page is allocated for use but not set Uptodate yet.

This ends up with a deadlock between two tasks:
1) A task (e.g. file copy)
   - jffs2_write_begin() locks a cache page
   - jffs2_write_end() tries to lock "alloc_sem" from
	 jffs2_reserve_space() <-- STUCK
2) GC task (jffs2_gcd_mtd3)
   - jffs2_garbage_collect_pass() locks "alloc_sem"
   - try to lock the same cache page in read_cache_page() <-- STUCK

So to avoid this deadlock, hold "alloc_sem" in jffs2_write_begin()
while reading data in a cache page.

Signed-off-by: Kyeong Yoo <kyeong.yoo@alliedtelesis.co.nz>
---

Note: I'm resending this patch to linux-mtd.

 fs/jffs2/file.c |   40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

Comments

Kyeong Yoo July 17, 2017, 9:54 p.m. | #1
Hi,

Can someone review this and give me any feedback?

This patch is to prevent deadlock in jffs2 GC.

Thanks,
Kyeong


On 04/07/17 16:22, Kyeong Yoo wrote:
> GC task can deadlock in read_cache_page() because it may attempt
> to release a page that is actually allocated by another task in
> jffs2_write_begin().
> The reason is that in jffs2_write_begin() there is a small window
> a cache page is allocated for use but not set Uptodate yet.
> 
> This ends up with a deadlock between two tasks:
> 1) A task (e.g. file copy)
>     - jffs2_write_begin() locks a cache page
>     - jffs2_write_end() tries to lock "alloc_sem" from
> 	 jffs2_reserve_space() <-- STUCK
> 2) GC task (jffs2_gcd_mtd3)
>     - jffs2_garbage_collect_pass() locks "alloc_sem"
>     - try to lock the same cache page in read_cache_page() <-- STUCK
> 
> So to avoid this deadlock, hold "alloc_sem" in jffs2_write_begin()
> while reading data in a cache page.
> 
> Signed-off-by: Kyeong Yoo <kyeong.yoo@alliedtelesis.co.nz>
> ---
> 
> Note: I'm resending this patch to linux-mtd.
> 
>   fs/jffs2/file.c |   40 +++++++++++++++++++++++++---------------
>   1 file changed, 25 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
> index c12476e309c6..eb4e4d784d26 100644
> --- a/fs/jffs2/file.c
> +++ b/fs/jffs2/file.c
> @@ -135,20 +135,15 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   	struct page *pg;
>   	struct inode *inode = mapping->host;
>   	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
> +	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
>   	pgoff_t index = pos >> PAGE_SHIFT;
>   	uint32_t pageofs = index << PAGE_SHIFT;
>   	int ret = 0;
>   
> -	pg = grab_cache_page_write_begin(mapping, index, flags);
> -	if (!pg)
> -		return -ENOMEM;
> -	*pagep = pg;
> -
>   	jffs2_dbg(1, "%s()\n", __func__);
>   
>   	if (pageofs > inode->i_size) {
>   		/* Make new hole frag from old EOF to new page */
> -		struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
>   		struct jffs2_raw_inode ri;
>   		struct jffs2_full_dnode *fn;
>   		uint32_t alloc_len;
> @@ -159,7 +154,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
>   					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
>   		if (ret)
> -			goto out_page;
> +			goto out_err;
>   
>   		mutex_lock(&f->sem);
>   		memset(&ri, 0, sizeof(ri));
> @@ -189,7 +184,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   			ret = PTR_ERR(fn);
>   			jffs2_complete_reservation(c);
>   			mutex_unlock(&f->sem);
> -			goto out_page;
> +			goto out_err;
>   		}
>   		ret = jffs2_add_full_dnode_to_inode(c, f, fn);
>   		if (f->metadata) {
> @@ -204,7 +199,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   			jffs2_free_full_dnode(fn);
>   			jffs2_complete_reservation(c);
>   			mutex_unlock(&f->sem);
> -			goto out_page;
> +			goto out_err;
>   		}
>   		jffs2_complete_reservation(c);
>   		inode->i_size = pageofs;
> @@ -212,6 +207,19 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   	}
>   
>   	/*
> +	 * While getting a page and reading data in, lock c->alloc_sem until
> +	 * the page is Uptodate. Otherwise GC task may attempt to read the same
> +	 * page in read_cache_page(), which causes a deadlock.
> +	 */
> +	mutex_lock(&c->alloc_sem);
> +	pg = grab_cache_page_write_begin(mapping, index, flags);
> +	if (!pg) {
> +		ret = -ENOMEM;
> +		goto release_sem;
> +	}
> +	*pagep = pg;
> +
> +	/*
>   	 * Read in the page if it wasn't already present. Cannot optimize away
>   	 * the whole page write case until jffs2_write_end can handle the
>   	 * case of a short-copy.
> @@ -220,15 +228,17 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
>   		mutex_lock(&f->sem);
>   		ret = jffs2_do_readpage_nolock(inode, pg);
>   		mutex_unlock(&f->sem);
> -		if (ret)
> -			goto out_page;
> +		if (ret) {
> +			unlock_page(pg);
> +			put_page(pg);
> +			goto release_sem;
> +		}
>   	}
>   	jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
> -	return ret;
>   
> -out_page:
> -	unlock_page(pg);
> -	put_page(pg);
> +release_sem:
> +	mutex_unlock(&c->alloc_sem);
> +out_err:
>   	return ret;
>   }
>   
>
Richard Weinberger July 18, 2017, 7:17 a.m. | #2
Kyeong Yoo,

On Mon, Jul 17, 2017 at 11:54 PM, Kyeong Yoo
<Kyeong.Yoo@alliedtelesis.co.nz> wrote:
> Hi,
>
> Can someone review this and give me any feedback?
>
> This patch is to prevent deadlock in jffs2 GC.
>
> Thanks,
> Kyeong
>
>
> On 04/07/17 16:22, Kyeong Yoo wrote:
>> GC task can deadlock in read_cache_page() because it may attempt
>> to release a page that is actually allocated by another task in
>> jffs2_write_begin().
>> The reason is that in jffs2_write_begin() there is a small window
>> a cache page is allocated for use but not set Uptodate yet.
>>
>> This ends up with a deadlock between two tasks:
>> 1) A task (e.g. file copy)
>>     - jffs2_write_begin() locks a cache page
>>     - jffs2_write_end() tries to lock "alloc_sem" from
>>        jffs2_reserve_space() <-- STUCK
>> 2) GC task (jffs2_gcd_mtd3)
>>     - jffs2_garbage_collect_pass() locks "alloc_sem"
>>     - try to lock the same cache page in read_cache_page() <-- STUCK

Your description does not match the code. Are you seeing this on a
recent mainline kernel?
Can t be that your kernel does not has this commit?
commit 157078f64b8a9cd7011b6b900b2f2498df850748
Author: Thomas Betker <thomas.betker@rohde-schwarz.com>
Date:   Tue Nov 10 22:18:15 2015 +0100

    Revert "jffs2: Fix lock acquisition order bug in jffs2_write_begin"

    This reverts commit 5ffd3412ae55
    ("jffs2: Fix lock acquisition order bug in jffs2_write_begin").
Kyeong Yoo July 18, 2017, 11 p.m. | #3
Hi Richard,

This patch was on top of v4.12 and still applies cleanly to the latest 
mainline which has the following commit at the top:

commit 74cbd96bc2e00f5daa805e2ebf49e998f7045062 (origin/master, origin/HEAD)
Merge: bef85bd7db26 6409e84ec58f
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue Jul 18 11:51:08 2017 -0700

     Merge tag 'md/4.13-rc2' of 
git://git.kernel.org/pub/scm/linux/kernel/git/shli/md


And I can see the commit you mentioned exists in my repo.

  commit 157078f64b8a9cd7011b6b900b2f2498df850748
  Author: Thomas Betker <thomas.betker@rohde-schwarz.com>
  Date:   Tue Nov 10 22:18:15 2015 +0100

     Revert "jffs2: Fix lock acquisition order bug in jffs2_write_begin"


The logic of deadlock still exists with the above commit, that's what I 
described in the patch commit message. Actual kernel version I saw the 
issue is 4.4.6. But what I found is the logic regarding locking is same 
in the current mainline.
For you information, attached the kernel panic info below.

Thanks,
Kyeong

<3>INFO: task jffs2_gcd_mtd3:190 blocked for more than 120 seconds.
<3>      Tainted: P           O    4.4.6-at1 #1
<3>"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
<6>jffs2_gcd_mtd3  D 00000000     0   190      2 0x00000800
<6>Call Trace:
<6>[a712faf0] [a4003180] 0xa4003180 (unreliable)
<6>[a712fbb0] [8068ea54] __schedule+0x1f4/0x5f0
<6>[a712fbf0] [8068ee84] schedule+0x34/0xb0
<6>[a712fc00] [80691b48] schedule_timeout+0x188/0x1d0
<6>[a712fc50] [8068e824] io_schedule_timeout+0x84/0xc0
<6>[a712fc70] [8068f7c0] bit_wait_io+0x20/0x70
<6>[a712fc80] [8068f584] __wait_on_bit_lock+0xa4/0x140
<6>[a712fcb0] [800b51bc] __lock_page+0x9c/0xb0
<6>[a712fce0] [800b58c8] do_read_cache_page+0x168/0x200
<6>[a712fd20] [8022b670] jffs2_gc_fetch_page+0x30/0xd0
<6>[a712fd30] [80694934] jffs2_garbage_collect_live+0x958/0xe64
<6>[a712fde0] [80228934] jffs2_garbage_collect_pass+0x454/0x7e0
<6>[a712fe40] [8022a290] jffs2_garbage_collect_thread+0xc0/0x1d0
<6>[a712fef0] [8003fb38] kthread+0xc8/0xe0
<6>[a712ff40] [8000eed8] ret_from_kernel_thread+0x5c/0x64
<3>INFO: task syslog-ng:483 blocked for more than 120 seconds.
<3>      Tainted: P           O    4.4.6-at1 #1
<3>"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
<6>syslog-ng       D 0fc2101c     0   483      1 0x00000000
<6>Call Trace:
<6>[a663bb60] [a6346840] 0xa6346840 (unreliable)
<6>[a663bc20] [8068ea54] __schedule+0x1f4/0x5f0
<6>[a663bc60] [8068ee84] schedule+0x34/0xb0
<6>[a663bc70] [8068f230] schedule_preempt_disabled+0x10/0x20
<6>[a663bc80] [806908a0] __mutex_lock_slowpath+0xa0/0x140
<6>[a663bcc0] [80690994] mutex_lock+0x54/0x70
<6>[a663bcd0] [802237e0] jffs2_reserve_space+0x40/0x310
<6>[a663bd40] [802263b0] jffs2_write_inode_range+0xe0/0x2e0
<6>[a663bda0] [80220c04] jffs2_write_end+0xf4/0x2d0
<6>[a663bde0] [800b62b4] generic_perform_write+0x114/0x200
<6>[a663be40] [800b772c] __generic_file_write_iter+0x1bc/0x230
<6>[a663be80] [800b78b0] generic_file_write_iter+0x110/0x300
<6>[a663bea0] [80103cb0] __vfs_write+0xc0/0x130
<6>[a663bef0] [80104660] vfs_write+0xa0/0x1b0
<6>[a663bf10] [8010506c] SyS_write+0x4c/0xd0
<6>[a663bf40] [8000eda0] ret_from_syscall+0x0/0x3c
<6>--- interrupt: c01 at 0xfc2101c
<6>    LR = 0x10023e94
<3>INFO: task scp:15190 blocked for more than 120 seconds.
<3>      Tainted: P           O    4.4.6-at1 #1
<3>"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
<6>scp             D 1fdb3b4c     0 15190  15189 0x00000000
<6>Call Trace:
<6>[a468fb60] [0929effc] 0x929effc (unreliable)
<6>[a468fc20] [8068ea54] __schedule+0x1f4/0x5f0
<6>[a468fc60] [8068ee84] schedule+0x34/0xb0
<6>[a468fc70] [8068f230] schedule_preempt_disabled+0x10/0x20
<6>[a468fc80] [806908a0] __mutex_lock_slowpath+0xa0/0x140
<6>[a468fcc0] [80690994] mutex_lock+0x54/0x70
<6>[a468fcd0] [802237e0] jffs2_reserve_space+0x40/0x310
<6>[a468fd40] [802263b0] jffs2_write_inode_range+0xe0/0x2e0
<6>[a468fda0] [80220c04] jffs2_write_end+0xf4/0x2d0
<6>[a468fde0] [800b62b4] generic_perform_write+0x114/0x200
<6>[a468fe40] [800b772c] __generic_file_write_iter+0x1bc/0x230
<6>[a468fe80] [800b78b0] generic_file_write_iter+0x110/0x300
<6>[a468fea0] [80103cb0] __vfs_write+0xc0/0x130
<6>[a468fef0] [80104660] vfs_write+0xa0/0x1b0
<6>[a468ff10] [8010506c] SyS_write+0x4c/0xd0
<6>[a468ff40] [8000eda0] ret_from_syscall+0x0/0x3c
<6>--- interrupt: c01 at 0x1fdb3b4c
<6>    LR = 0x2021adcc

On 18/07/17 19:17, Richard Weinberger wrote:
> Kyeong Yoo,
> 
> On Mon, Jul 17, 2017 at 11:54 PM, Kyeong Yoo
> <Kyeong.Yoo@alliedtelesis.co.nz> wrote:
>> Hi,
>>
>> Can someone review this and give me any feedback?
>>
>> This patch is to prevent deadlock in jffs2 GC.
>>
>> Thanks,
>> Kyeong
>>
>>
>> On 04/07/17 16:22, Kyeong Yoo wrote:
>>> GC task can deadlock in read_cache_page() because it may attempt
>>> to release a page that is actually allocated by another task in
>>> jffs2_write_begin().
>>> The reason is that in jffs2_write_begin() there is a small window
>>> a cache page is allocated for use but not set Uptodate yet.
>>>
>>> This ends up with a deadlock between two tasks:
>>> 1) A task (e.g. file copy)
>>>      - jffs2_write_begin() locks a cache page
>>>      - jffs2_write_end() tries to lock "alloc_sem" from
>>>         jffs2_reserve_space() <-- STUCK
>>> 2) GC task (jffs2_gcd_mtd3)
>>>      - jffs2_garbage_collect_pass() locks "alloc_sem"
>>>      - try to lock the same cache page in read_cache_page() <-- STUCK
> 
> Your description does not match the code. Are you seeing this on a
> recent mainline kernel?
> Can t be that your kernel does not has this commit?
> commit 157078f64b8a9cd7011b6b900b2f2498df850748
> Author: Thomas Betker <thomas.betker@rohde-schwarz.com>
> Date:   Tue Nov 10 22:18:15 2015 +0100
> 
>      Revert "jffs2: Fix lock acquisition order bug in jffs2_write_begin"
> 
>      This reverts commit 5ffd3412ae55
>      ("jffs2: Fix lock acquisition order bug in jffs2_write_begin").
> 
>

Patch

diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index c12476e309c6..eb4e4d784d26 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -135,20 +135,15 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	struct page *pg;
 	struct inode *inode = mapping->host;
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	pgoff_t index = pos >> PAGE_SHIFT;
 	uint32_t pageofs = index << PAGE_SHIFT;
 	int ret = 0;
 
-	pg = grab_cache_page_write_begin(mapping, index, flags);
-	if (!pg)
-		return -ENOMEM;
-	*pagep = pg;
-
 	jffs2_dbg(1, "%s()\n", __func__);
 
 	if (pageofs > inode->i_size) {
 		/* Make new hole frag from old EOF to new page */
-		struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 		struct jffs2_raw_inode ri;
 		struct jffs2_full_dnode *fn;
 		uint32_t alloc_len;
@@ -159,7 +154,7 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
 					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 		if (ret)
-			goto out_page;
+			goto out_err;
 
 		mutex_lock(&f->sem);
 		memset(&ri, 0, sizeof(ri));
@@ -189,7 +184,7 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 			ret = PTR_ERR(fn);
 			jffs2_complete_reservation(c);
 			mutex_unlock(&f->sem);
-			goto out_page;
+			goto out_err;
 		}
 		ret = jffs2_add_full_dnode_to_inode(c, f, fn);
 		if (f->metadata) {
@@ -204,7 +199,7 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 			jffs2_free_full_dnode(fn);
 			jffs2_complete_reservation(c);
 			mutex_unlock(&f->sem);
-			goto out_page;
+			goto out_err;
 		}
 		jffs2_complete_reservation(c);
 		inode->i_size = pageofs;
@@ -212,6 +207,19 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	}
 
 	/*
+	 * While getting a page and reading data in, lock c->alloc_sem until
+	 * the page is Uptodate. Otherwise GC task may attempt to read the same
+	 * page in read_cache_page(), which causes a deadlock.
+	 */
+	mutex_lock(&c->alloc_sem);
+	pg = grab_cache_page_write_begin(mapping, index, flags);
+	if (!pg) {
+		ret = -ENOMEM;
+		goto release_sem;
+	}
+	*pagep = pg;
+
+	/*
 	 * Read in the page if it wasn't already present. Cannot optimize away
 	 * the whole page write case until jffs2_write_end can handle the
 	 * case of a short-copy.
@@ -220,15 +228,17 @@  static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 		mutex_lock(&f->sem);
 		ret = jffs2_do_readpage_nolock(inode, pg);
 		mutex_unlock(&f->sem);
-		if (ret)
-			goto out_page;
+		if (ret) {
+			unlock_page(pg);
+			put_page(pg);
+			goto release_sem;
+		}
 	}
 	jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
-	return ret;
 
-out_page:
-	unlock_page(pg);
-	put_page(pg);
+release_sem:
+	mutex_unlock(&c->alloc_sem);
+out_err:
 	return ret;
 }