diff mbox

[RFC,2/2] mtd: ubi: wl: avoid erasing a PEB which is empty

Message ID 1448302147-19272-3-git-send-email-bigeasy@linutronix.de
State Superseded
Headers show

Commit Message

Sebastian Andrzej Siewior Nov. 23, 2015, 6:09 p.m. UTC
wear_leveling_worker() currently unconditionally puts a PEB on erase in
the error case even it just been taken from the free_list and never
used.
In case the PEB was never used it can be put back on the free list
saving an erase cycle.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 drivers/mtd/ubi/wl.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

Comments

Richard Weinberger Nov. 23, 2015, 9:30 p.m. UTC | #1
Sebastian,

Am 23.11.2015 um 19:09 schrieb Sebastian Andrzej Siewior:
> wear_leveling_worker() currently unconditionally puts a PEB on erase in
> the error case even it just been taken from the free_list and never
> used.
> In case the PEB was never used it can be put back on the free list
> saving an erase cycle.

Makes sense, thanks for pointing out this optimization!

> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  drivers/mtd/ubi/wl.c | 23 ++++++++++++++++++++---
>  1 file changed, 20 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
> index eb4489f9082f..709ca27e103c 100644
> --- a/drivers/mtd/ubi/wl.c
> +++ b/drivers/mtd/ubi/wl.c
> @@ -631,6 +631,7 @@ static int do_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
>  	return erase_worker(ubi, wl_wrk, 0);
>  }
>  
> +static int ensure_wear_leveling(struct ubi_device *ubi, int nested);
>  /**
>   * wear_leveling_worker - wear-leveling worker function.
>   * @ubi: UBI device description object
> @@ -652,6 +653,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>  #endif
>  	struct ubi_wl_entry *e1, *e2;
>  	struct ubi_vid_hdr *vid_hdr;
> +	int to_leb_clean = 0;

Can we please rename this variable?
I know it means "the `to` LEB is clean". But for everybody else this reads as "I'm going to clean a LEB". ;)

>  	kfree(wrk);
>  	if (shutdown)
> @@ -756,6 +758,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>  
>  	err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0);
>  	if (err && err != UBI_IO_BITFLIPS) {
> +		to_leb_clean = 1;
>  		if (err == UBI_IO_FF) {
>  			/*
>  			 * We are trying to move PEB without a VID header. UBI
> @@ -801,10 +804,12 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>  			 * protection queue.
>  			 */
>  			protect = 1;
> +			to_leb_clean = 1;
>  			goto out_not_moved;
>  		}
>  		if (err == MOVE_RETRY) {
>  			scrubbing = 1;
> +			to_leb_clean = 1;
>  			goto out_not_moved;
>  		}
>  		if (err == MOVE_TARGET_BITFLIPS || err == MOVE_TARGET_WR_ERR ||
> @@ -830,6 +835,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>  					ubi->erroneous_peb_count);
>  				goto out_error;
>  			}
> +			to_leb_clean = 1;
>  			erroneous = 1;
>  			goto out_not_moved;
>  		}
> @@ -900,15 +906,26 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>  		wl_tree_add(e1, &ubi->scrub);
>  	else
>  		wl_tree_add(e1, &ubi->used);
> +	if (to_leb_clean) {
> +		wl_tree_add(e2, &ubi->free);
> +		ubi->free_count++;
> +	}
> +
>  	ubi_assert(!ubi->move_to_put);
>  	ubi->move_from = ubi->move_to = NULL;
>  	ubi->wl_scheduled = 0;
>  	spin_unlock(&ubi->wl_lock);
>  
>  	ubi_free_vid_hdr(ubi, vid_hdr);
> -	err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
> -	if (err)
> -		goto out_ro;
> +	if (to_leb_clean) {
> +		ensure_wear_leveling(ubi, 0);

Why are you rescheduling wear leveling?
And the nested variable has to be set to no-zero as you're calling it from a UBI worker.

> +	} else {
> +		err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
> +		if (err) {
> +			wl_entry_destroy(ubi, e2);

Why that? The erase_worker will free e2 if it encounters
a fatal error and gives up this PEB. You're introducing a double free.

I think you have copy&pasted from:
        err = do_sync_erase(ubi, e1, vol_id, lnum, 0);
        if (err) {
                if (e2)
                        wl_entry_destroy(ubi, e2);
                goto out_ro;
        }

...which looks wrong too. I'll double check tomorrow with a fresh brain.

Thanks,
//richard
Richard Weinberger Nov. 23, 2015, 9:50 p.m. UTC | #2
Am 23.11.2015 um 22:30 schrieb Richard Weinberger:
> Sebastian,
> 
> Am 23.11.2015 um 19:09 schrieb Sebastian Andrzej Siewior:
>> wear_leveling_worker() currently unconditionally puts a PEB on erase in
>> the error case even it just been taken from the free_list and never
>> used.
>> In case the PEB was never used it can be put back on the free list
>> saving an erase cycle.
> 
> Makes sense, thanks for pointing out this optimization!
> 
>> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
>> ---
>>  drivers/mtd/ubi/wl.c | 23 ++++++++++++++++++++---
>>  1 file changed, 20 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
>> index eb4489f9082f..709ca27e103c 100644
>> --- a/drivers/mtd/ubi/wl.c
>> +++ b/drivers/mtd/ubi/wl.c
>> @@ -631,6 +631,7 @@ static int do_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
>>  	return erase_worker(ubi, wl_wrk, 0);
>>  }
>>  
>> +static int ensure_wear_leveling(struct ubi_device *ubi, int nested);
>>  /**
>>   * wear_leveling_worker - wear-leveling worker function.
>>   * @ubi: UBI device description object
>> @@ -652,6 +653,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>>  #endif
>>  	struct ubi_wl_entry *e1, *e2;
>>  	struct ubi_vid_hdr *vid_hdr;
>> +	int to_leb_clean = 0;
> 
> Can we please rename this variable?
> I know it means "the `to` LEB is clean". But for everybody else this reads as "I'm going to clean a LEB". ;)
> 
>>  	kfree(wrk);
>>  	if (shutdown)
>> @@ -756,6 +758,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>>  
>>  	err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0);
>>  	if (err && err != UBI_IO_BITFLIPS) {
>> +		to_leb_clean = 1;
>>  		if (err == UBI_IO_FF) {
>>  			/*
>>  			 * We are trying to move PEB without a VID header. UBI
>> @@ -801,10 +804,12 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>>  			 * protection queue.
>>  			 */
>>  			protect = 1;
>> +			to_leb_clean = 1;
>>  			goto out_not_moved;
>>  		}
>>  		if (err == MOVE_RETRY) {
>>  			scrubbing = 1;
>> +			to_leb_clean = 1;
>>  			goto out_not_moved;
>>  		}
>>  		if (err == MOVE_TARGET_BITFLIPS || err == MOVE_TARGET_WR_ERR ||
>> @@ -830,6 +835,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>>  					ubi->erroneous_peb_count);
>>  				goto out_error;
>>  			}
>> +			to_leb_clean = 1;
>>  			erroneous = 1;
>>  			goto out_not_moved;
>>  		}
>> @@ -900,15 +906,26 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>>  		wl_tree_add(e1, &ubi->scrub);
>>  	else
>>  		wl_tree_add(e1, &ubi->used);
>> +	if (to_leb_clean) {
>> +		wl_tree_add(e2, &ubi->free);
>> +		ubi->free_count++;
>> +	}
>> +
>>  	ubi_assert(!ubi->move_to_put);
>>  	ubi->move_from = ubi->move_to = NULL;
>>  	ubi->wl_scheduled = 0;
>>  	spin_unlock(&ubi->wl_lock);
>>  
>>  	ubi_free_vid_hdr(ubi, vid_hdr);
>> -	err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
>> -	if (err)
>> -		goto out_ro;
>> +	if (to_leb_clean) {
>> +		ensure_wear_leveling(ubi, 0);
> 
> Why are you rescheduling wear leveling?
> And the nested variable has to be set to no-zero as you're calling it from a UBI worker.
> 
>> +	} else {
>> +		err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
>> +		if (err) {
>> +			wl_entry_destroy(ubi, e2);
> 
> Why that? The erase_worker will free e2 if it encounters
> a fatal error and gives up this PEB. You're introducing a double free.
> 
> I think you have copy&pasted from:
>         err = do_sync_erase(ubi, e1, vol_id, lnum, 0);
>         if (err) {
>                 if (e2)
>                         wl_entry_destroy(ubi, e2);
>                 goto out_ro;
>         }
> 
> ...which looks wrong too. I'll double check tomorrow with a fresh brain.

Okay. That one is fine, as we switch to ro mode anyway...

Thanks,
//richard
Sebastian Andrzej Siewior Nov. 24, 2015, 8:26 a.m. UTC | #3
On 11/23/2015 10:30 PM, Richard Weinberger wrote:
>> diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
>> index eb4489f9082f..709ca27e103c 100644
>> --- a/drivers/mtd/ubi/wl.c
>> +++ b/drivers/mtd/ubi/wl.c
>> @@ -652,6 +653,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>>  #endif
>>  	struct ubi_wl_entry *e1, *e2;
>>  	struct ubi_vid_hdr *vid_hdr;
>> +	int to_leb_clean = 0;
> 
> Can we please rename this variable?
> I know it means "the `to` LEB is clean". But for everybody else this reads as "I'm going to clean a LEB". ;)

such a shame that you can't make files RO. Any suggestions? dst_clean ?

>>  	kfree(wrk);
>>  	if (shutdown)
>> @@ -900,15 +906,26 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>>  		wl_tree_add(e1, &ubi->scrub);
>>  	else
>>  		wl_tree_add(e1, &ubi->used);
>> +	if (to_leb_clean) {
>> +		wl_tree_add(e2, &ubi->free);
>> +		ubi->free_count++;
>> +	}
>> +
>>  	ubi_assert(!ubi->move_to_put);
>>  	ubi->move_from = ubi->move_to = NULL;
>>  	ubi->wl_scheduled = 0;
>>  	spin_unlock(&ubi->wl_lock);
>>  
>>  	ubi_free_vid_hdr(ubi, vid_hdr);
>> -	err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
>> -	if (err)
>> -		goto out_ro;
>> +	if (to_leb_clean) {
>> +		ensure_wear_leveling(ubi, 0);
> 
> Why are you rescheduling wear leveling?

Because we had it pending and we didn't do anything. If I don't
schedule it would be deferred until another LEB is going to be deleted.
Before the change it happens after do_sync_erase() work job completes
but now we add it back to the free list.

> And the nested variable has to be set to no-zero as you're calling it from a UBI worker.
Ah, okay. I've been looking at it and got wrong then. As I said in my
cover mail, this was forwarded ported.

> 
>> +	} else {
>> +		err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
>> +		if (err) {
>> +			wl_entry_destroy(ubi, e2);
> 
> Why that? The erase_worker will free e2 if it encounters
> a fatal error and gives up this PEB. You're introducing a double free.

Hmmm. That is real bad error handling you have there. So you invoke
do_sync_erase(), the kmalloc() fails and how exactly you free e2 here?
You don't so I did what I did. But then if this succeeds but
erase_worker() fails then we have a double free here. Indeed.

> I think you have copy&pasted from:
>         err = do_sync_erase(ubi, e1, vol_id, lnum, 0);
>         if (err) {
>                 if (e2)
>                         wl_entry_destroy(ubi, e2);
>                 goto out_ro;
>         }
> 
> ...which looks wrong too. I'll double check tomorrow with a fresh brain.
> 
> Thanks,
> //richard

Sebastian
Richard Weinberger Nov. 24, 2015, 8:39 a.m. UTC | #4
Am 24.11.2015 um 09:26 schrieb Sebastian Andrzej Siewior:
> On 11/23/2015 10:30 PM, Richard Weinberger wrote:
>>> diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
>>> index eb4489f9082f..709ca27e103c 100644
>>> --- a/drivers/mtd/ubi/wl.c
>>> +++ b/drivers/mtd/ubi/wl.c
>>> @@ -652,6 +653,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>>>  #endif
>>>  	struct ubi_wl_entry *e1, *e2;
>>>  	struct ubi_vid_hdr *vid_hdr;
>>> +	int to_leb_clean = 0;
>>
>> Can we please rename this variable?
>> I know it means "the `to` LEB is clean". But for everybody else this reads as "I'm going to clean a LEB". ;)
> 
> such a shame that you can't make files RO. Any suggestions? dst_clean ?

Files RO? I don't get it.

dst_clean is good. :-)

> 
>>>  	kfree(wrk);
>>>  	if (shutdown)
>>> @@ -900,15 +906,26 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
>>>  		wl_tree_add(e1, &ubi->scrub);
>>>  	else
>>>  		wl_tree_add(e1, &ubi->used);
>>> +	if (to_leb_clean) {
>>> +		wl_tree_add(e2, &ubi->free);
>>> +		ubi->free_count++;
>>> +	}
>>> +
>>>  	ubi_assert(!ubi->move_to_put);
>>>  	ubi->move_from = ubi->move_to = NULL;
>>>  	ubi->wl_scheduled = 0;
>>>  	spin_unlock(&ubi->wl_lock);
>>>  
>>>  	ubi_free_vid_hdr(ubi, vid_hdr);
>>> -	err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
>>> -	if (err)
>>> -		goto out_ro;
>>> +	if (to_leb_clean) {
>>> +		ensure_wear_leveling(ubi, 0);
>>
>> Why are you rescheduling wear leveling?
> 
> Because we had it pending and we didn't do anything. If I don't
> schedule it would be deferred until another LEB is going to be deleted.
> Before the change it happens after do_sync_erase() work job completes
> but now we add it back to the free list.

Makes sense.

>> And the nested variable has to be set to no-zero as you're calling it from a UBI worker.
> Ah, okay. I've been looking at it and got wrong then. As I said in my
> cover mail, this was forwarded ported.
> 
>>
>>> +	} else {
>>> +		err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
>>> +		if (err) {
>>> +			wl_entry_destroy(ubi, e2);
>>
>> Why that? The erase_worker will free e2 if it encounters
>> a fatal error and gives up this PEB. You're introducing a double free.
> 
> Hmmm. That is real bad error handling you have there. So you invoke
> do_sync_erase(), the kmalloc() fails and how exactly you free e2 here?

Why do you want to free e2? We free an erase entry only if we give
it up. wear leveling entries are allocated at init time and destroyed
when you detach UBI.

Thanks,
//richard
Sebastian Andrzej Siewior Nov. 24, 2015, 8:42 a.m. UTC | #5
On 11/24/2015 09:39 AM, Richard Weinberger wrote:
>>>> +	} else {
>>>> +		err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
>>>> +		if (err) {
>>>> +			wl_entry_destroy(ubi, e2);
>>>
>>> Why that? The erase_worker will free e2 if it encounters
>>> a fatal error and gives up this PEB. You're introducing a double free.
>>
>> Hmmm. That is real bad error handling you have there. So you invoke
>> do_sync_erase(), the kmalloc() fails and how exactly you free e2 here?
> 
> Why do you want to free e2? We free an erase entry only if we give
> it up. wear leveling entries are allocated at init time and destroyed
> when you detach UBI.

The reference to it in the RB-tree (free) was removed. Is there another
reference to it?

> Thanks,
> //richard
> 

Sebastian
Richard Weinberger Nov. 24, 2015, 9:02 a.m. UTC | #6
Am 24.11.2015 um 09:42 schrieb Sebastian Andrzej Siewior:
> On 11/24/2015 09:39 AM, Richard Weinberger wrote:
>>>>> +	} else {
>>>>> +		err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
>>>>> +		if (err) {
>>>>> +			wl_entry_destroy(ubi, e2);
>>>>
>>>> Why that? The erase_worker will free e2 if it encounters
>>>> a fatal error and gives up this PEB. You're introducing a double free.
>>>
>>> Hmmm. That is real bad error handling you have there. So you invoke
>>> do_sync_erase(), the kmalloc() fails and how exactly you free e2 here?
>>
>> Why do you want to free e2? We free an erase entry only if we give
>> it up. wear leveling entries are allocated at init time and destroyed
>> when you detach UBI.
> 
> The reference to it in the RB-tree (free) was removed. Is there another
> reference to it?

UBI supports only single references.
Everything else would be a bug.

That said, I agree that the error handling in the wear_leveling_worker()
can be improved.
Especially as currently an error while do_sync_erase() puts UBI into read-only
mode and cleanup is skipped as we're in read-only anyway then.
Would you volunteer? :-)

Thanks,
//richard
Sebastian Andrzej Siewior Nov. 24, 2015, 9:07 a.m. UTC | #7
On 11/24/2015 10:02 AM, Richard Weinberger wrote:
> Am 24.11.2015 um 09:42 schrieb Sebastian Andrzej Siewior:
>> On 11/24/2015 09:39 AM, Richard Weinberger wrote:
>>>>>> +	} else {
>>>>>> +		err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
>>>>>> +		if (err) {
>>>>>> +			wl_entry_destroy(ubi, e2);
>>>>>
>>>>> Why that? The erase_worker will free e2 if it encounters
>>>>> a fatal error and gives up this PEB. You're introducing a double free.
>>>>
>>>> Hmmm. That is real bad error handling you have there. So you invoke
>>>> do_sync_erase(), the kmalloc() fails and how exactly you free e2 here?
>>>
>>> Why do you want to free e2? We free an erase entry only if we give
>>> it up. wear leveling entries are allocated at init time and destroyed
>>> when you detach UBI.
>>
>> The reference to it in the RB-tree (free) was removed. Is there another
>> reference to it?
> 
> UBI supports only single references.
> Everything else would be a bug.

So if there is no reference to e2 which was just removed from the
RB-tree free and do_sync_erase() can't kmalloc() then we leak e2,
correct?

> Thanks,
> //richard
> 
Sebastian
Richard Weinberger Nov. 24, 2015, 9:16 a.m. UTC | #8
Am 24.11.2015 um 10:07 schrieb Sebastian Andrzej Siewior:
> On 11/24/2015 10:02 AM, Richard Weinberger wrote:
>> Am 24.11.2015 um 09:42 schrieb Sebastian Andrzej Siewior:
>>> On 11/24/2015 09:39 AM, Richard Weinberger wrote:
>>>>>>> +	} else {
>>>>>>> +		err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
>>>>>>> +		if (err) {
>>>>>>> +			wl_entry_destroy(ubi, e2);
>>>>>>
>>>>>> Why that? The erase_worker will free e2 if it encounters
>>>>>> a fatal error and gives up this PEB. You're introducing a double free.
>>>>>
>>>>> Hmmm. That is real bad error handling you have there. So you invoke
>>>>> do_sync_erase(), the kmalloc() fails and how exactly you free e2 here?
>>>>
>>>> Why do you want to free e2? We free an erase entry only if we give
>>>> it up. wear leveling entries are allocated at init time and destroyed
>>>> when you detach UBI.
>>>
>>> The reference to it in the RB-tree (free) was removed. Is there another
>>> reference to it?
>>
>> UBI supports only single references.
>> Everything else would be a bug.
> 
> So if there is no reference to e2 which was just removed from the
> RB-tree free and do_sync_erase() can't kmalloc() then we leak e2,
> correct?

Yes, you are right. That definitely needs improvement.

A possible solution would be iterating over ubi->lookuptbl upon
detach time and call wl_entry_destroy() on every non-NULL entry.

...or rework do_sync_erase(), currently it calls the erase worker directly.
The erase worker destroys a failed wl entry upon failure. But from the return code
of do_sync_erase() we cannot know whether the erase worker destroyed it already.

Thanks,
//richard
Artem Bityutskiy Nov. 24, 2015, 12:58 p.m. UTC | #9
On Mon, 2015-11-23 at 19:09 +0100, Sebastian Andrzej Siewior wrote:
> wear_leveling_worker() currently unconditionally puts a PEB on erase
> in
> the error case even it just been taken from the free_list and never
> used.
> In case the PEB was never used it can be put back on the free list
> saving an erase cycle.

Did you think about putting LEBs like that to the protection queue
instead of playing tricks with scheduler?

The protection queue is there in order to protect eraseblocks from the
wear-leveling subsystem (not the best choice of words, but terminology
could be improved separately)

And this is what you need - you want the wear-leveling subsystem to
leave the eraseblock alone for some time, right?

The protection queue uses the erase cycles counts instead of the actual
time, but this seems OK for the situation you described.

Just to remind why this protection queue exists - when the WL subsystem
gives an eraseblock to the user, this may be an eraseblock with a high
erase counter, and it may be a candidate for being moved, the WL
subsystem just did not have a chance to do this yet. But if we give the
eraseblock to the user, the user will probably write something there
soon, and we do not want the WL subsystem to initiate data relocation
while the user is writing the data there. Instead, we want to wait a
little, and then move the data in background without interfering with
the user.

Back to my idea - what if you add the MOVE_RETRY eraseblocks to the
protection queue. May be not to the tail, may be to the head.
Sebastian Andrzej Siewior Nov. 24, 2015, 1:33 p.m. UTC | #10
On 11/24/2015 01:58 PM, Artem Bityutskiy wrote:

Hello Artem,

> On Mon, 2015-11-23 at 19:09 +0100, Sebastian Andrzej Siewior wrote:
>> wear_leveling_worker() currently unconditionally puts a PEB on erase
>> in
>> the error case even it just been taken from the free_list and never
>> used.
>> In case the PEB was never used it can be put back on the free list
>> saving an erase cycle.
> 
> Did you think about putting LEBs like that to the protection queue
> instead of playing tricks with scheduler?

Why am I playing tricks with the scheduler?

> The protection queue is there in order to protect eraseblocks from the
> wear-leveling subsystem (not the best choice of words, but terminology
> could be improved separately)
> 
> And this is what you need - you want the wear-leveling subsystem to
> leave the eraseblock alone for some time, right?

The empty eraseblock is not the problem. The src-LEB is the problem
because it can not be moved due to lock contention. Ideally I would do
nothing here (except putting it back to the free list) and on unlock of
the SRC-LEB it would trigger a wear-leveling cycle.

> The protection queue uses the erase cycles counts instead of the actual
> time, but this seems OK for the situation you described.
> 
> Just to remind why this protection queue exists - when the WL subsystem
> gives an eraseblock to the user, this may be an eraseblock with a high
> erase counter, and it may be a candidate for being moved, the WL
> subsystem just did not have a chance to do this yet. But if we give the
> eraseblock to the user, the user will probably write something there
> soon, and we do not want the WL subsystem to initiate data relocation
> while the user is writing the data there. Instead, we want to wait a
> little, and then move the data in background without interfering with
> the user.
> 
> Back to my idea - what if you add the MOVE_RETRY eraseblocks to the
> protection queue. May be not to the tail, may be to the head.

Hmm. About which erase blocks are you talking about? The e1 which is
the src EB and will be relocated _or_ the e2 which is the destination
EB where the data will be written to?
From what you explain it does not make sense to put e2 on the protect
list. I just try to save here an erase cycle here.
e1 on the other hand might be a candidate.

So e1 has a low EC value and WL-subsystem decides to move it to a an
empty block with a high EC value. This fails due to MOVE_RETRY.
I add e2 on to free-RB-tree, e1 on the protect-list. No
ensure_wear_leveling() invocation. What will happen next? When will e1
be removed from the protection list?

Sebastian
Artem Bityutskiy Nov. 24, 2015, 1:40 p.m. UTC | #11
On Tue, 2015-11-24 at 14:33 +0100, Sebastian Andrzej Siewior wrote:
> What will happen next? When will e1
> be removed from the protection list?

If my memory still serves me, the answer is: roughly speaking, after a
number of erase operation, which is currently defined as

/*
 * Length of the protection queue. The length is effectively equivalent t o the
 * number of (global) erase cycles PEBs are protected from the wear-leveling
 * worker.
 */
#define UBI_PROT_QUEUE_LEN 10

But if you put it to the head of the protection queue, it'll be removed
from there as soon as any LEB is "put", which effectively means
"scheduled for erasure".

Artem.
Artem Bityutskiy Nov. 24, 2015, 1:57 p.m. UTC | #12
A follow-up...

On Tue, 2015-11-24 at 14:33 +0100, Sebastian Andrzej Siewior wrote:
> > Did you think about putting LEBs like that to the protection queue
> > instead of playing tricks with scheduler?
> 
> Why am I playing tricks with the scheduler?

I should have replied the "1/2" e-mail, not this one, sorry. Because
this is what you seem to try ding in 1/2.

> Hmm. About which erase blocks are you talking about? The e1 which is
> the src EB and will be relocated _or_ the e2 which is the destination

I think the one which is currently unavailable. Or may be even both. I
suggest you to consider pros an cons :-)

> From what you explain it does not make sense to put e2 on the protect
> list. I just try to save here an erase cycle here.

I think if you put to the head of the PQ, that will be it.
Sebastian Andrzej Siewior Nov. 26, 2015, 2:56 p.m. UTC | #13
On 11/24/2015 01:58 PM, Artem Bityutskiy wrote:
> On Mon, 2015-11-23 at 19:09 +0100, Sebastian Andrzej Siewior wrote:
> Back to my idea - what if you add the MOVE_RETRY eraseblocks to the
> protection queue. May be not to the tail, may be to the head.

I've been thinking about this. The protected list protects EBs from the
WL worker. There are 10 slots and after one successful erase process
all EB from this slot will be moved back to ->used list. So e2 can't be
placed there because it belongs on the ->free list.

e1 could be added there so it is protected from the WL worker until at
least one erase cycle happened. This is no guarantee that MOVE_RETRY
does not happen again but an optimistic try :)

However once the protection time ends, it won't be added to the ->scrub
list but the ->used list instead. The difference is that if it has an
average number of erase cycles it won't be considered by the WL worker
any time soon. So if it was added to the ->scrub list because a bitflip
was detected on read then you lose this information.
For that reason I think it is not worth putting it on the protected
list.

Sebastian
diff mbox

Patch

diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
index eb4489f9082f..709ca27e103c 100644
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -631,6 +631,7 @@  static int do_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
 	return erase_worker(ubi, wl_wrk, 0);
 }
 
+static int ensure_wear_leveling(struct ubi_device *ubi, int nested);
 /**
  * wear_leveling_worker - wear-leveling worker function.
  * @ubi: UBI device description object
@@ -652,6 +653,7 @@  static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
 #endif
 	struct ubi_wl_entry *e1, *e2;
 	struct ubi_vid_hdr *vid_hdr;
+	int to_leb_clean = 0;
 
 	kfree(wrk);
 	if (shutdown)
@@ -756,6 +758,7 @@  static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
 
 	err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0);
 	if (err && err != UBI_IO_BITFLIPS) {
+		to_leb_clean = 1;
 		if (err == UBI_IO_FF) {
 			/*
 			 * We are trying to move PEB without a VID header. UBI
@@ -801,10 +804,12 @@  static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
 			 * protection queue.
 			 */
 			protect = 1;
+			to_leb_clean = 1;
 			goto out_not_moved;
 		}
 		if (err == MOVE_RETRY) {
 			scrubbing = 1;
+			to_leb_clean = 1;
 			goto out_not_moved;
 		}
 		if (err == MOVE_TARGET_BITFLIPS || err == MOVE_TARGET_WR_ERR ||
@@ -830,6 +835,7 @@  static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
 					ubi->erroneous_peb_count);
 				goto out_error;
 			}
+			to_leb_clean = 1;
 			erroneous = 1;
 			goto out_not_moved;
 		}
@@ -900,15 +906,26 @@  static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
 		wl_tree_add(e1, &ubi->scrub);
 	else
 		wl_tree_add(e1, &ubi->used);
+	if (to_leb_clean) {
+		wl_tree_add(e2, &ubi->free);
+		ubi->free_count++;
+	}
+
 	ubi_assert(!ubi->move_to_put);
 	ubi->move_from = ubi->move_to = NULL;
 	ubi->wl_scheduled = 0;
 	spin_unlock(&ubi->wl_lock);
 
 	ubi_free_vid_hdr(ubi, vid_hdr);
-	err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
-	if (err)
-		goto out_ro;
+	if (to_leb_clean) {
+		ensure_wear_leveling(ubi, 0);
+	} else {
+		err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
+		if (err) {
+			wl_entry_destroy(ubi, e2);
+			goto out_ro;
+		}
+	}
 
 	mutex_unlock(&ubi->move_mutex);
 	return 0;