Patchwork [v2] MTD: Retry Read/Write Transfer Buffer Allocations

login
register
mail settings
Submitter Grant Erickson
Date April 4, 2011, 6:19 p.m.
Message ID <1301941174-10050-1-git-send-email-marathon96@gmail.com>
Download mbox | patch
Permalink /patch/89683/
State New
Headers show

Comments

Grant Erickson - April 4, 2011, 6:19 p.m.
When handling user space read or write requests via mtd_{read,write},
exponentially back off on the size of the requested kernel transfer
buffer until it succeeds or until the requested transfer buffer size
falls below the page size.

This helps ensure the operation can succeed under low-memory,
highly-fragmented situations albeit somewhat more slowly.

  v2: Added __GFP_NOWARN flag and made common retry loop a function
      as recommended by Artem.

Signed-off-by: Grant Erickson <marathon96@gmail.com>
---
 drivers/mtd/mtdchar.c |   66 +++++++++++++++++++++++++++++++++---------------
 1 files changed, 45 insertions(+), 21 deletions(-)

1.7.4.2
Artem Bityutskiy - April 5, 2011, 4:39 a.m.
Hi,

On Mon, 2011-04-04 at 11:19 -0700, Grant Erickson wrote:
> When handling user space read or write requests via mtd_{read,write},
> exponentially back off on the size of the requested kernel transfer
> buffer until it succeeds or until the requested transfer buffer size
> falls below the page size.
> 
> This helps ensure the operation can succeed under low-memory,
> highly-fragmented situations albeit somewhat more slowly.
> 
>   v2: Added __GFP_NOWARN flag and made common retry loop a function
>       as recommended by Artem.
> 
> Signed-off-by: Grant Erickson <marathon96@gmail.com>
> ---
>  drivers/mtd/mtdchar.c |   66 +++++++++++++++++++++++++++++++++---------------
>  1 files changed, 45 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
> index 145b3d0d..df9be51 100644
> --- a/drivers/mtd/mtdchar.c
> +++ b/drivers/mtd/mtdchar.c
> @@ -166,11 +166,44 @@ static int mtd_close(struct inode *inode, struct file *file)
>  	return 0;
>  } /* mtd_close */
>  
> -/* FIXME: This _really_ needs to die. In 2.5, we should lock the
> -   userspace buffer down and use it directly with readv/writev.
> -*/
> +/* Back in April 2005, Linus wrote:
> + * 
> + *   FIXME: This _really_ needs to die. In 2.5, we should lock the
> + *   userspace buffer down and use it directly with readv/writev.
> + *
> + * The implementation below, using mtd_try_alloc, mitigates allocation
> + * failures when the sytem is under low-memory situations or if memory

s/sytem/system/

> + * is highly fragmented at the cost of reducing the performance of the
> + * requested transfer due to a smaller buffer size.
> + *
> + * A more complex but more memory-efficient implementation based on
> + * get_user_pages and iovecs to cover extents of those pages is a
> + * longer-term goal, as intimated by Linus above. However, for the
> + * write case, this requires yet more complex head and tail transfer
> + * handling when those head and tail offsets and sizes are such that
> + * alignment requirements are not met in the NAND subdriver.
> + */
>  #define MAX_KMALLOC_SIZE 0x20000
>  
> +static void *mtd_try_alloc(size_t *size)
> +{
> +	const gfp_t flags = (GFP_KERNEL | __GFP_NOWARN);

I still think you'll damage the performance when you try to do

kmalloc(128KiB, flags)

because as I wrote in my previous e-mail your system will start doing
the following to free memory for you:

1. write-back dirty FS data = overall slowdown = e.g., background mp3
   playback glitches
2. drop FS caches = slow down later because the system will have to
   re-read the dropped data from the media later.
3. not really sure, needs checking if this is the case, but I think
   the kernel may start swapping out apps.

This is why I suggested to use the following flags here:

	gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;

> +	size_t try;
> +	void *kbuf;
> +
> +	try = min_t(size_t, *size, MAX_KMALLOC_SIZE);
> +
> +	do {
> +		kbuf = kmalloc(try, flags);
> +	} while (!kbuf && ((try >>= 1) >= PAGE_SIZE));

So, you try 128KiB, 64KiB, 32KiB, 16KiB, 8KiB and fail, it is OK. But
4KiB is the last resort allocation. If it fails, you do want to see
scary kmalloc warning, so you should not use __GFP_NOWARN for this last
allocation. Also, you do want kmalloc to try hard, so for this last
PAGE_SIZE allocation you want to use GFP_KERNEL flags.

> +
> +	if (kbuf) {
> +		*size = try;
> +	}

Braces are not necessary here. But actually the whole if is not needed -
just make the function interface so that if it returns NULL then *size
is undefined and the user of this function should not look at it. I
think it is the case in your code.

I mean, just 

	*size = try;
	return kbuf;

> +
> +	return kbuf;
> +}
> +
>  static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t *ppos)
>  {
>  	struct mtd_file_info *mfi = file->private_data;
> @@ -179,6 +212,7 @@ static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t
>  	size_t total_retlen=0;
>  	int ret=0;
>  	int len;
> +	size_t size;
>  	char *kbuf;
>  
>  	DEBUG(MTD_DEBUG_LEVEL0,"MTD_read\n");
> @@ -189,23 +223,16 @@ static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t
>  	if (!count)
>  		return 0;
>  
> -	/* FIXME: Use kiovec in 2.5 to lock down the user's buffers
> -	   and pass them directly to the MTD functions */
> +	size = count;
I think you can do this assignment when you declare 'size';

>  
> -	if (count > MAX_KMALLOC_SIZE)
> -		kbuf=kmalloc(MAX_KMALLOC_SIZE, GFP_KERNEL);
> -	else
> -		kbuf=kmalloc(count, GFP_KERNEL);
> +	kbuf = mtd_try_alloc(&size);
>  
>  	if (!kbuf)
>  		return -ENOMEM;

No need to put extra new lines, too many of them make the code less
readable. I think allocating and checking should have not space in
between.

>  
>  	while (count) {
>  
> -		if (count > MAX_KMALLOC_SIZE)
> -			len = MAX_KMALLOC_SIZE;
> -		else
> -			len = count;
Please, kill the extra white-space after "while" as well.

> +		len = min_t(size_t, count, size);
>  
>  		switch (mfi->mode) {
>  		case MTD_MODE_OTP_FACTORY:
> @@ -268,6 +295,7 @@ static ssize_t mtd_write(struct file *file, const char __user *buf, size_t count
>  {
>  	struct mtd_file_info *mfi = file->private_data;
>  	struct mtd_info *mtd = mfi->mtd;
> +	size_t size;
>  	char *kbuf;
>  	size_t retlen;
>  	size_t total_retlen=0;
> @@ -285,21 +313,16 @@ static ssize_t mtd_write(struct file *file, const char __user *buf, size_t count
>  	if (!count)
>  		return 0;
>  
> -	if (count > MAX_KMALLOC_SIZE)
> -		kbuf=kmalloc(MAX_KMALLOC_SIZE, GFP_KERNEL);
> -	else
> -		kbuf=kmalloc(count, GFP_KERNEL);
> +	size = count;
> +
> +	kbuf = mtd_try_alloc(&size);
>  
>  	if (!kbuf)
>  		return -ENOMEM;
>  
>  	while (count) {
>  
> -		if (count > MAX_KMALLOC_SIZE)
> -			len = MAX_KMALLOC_SIZE;
> -		else
> -			len = count;
> +		len = min_t(size_t, count, size);
>  
>  		if (copy_from_user(kbuf, buf, len)) {
>  			kfree(kbuf);

Similar requests for this "symmetric" piece of code.
Artem Bityutskiy - April 5, 2011, 4:48 a.m.
On Mon, 2011-04-04 at 11:19 -0700, Grant Erickson wrote:
> When handling user space read or write requests via mtd_{read,write},
> exponentially back off on the size of the requested kernel transfer
> buffer until it succeeds or until the requested transfer buffer size
> falls below the page size.
> 
> This helps ensure the operation can succeed under low-memory,
> highly-fragmented situations albeit somewhat more slowly.
> 
>   v2: Added __GFP_NOWARN flag and made common retry loop a function
>       as recommended by Artem.
> 
> Signed-off-by: Grant Erickson <marathon96@gmail.com>
> ---
>  drivers/mtd/mtdchar.c |   66 +++++++++++++++++++++++++++++++++---------------
>  1 files changed, 45 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
> index 145b3d0d..df9be51 100644
> --- a/drivers/mtd/mtdchar.c
> +++ b/drivers/mtd/mtdchar.c
> @@ -166,11 +166,44 @@ static int mtd_close(struct inode *inode, struct file *file)
>  	return 0;
>  } /* mtd_close */
>  
> -/* FIXME: This _really_ needs to die. In 2.5, we should lock the
> -   userspace buffer down and use it directly with readv/writev.
> -*/
> +/* Back in April 2005, Linus wrote:
> + * 
> + *   FIXME: This _really_ needs to die. In 2.5, we should lock the
> + *   userspace buffer down and use it directly with readv/writev.
> + *
> + * The implementation below, using mtd_try_alloc, mitigates allocation
> + * failures when the sytem is under low-memory situations or if memory
> + * is highly fragmented at the cost of reducing the performance of the
> + * requested transfer due to a smaller buffer size.
> + *
> + * A more complex but more memory-efficient implementation based on
> + * get_user_pages and iovecs to cover extents of those pages is a
> + * longer-term goal, as intimated by Linus above. However, for the
> + * write case, this requires yet more complex head and tail transfer
> + * handling when those head and tail offsets and sizes are such that
> + * alignment requirements are not met in the NAND subdriver.
> + */
>  #define MAX_KMALLOC_SIZE 0x20000
>  
> +static void *mtd_try_alloc(size_t *size)

Also, if you do the changes I request and make this function allow scary
kmalloc warnings on the last resort PAGE_SIZE allocation, the "try" in
the function name becomes not very appropriate, because in the kernel
APIs it is usually used for something like "try, if did not succeed, no
worry, just return". E.g., mutex_try_lock() or something.

I think it is better to name it mtd_alloc or something like this, but
without "try".

And probably you want to reuse this function in JFFS2, so we should give
it some name which is good for exported API function. May be
mtd_alloc_upto() ? Or mtd_alloc_as_much() ? Or better ideas?
Grant Erickson - April 5, 2011, 3:54 p.m.
Artem:

Thanks for the quick turnaround in feedback. Please see inline below.

On 4/4/11 9:39 PM, Artem Bityutskiy wrote:
> On Mon, 2011-04-04 at 11:19 -0700, Grant Erickson wrote:
>> + * is highly fragmented at the cost of reducing the performance of the
>> + * requested transfer due to a smaller buffer size.
>> + *
>> + * A more complex but more memory-efficient implementation based on
>> + * get_user_pages and iovecs to cover extents of those pages is a
>> + * longer-term goal, as intimated by Linus above. However, for the
>> + * write case, this requires yet more complex head and tail transfer
>> + * handling when those head and tail offsets and sizes are such that
>> + * alignment requirements are not met in the NAND subdriver.
>> + */
>>  #define MAX_KMALLOC_SIZE 0x20000
>>  
>> +static void *mtd_try_alloc(size_t *size)
>> +{
>> + const gfp_t flags = (GFP_KERNEL | __GFP_NOWARN);
> 
> I still think you'll damage the performance when you try to do
> 
> kmalloc(128KiB, flags)
> 
> because as I wrote in my previous e-mail your system will start doing
> the following to free memory for you:
> 
> 1. write-back dirty FS data = overall slowdown = e.g., background mp3
>    playback glitches
> 2. drop FS caches = slow down later because the system will have to
>    re-read the dropped data from the media later.
> 3. not really sure, needs checking if this is the case, but I think
>    the kernel may start swapping out apps.
> 
> This is why I suggested to use the following flags here:
> 
> gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;

On my system (64 MiB RAM, 256 MiB Flash), there is no swap and under these
allocation conditions for jffs2_scan_medium, mtd_read or mtd_write, I don't
see the kernel doing (1), (2) or (3).

My impression is that the above behaviors are only activated when a swap
store exists and, in general, most systems using JFFS2 and MTD do not have
swap.

Regardless, adding the additional flags should not be detrimental for
systems with no swap and, it sounds like, helpful for systems with it.

Regarding the suggestion of mtd_alloc_upto() or mtd_alloc_as_much(), are you
OK exporting these from mtdchar.c or would you rather they be moved to and
exported from mtdcore.c?

Stay tuned for v3. 

-Grant
Artem Bityutskiy - April 5, 2011, 4:54 p.m.
On Tue, 2011-04-05 at 08:54 -0700, Grant Erickson wrote:
> Artem:
> 
> Thanks for the quick turnaround in feedback. Please see inline below.
> 
> On 4/4/11 9:39 PM, Artem Bityutskiy wrote:
> > On Mon, 2011-04-04 at 11:19 -0700, Grant Erickson wrote:
> >> + * is highly fragmented at the cost of reducing the performance of the
> >> + * requested transfer due to a smaller buffer size.
> >> + *
> >> + * A more complex but more memory-efficient implementation based on
> >> + * get_user_pages and iovecs to cover extents of those pages is a
> >> + * longer-term goal, as intimated by Linus above. However, for the
> >> + * write case, this requires yet more complex head and tail transfer
> >> + * handling when those head and tail offsets and sizes are such that
> >> + * alignment requirements are not met in the NAND subdriver.
> >> + */
> >>  #define MAX_KMALLOC_SIZE 0x20000
> >>  
> >> +static void *mtd_try_alloc(size_t *size)
> >> +{
> >> + const gfp_t flags = (GFP_KERNEL | __GFP_NOWARN);
> > 
> > I still think you'll damage the performance when you try to do
> > 
> > kmalloc(128KiB, flags)
> > 
> > because as I wrote in my previous e-mail your system will start doing
> > the following to free memory for you:
> > 
> > 1. write-back dirty FS data = overall slowdown = e.g., background mp3
> >    playback glitches
> > 2. drop FS caches = slow down later because the system will have to
> >    re-read the dropped data from the media later.
> > 3. not really sure, needs checking if this is the case, but I think
> >    the kernel may start swapping out apps.
> > 
> > This is why I suggested to use the following flags here:
> > 
> > gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;
> 
> On my system (64 MiB RAM, 256 MiB Flash), there is no swap and under these
> allocation conditions for jffs2_scan_medium, mtd_read or mtd_write, I don't
> see the kernel doing (1), (2) or (3).

Well, the code is complex and not easy to follow if you do not know it.
But I navigated it to the 'do_try_to_free_pages()' function. This
function can be called by kmalloc(), and it does some of the things I
described. And yes, kmalloc() may cause kswapd to wake up and start
swapping, I can see it in '__alloc_pages_slowpath()'. To prevent this we
need __GFP_NO_KSWAPD flag which I suggest you to also add.

> My impression is that the above behaviors are only activated when a swap
> store exists and, in general, most systems using JFFS2 and MTD do not have
> swap.

Well, most but not all, I worked with one with swap (N900 phone).

> Regardless, adding the additional flags should not be detrimental for
> systems with no swap and, it sounds like, helpful for systems with it.

OK. But things 1 and 2 which I described are relevant for non-swap
systems anyway. Dunno why you did not observe them, probably you did not
have high enough memory pressure and your flusher threads and other
things kept the memory within limits (there are watermarks which cause
background processes to start and free RAM if you cross them).

> Regarding the suggestion of mtd_alloc_upto() or mtd_alloc_as_much(), are you
> OK exporting these from mtdchar.c or would you rather they be moved to and
> exported from mtdcore.c?

I guess mtdcore is better place.

Patch

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 145b3d0d..df9be51 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -166,11 +166,44 @@  static int mtd_close(struct inode *inode, struct file *file)
 	return 0;
 } /* mtd_close */
 
-/* FIXME: This _really_ needs to die. In 2.5, we should lock the
-   userspace buffer down and use it directly with readv/writev.
-*/
+/* Back in April 2005, Linus wrote:
+ * 
+ *   FIXME: This _really_ needs to die. In 2.5, we should lock the
+ *   userspace buffer down and use it directly with readv/writev.
+ *
+ * The implementation below, using mtd_try_alloc, mitigates allocation
+ * failures when the system is under low-memory situations or if memory
+ * is highly fragmented at the cost of reducing the performance of the
+ * requested transfer due to a smaller buffer size.
+ *
+ * A more complex but more memory-efficient implementation based on
+ * get_user_pages and iovecs to cover extents of those pages is a
+ * longer-term goal, as intimated by Linus above. However, for the
+ * write case, this requires yet more complex head and tail transfer
+ * handling when those head and tail offsets and sizes are such that
+ * alignment requirements are not met in the NAND subdriver.
+ */
 #define MAX_KMALLOC_SIZE 0x20000
 
+static void *mtd_try_alloc(size_t *size)
+{
+	const gfp_t flags = (GFP_KERNEL | __GFP_NOWARN);
+	size_t try;
+	void *kbuf;
+
+	try = min_t(size_t, *size, MAX_KMALLOC_SIZE);
+
+	do {
+		kbuf = kmalloc(try, flags);
+	} while (!kbuf && ((try >>= 1) >= PAGE_SIZE));
+
+	if (kbuf) {
+		*size = try;
+	}
+
+	return kbuf;
+}
+
 static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t *ppos)
 {
 	struct mtd_file_info *mfi = file->private_data;
@@ -179,6 +212,7 @@  static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t
 	size_t total_retlen=0;
 	int ret=0;
 	int len;
+	size_t size;
 	char *kbuf;
 
 	DEBUG(MTD_DEBUG_LEVEL0,"MTD_read\n");
@@ -189,23 +223,16 @@  static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t
 	if (!count)
 		return 0;
 
-	/* FIXME: Use kiovec in 2.5 to lock down the user's buffers
-	   and pass them directly to the MTD functions */
+	size = count;
 
-	if (count > MAX_KMALLOC_SIZE)
-		kbuf=kmalloc(MAX_KMALLOC_SIZE, GFP_KERNEL);
-	else
-		kbuf=kmalloc(count, GFP_KERNEL);
+	kbuf = mtd_try_alloc(&size);
 
 	if (!kbuf)
 		return -ENOMEM;
 
 	while (count) {
 
-		if (count > MAX_KMALLOC_SIZE)
-			len = MAX_KMALLOC_SIZE;
-		else
-			len = count;
+		len = min_t(size_t, count, size);
 
 		switch (mfi->mode) {
 		case MTD_MODE_OTP_FACTORY:
@@ -268,6 +295,7 @@  static ssize_t mtd_write(struct file *file, const char __user *buf, size_t count
 {
 	struct mtd_file_info *mfi = file->private_data;
 	struct mtd_info *mtd = mfi->mtd;
+	size_t size;
 	char *kbuf;
 	size_t retlen;
 	size_t total_retlen=0;
@@ -285,21 +313,16 @@  static ssize_t mtd_write(struct file *file, const char __user *buf, size_t count
 	if (!count)
 		return 0;
 
-	if (count > MAX_KMALLOC_SIZE)
-		kbuf=kmalloc(MAX_KMALLOC_SIZE, GFP_KERNEL);
-	else
-		kbuf=kmalloc(count, GFP_KERNEL);
+	size = count;
+
+	kbuf = mtd_try_alloc(&size);
 
 	if (!kbuf)
 		return -ENOMEM;
 
 	while (count) {
 
-		if (count > MAX_KMALLOC_SIZE)
-			len = MAX_KMALLOC_SIZE;
-		else
-			len = count;
+		len = min_t(size_t, count, size);
 
 		if (copy_from_user(kbuf, buf, len)) {
 			kfree(kbuf);
--