diff mbox

[U-Boot,V3,1/3] drivers: block: add block device cache

Message ID 1459184744-15101-1-git-send-email-eric@nelint.com
State Accepted
Delegated to: Tom Rini
Headers show

Commit Message

Eric Nelson March 28, 2016, 5:05 p.m. UTC
Add a block device cache to speed up repeated reads of block devices by
various filesystems.

This small amount of cache can dramatically speed up filesystem
operations by skipping repeated reads of common areas of a block
device (typically directory structures).

This has been shown to have some benefit on FAT filesystem operations of
loading a kernel and RAM disk, but more dramatic benefits on ext4
filesystems when the kernel and/or RAM disk are spread across
multiple extent header structures as described in commit fc0fc50.

The cache is implemented through a minimal list (block_cache) maintained
in most-recently-used order and count of the current number of entries
(cache_count). It uses a maximum block count setting to prevent copies
of large block reads and an upper bound on the number of cached areas.

The maximum number of entries in the cache defaults to 32 and the maximum
number of blocks per cache entry has a default of 2, which has been shown to
produce the best results on testing of ext4 and FAT filesystems.

The 'blkcache' command (enabled through CONFIG_CMD_BLOCK_CACHE) allows
changing these values and can be used to tune for a particular filesystem
layout.

Signed-off-by: Eric Nelson <eric@nelint.com>
---
Changes in V3:
   - replace remnant of "blkcache max" from sub-command name change

Changes in V2:
   - moved blkcache command into cmd/blkcache (Misc commands)
   - removed invalidate sub-command - done in max command
   - removed clear sub-command - done in show command
   - removed dump command (was only for debug)
   - removed run-time trace control in favor of debug() messages
   - use struct block_cache_stats instead of separate static vars
     to streamline block_cache statistics handling
   - rename max sub-command to configure to match API
   - change show sub-command to auto-clear statistics

 cmd/Kconfig                |  11 +++
 cmd/Makefile               |   1 +
 cmd/blkcache.c             |  90 +++++++++++++++++++++++
 disk/part.c                |   2 +
 drivers/block/Kconfig      |   9 +++
 drivers/block/Makefile     |   1 +
 drivers/block/blk-uclass.c |  13 +++-
 drivers/block/blkcache.c   | 173 +++++++++++++++++++++++++++++++++++++++++++++
 include/blk.h              | 105 ++++++++++++++++++++++++++-
 9 files changed, 403 insertions(+), 2 deletions(-)
 create mode 100644 cmd/blkcache.c
 create mode 100644 drivers/block/blkcache.c

Comments

Stephen Warren March 30, 2016, 2:36 p.m. UTC | #1
On 03/28/2016 11:05 AM, Eric Nelson wrote:
> Add a block device cache to speed up repeated reads of block devices by
> various filesystems.
>
> This small amount of cache can dramatically speed up filesystem
> operations by skipping repeated reads of common areas of a block
> device (typically directory structures).
>
> This has shown to have some benefit on FAT filesystem operations of
> loading a kernel and RAM disk, but more dramatic benefits on ext4
> filesystems when the kernel and/or RAM disk are spread across
> multiple extent header structures as described in commit fc0fc50.
>
> The cache is implemented through a minimal list (block_cache) maintained
> in most-recently-used order and count of the current number of entries
> (cache_count). It uses a maximum block count setting to prevent copies
> of large block reads and an upper bound on the number of cached areas.
>
> The maximum number of entries in the cache defaults to 32 and the maximum
> number of blocks per cache entry has a default of 2, which has shown to
> produce the best results on testing of ext4 and FAT filesystems.
>
> The 'blkcache' command (enabled through CONFIG_CMD_BLOCK_CACHE) allows
> changing these values and can be used to tune for a particular filesystem
> layout.

> diff --git a/cmd/blkcache.c b/cmd/blkcache.c

> +static int blkc_show(cmd_tbl_t *cmdtp, int flag,
> +		     int argc, char * const argv[])
> +{
> +	struct block_cache_stats stats;
> +	blkcache_stats(&stats);
> +
> +	printf("    hits: %u\n"
> +	       "    misses: %u\n"
> +	       "    entries: %u\n"
> +	       "    max blocks/entry: %u\n"
> +	       "    max cache entries: %u\n",

Nit: I'm not sure why all that command output is indented. Perhaps it 
made sense before if some kind of title was printed before that text, 
but I don't think it is any more.

> +static int do_blkcache(cmd_tbl_t *cmdtp, int flag,

> +	c = find_cmd_tbl(argv[0], &cmd_blkc_sub[0], ARRAY_SIZE(cmd_blkc_sub));
> +
> +	if (c)
> +		return c->cmd(cmdtp, flag, argc, argv);
> +	else
> +		return CMD_RET_USAGE;
> +
> +	return 0;

Nit: I'd prefer to see the error handling immediately follow the 
function call whose result is being tested, and the non-failure-case 
code be not indented, i.e.:


c = find_cmd_tbl(argv[0], &cmd_blkc_sub[0], ARRAY_SIZE(cmd_blkc_sub));
if (!c)
     return CMD_RET_USAGE;

return c->cmd(cmdtp, flag, argc, argv);

> diff --git a/disk/part.c b/disk/part.c

> @@ -268,6 +268,8 @@ void part_init(struct blk_desc *dev_desc)
>   	const int n_ents = ll_entry_count(struct part_driver, part_driver);
>   	struct part_driver *entry;
>
> +	blkcache_invalidate(dev_desc->if_type, dev_desc->devnum);

Doesn't this invalidate the cache far too often? I expect that function 
is called for every command the user executes from the command-line, whereas 
it'd be nice if the cache persisted across commands. I suppose this is a 
reasonable (and very safe) first implementation though, and saves having 
to go through each storage provider type and find out the right place to 
detect media changes.

> diff --git a/drivers/block/blkcache.c b/drivers/block/blkcache.c

> +struct block_cache_node {
> +	struct list_head lh;
> +	int iftype;
> +	int devnum;
> +	lbaint_t start;
> +	lbaint_t blkcnt;
> +	unsigned long blksz;
> +	char *cache;
> +};
> +
> +static LIST_HEAD(block_cache);
> +
> +static struct block_cache_stats _stats = {
> +	.max_blocks_per_entry = 2,
> +	.max_entries = 32
> +};

Now is a good time to mention another reason why I don't like using a 
dynamically allocated linked list for this: Memory fragmentation. By 
dynamically allocating the cache, we could easily run into a situation 
where the user runs a command that allocates memory and also adds to the 
block cache, then most of that memory gets freed when U-Boot returns to 
the command prompt, then the user runs the command again but it fails 
since it can't allocate the memory due to fragmentation of the heap. 
This is a real problem I've seen e.g. with the "ums" and "dfu" commands, 
since they might initialize the USB controller the first time they're 
run, which allocates some new memory. Static allocation would avoid 
this.

> diff --git a/include/blk.h b/include/blk.h

> +int blkcache_read
> +	(int iftype, int dev,
> +	 lbaint_t start, lbaint_t blkcnt,
> +	 unsigned long blksz, void *buffer);

Nit: The opening ( and first n arguments should be wrapped onto the same 
line as the function name.
Tom Rini March 30, 2016, 3:19 p.m. UTC | #2
On Wed, Mar 30, 2016 at 08:36:21AM -0600, Stephen Warren wrote:
> On 03/28/2016 11:05 AM, Eric Nelson wrote:
> >Add a block device cache to speed up repeated reads of block devices by
> >various filesystems.
> >
> >This small amount of cache can dramatically speed up filesystem
> >operations by skipping repeated reads of common areas of a block
> >device (typically directory structures).
> >
> >This has shown to have some benefit on FAT filesystem operations of
> >loading a kernel and RAM disk, but more dramatic benefits on ext4
> >filesystems when the kernel and/or RAM disk are spread across
> >multiple extent header structures as described in commit fc0fc50.
> >
> >The cache is implemented through a minimal list (block_cache) maintained
> >in most-recently-used order and count of the current number of entries
> >(cache_count). It uses a maximum block count setting to prevent copies
> >of large block reads and an upper bound on the number of cached areas.
> >
> >The maximum number of entries in the cache defaults to 32 and the maximum
> >number of blocks per cache entry has a default of 2, which has shown to
> >produce the best results on testing of ext4 and FAT filesystems.
> >
> >The 'blkcache' command (enabled through CONFIG_CMD_BLOCK_CACHE) allows
> >changing these values and can be used to tune for a particular filesystem
> >layout.
[snip]
> >diff --git a/disk/part.c b/disk/part.c
> 
> >@@ -268,6 +268,8 @@ void part_init(struct blk_desc *dev_desc)
> >  	const int n_ents = ll_entry_count(struct part_driver, part_driver);
> >  	struct part_driver *entry;
> >
> >+	blkcache_invalidate(dev_desc->if_type, dev_desc->devnum);
> 
> Doesn't this invalidate the cache far too often? I expect that
> function is called for command the user executes from the
> command-line, whereas it'd be nice if the cache persisted across
> commands. I suppose this is a reasonable (and very safe) first
> implementation though, and saves having to go through each storage
> provider type and find out the right place to detect media changes.

My initial reaction here is that we should stay conservative and
invalidate the cache more often rather than too infrequently.  I mean,
what's the worst case here, an extra read? A few extra reads?  We want
to make sure we keep the complexity to functionality ratio right here,
if we make the recovery/flashing/factory cases a whole lot better but
are leaving 1 second of wall clock time on the table when we've just
gained a minute, we're OK.

> >diff --git a/drivers/block/blkcache.c b/drivers/block/blkcache.c
> 
> >+struct block_cache_node {
> >+	struct list_head lh;
> >+	int iftype;
> >+	int devnum;
> >+	lbaint_t start;
> >+	lbaint_t blkcnt;
> >+	unsigned long blksz;
> >+	char *cache;
> >+};
> >+
> >+static LIST_HEAD(block_cache);
> >+
> >+static struct block_cache_stats _stats = {
> >+	.max_blocks_per_entry = 2,
> >+	.max_entries = 32
> >+};
> 
> Now is a good time to mention another reason why I don't like using
> a dynamically allocated linked list for this: Memory fragmentation.
> By dynamically allocating the cache, we could easily run into a
> situation where the user runs a command that allocates memory and
> also adds to the block cache, then most of that memory gets freed
> when U-Boot returns to the command prompt, then the user runs the
> command again but it fails since it can't allocate the memory due to
> fragmentation of the heap. This is a real problem I've seen e.g.
> with the "ums" and "dfu" commands, since they might initialize the
> USB controller the first time they're run, which allocates some new
> memory. Statically allocation would avoid this.

That is a good point.  But how would you hit this?  The problem in
ums/dfu was that it was several megabytes, yes?  My quick read over the
code right now has me thinking this is something measured in kilobytes.
Stephen Warren March 30, 2016, 3:21 p.m. UTC | #3
On 03/30/2016 09:19 AM, Tom Rini wrote:
> On Wed, Mar 30, 2016 at 08:36:21AM -0600, Stephen Warren wrote:
>> On 03/28/2016 11:05 AM, Eric Nelson wrote:
>>> Add a block device cache to speed up repeated reads of block devices by
>>> various filesystems.
>>>
>>> This small amount of cache can dramatically speed up filesystem
>>> operations by skipping repeated reads of common areas of a block
>>> device (typically directory structures).
>>>
>>> This has shown to have some benefit on FAT filesystem operations of
>>> loading a kernel and RAM disk, but more dramatic benefits on ext4
>>> filesystems when the kernel and/or RAM disk are spread across
>>> multiple extent header structures as described in commit fc0fc50.
>>>
>>> The cache is implemented through a minimal list (block_cache) maintained
>>> in most-recently-used order and count of the current number of entries
>>> (cache_count). It uses a maximum block count setting to prevent copies
>>> of large block reads and an upper bound on the number of cached areas.
>>>
>>> The maximum number of entries in the cache defaults to 32 and the maximum
>>> number of blocks per cache entry has a default of 2, which has shown to
>>> produce the best results on testing of ext4 and FAT filesystems.
>>>
>>> The 'blkcache' command (enabled through CONFIG_CMD_BLOCK_CACHE) allows
>>> changing these values and can be used to tune for a particular filesystem
>>> layout.
> [snip]
>>> diff --git a/disk/part.c b/disk/part.c
>>
>>> @@ -268,6 +268,8 @@ void part_init(struct blk_desc *dev_desc)
>>>   	const int n_ents = ll_entry_count(struct part_driver, part_driver);
>>>   	struct part_driver *entry;
>>>
>>> +	blkcache_invalidate(dev_desc->if_type, dev_desc->devnum);
>>
>> Doesn't this invalidate the cache far too often? I expect that
>> function is called for command the user executes from the
>> command-line, whereas it'd be nice if the cache persisted across
>> commands. I suppose this is a reasonable (and very safe) first
>> implementation though, and saves having to go through each storage
>> provider type and find out the right place to detect media changes.
>
> My initial reaction here is that we should stay conservative and
> invalidate the cache more often rather than too infrequent.  I mean,
> what's the worst case here, an extra read? A few extra reads?  We want
> to make sure we keep the complexity to functionality ratio right here,
> if we make the recovery/flashing/factory cases a whole lot better but
> are leaving 1 second of wall clock time on the table when we've just
> gained a minute, we're OK.
>
>>> diff --git a/drivers/block/blkcache.c b/drivers/block/blkcache.c
>>
>>> +struct block_cache_node {
>>> +	struct list_head lh;
>>> +	int iftype;
>>> +	int devnum;
>>> +	lbaint_t start;
>>> +	lbaint_t blkcnt;
>>> +	unsigned long blksz;
>>> +	char *cache;
>>> +};
>>> +
>>> +static LIST_HEAD(block_cache);
>>> +
>>> +static struct block_cache_stats _stats = {
>>> +	.max_blocks_per_entry = 2,
>>> +	.max_entries = 32
>>> +};
>>
>> Now is a good time to mention another reason why I don't like using
>> a dynamically allocated linked list for this: Memory fragmentation.
>> By dynamically allocating the cache, we could easily run into a
>> situation where the user runs a command that allocates memory and
>> also adds to the block cache, then most of that memory gets freed
>> when U-Boot returns to the command prompt, then the user runs the
>> command again but it fails since it can't allocate the memory due to
>> fragmentation of the heap. This is a real problem I've seen e.g.
>> with the "ums" and "dfu" commands, since they might initialize the
>> USB controller the first time they're run, which allocates some new
>> memory. Statically allocation would avoid this.
>
> That is a good point.  But how would you hit this?  The problem in
> ums/dfu was that it was several megabytes, yes?  My quick read over the
> code right now has me thinking this is something measured in kilobytes.

The allocation that failed was a large multi-megabyte buffer. However, 
the allocations that caused the fragmentation/failure were small 
(probably tens/hundreds of bytes, but I didn't check).
Eric Nelson March 30, 2016, 5:34 p.m. UTC | #4
Thanks again for the detailed review, Stephen.

On 03/30/2016 07:36 AM, Stephen Warren wrote:
> On 03/28/2016 11:05 AM, Eric Nelson wrote:
>> Add a block device cache to speed up repeated reads of block devices by
>> various filesystems.
>>
<snip>
>> +
>> +    printf("    hits: %u\n"
>> +           "    misses: %u\n"
>> +           "    entries: %u\n"
>> +           "    max blocks/entry: %u\n"
>> +           "    max cache entries: %u\n",
> 
> Nit: I'm not sure why all that command output is indented. Perhaps it
> made sense before if some kind of title was printed before that text,
> but I don't think it is any more.
> 

Okay.

>> +    if (c)
>> +        return c->cmd(cmdtp, flag, argc, argv);
>> +    else
>> +        return CMD_RET_USAGE;
>> +
>> +    return 0;
> 
> Nit: I'd prefer to see the error handling immediately follow the
> function call whose result is being tested, and the non-failure-case
> code be not indented, i.e.:
> 
> 
> c = find_cmd_tbl(argv[0], &cmd_blkc_sub[0], ARRAY_SIZE(cmd_blkc_sub));
> if (!c)
>     return CMD_RET_USAGE;
> 
> return c->cmd(cmdtp, flag, argc, argv);
> 

Okay. As Tom pointed out, I copied this verbatim from i2c.c.

>> diff --git a/disk/part.c b/disk/part.c
> 
>> @@ -268,6 +268,8 @@ void part_init(struct blk_desc *dev_desc)
>>       const int n_ents = ll_entry_count(struct part_driver, part_driver);
>>       struct part_driver *entry;
>>
>> +    blkcache_invalidate(dev_desc->if_type, dev_desc->devnum);
> 
> Doesn't this invalidate the cache far too often? I expect that function
> is called for command the user executes from the command-line, whereas
> it'd be nice if the cache persisted across commands. I suppose this is a
> reasonable (and very safe) first implementation though, and saves having
> to go through each storage provider type and find out the right place to
> detect media changes.
> 

I'm not sure it does. I traced through the mmc initialization and it's
only called when the card itself is initialized.

IOW, at the point where we'd recognize a card swap.

>> diff --git a/drivers/block/blkcache.c b/drivers/block/blkcache.c
> 
>> +struct block_cache_node {
>> +    struct list_head lh;
>> +    int iftype;
>> +    int devnum;
>> +    lbaint_t start;
>> +    lbaint_t blkcnt;
>> +    unsigned long blksz;
>> +    char *cache;
>> +};
>> +
>> +static LIST_HEAD(block_cache);
>> +
>> +static struct block_cache_stats _stats = {
>> +    .max_blocks_per_entry = 2,
>> +    .max_entries = 32
>> +};
> 
> Now is a good time to mention another reason why I don't like using a
> dynamically allocated linked list for this: Memory fragmentation. By
> dynamically allocating the cache, we could easily run into a situation
> where the user runs a command that allocates memory and also adds to the
> block cache, then most of that memory gets freed when U-Boot returns to
> the command prompt, then the user runs the command again but it fails
> since it can't allocate the memory due to fragmentation of the heap.
> This is a real problem I've seen e.g. with the "ums" and "dfu" commands,
> since they might initialize the USB controller the first time they're
> run, which allocates some new memory. Statically allocation would avoid
> this.
> 

We're going to allocate a block or set of blocks every time we allocate
a new node for the list, so having the list in an array doesn't fix the
problem.

While re-working the code, I also thought more about using an array and
still don't see how the implementation doesn't get more complex.

The key bit is that the list is implemented in MRU order so
invalidating the oldest is trivial.

Doing this in an array would require keeping the array sorted (a bad
idea) or keeping an access sequence in each node of the array and
searching for oldest when invalidation is needed (when the max entries
limit is hit).

... unless I'm still missing something and you or Marek have something
else in mind.

I could also allocate an array of "free" nodes once the first time
around (as a buffer pool), but I don't think this is warranted because
of the block allocations I mentioned above.

>> diff --git a/include/blk.h b/include/blk.h
> 
>> +int blkcache_read
>> +    (int iftype, int dev,
>> +     lbaint_t start, lbaint_t blkcnt,
>> +     unsigned long blksz, void *buffer);
> 
> Nit: The opening ( and first n arguments should be wrapped onto the same
> line as the function name.

Aargh. You've now pointed this out several times and I keep missing it.

Will fix in V4.

Regards,


Eric
Eric Nelson March 30, 2016, 5:37 p.m. UTC | #5
Hi Tom,

On 03/30/2016 08:19 AM, Tom Rini wrote:
> On Wed, Mar 30, 2016 at 08:36:21AM -0600, Stephen Warren wrote:
>> On 03/28/2016 11:05 AM, Eric Nelson wrote:
>>> Add a block device cache to speed up repeated reads of block devices by
>>> various filesystems.
>>>
[snip]

>>> @@ -268,6 +268,8 @@ void part_init(struct blk_desc *dev_desc)
>>>  	const int n_ents = ll_entry_count(struct part_driver, part_driver);
>>>  	struct part_driver *entry;
>>>
>>> +	blkcache_invalidate(dev_desc->if_type, dev_desc->devnum);
>>
>> Doesn't this invalidate the cache far too often? I expect that
>> function is called for command the user executes from the
>> command-line, whereas it'd be nice if the cache persisted across
>> commands. I suppose this is a reasonable (and very safe) first
>> implementation though, and saves having to go through each storage
>> provider type and find out the right place to detect media changes.
> 
> My initial reaction here is that we should stay conservative and
> invalidate the cache more often rather than too infrequent.  I mean,
> what's the worst case here, an extra read? A few extra reads?  We want
> to make sure we keep the complexity to functionality ratio right here,
> if we make the recovery/flashing/factory cases a whole lot better but
> are leaving 1 second of wall clock time on the table when we've just
> gained a minute, we're OK.
> 

I don't think this is called during every command, at least not
with mmc.

>>> diff --git a/drivers/block/blkcache.c b/drivers/block/blkcache.c
>>
>>> +struct block_cache_node {
>>> +	struct list_head lh;
>>> +	int iftype;
>>> +	int devnum;
>>> +	lbaint_t start;
>>> +	lbaint_t blkcnt;
>>> +	unsigned long blksz;
>>> +	char *cache;
>>> +};
>>> +
>>> +static LIST_HEAD(block_cache);
>>> +
>>> +static struct block_cache_stats _stats = {
>>> +	.max_blocks_per_entry = 2,
>>> +	.max_entries = 32
>>> +};
>>
>> Now is a good time to mention another reason why I don't like using
>> a dynamically allocated linked list for this: Memory fragmentation.
>> By dynamically allocating the cache, we could easily run into a
>> situation where the user runs a command that allocates memory and
>> also adds to the block cache, then most of that memory gets freed
>> when U-Boot returns to the command prompt, then the user runs the
>> command again but it fails since it can't allocate the memory due to
>> fragmentation of the heap. This is a real problem I've seen e.g.
>> with the "ums" and "dfu" commands, since they might initialize the
>> USB controller the first time they're run, which allocates some new
>> memory. Statically allocation would avoid this.
> 
> That is a good point.  But how would you hit this?  The problem in
> ums/dfu was that it was several megabytes, yes?  My quick read over the
> code right now has me thinking this is something measured in kilobytes.
> 

36 bytes for the node, plus 512 bytes or 1k for the block data unless
tuned through the blkcache command.

And the block data is going to be allocated at the same time.

Regards,


Eric
Stephen Warren March 30, 2016, 9:57 p.m. UTC | #6
On 03/30/2016 11:34 AM, Eric Nelson wrote:
> Thanks again for the detailed review, Stephen.
>
> On 03/30/2016 07:36 AM, Stephen Warren wrote:
>> On 03/28/2016 11:05 AM, Eric Nelson wrote:
>>> Add a block device cache to speed up repeated reads of block devices by
>>> various filesystems.

>>> diff --git a/disk/part.c b/disk/part.c
>>
>>> @@ -268,6 +268,8 @@ void part_init(struct blk_desc *dev_desc)
>>>        const int n_ents = ll_entry_count(struct part_driver, part_driver);
>>>        struct part_driver *entry;
>>>
>>> +    blkcache_invalidate(dev_desc->if_type, dev_desc->devnum);
>>
>> Doesn't this invalidate the cache far too often? I expect that function
>> is called for command the user executes from the command-line, whereas
>> it'd be nice if the cache persisted across commands. I suppose this is a
>> reasonable (and very safe) first implementation though, and saves having
>> to go through each storage provider type and find out the right place to
>> detect media changes.
>
> I'm not sure it does. I traced through the mmc initialization and it's
> only called when the card itself is initialized.

I don't believe U-Boot caches the partition structure across user 
commands. Doesn't each user command (e.g. part list, ls, load, save) 
first look up the block device, then scan the partition table, then 
"mount" the filesystem, then perform its action, then throw all that 
state away? Conversely, "mmc rescan" only happens under explicit user 
control. Still as I said, the current implementation is probably fine to 
start with, and at least is safe.

>>> diff --git a/drivers/block/blkcache.c b/drivers/block/blkcache.c
>>
>>> +struct block_cache_node {
>>> +    struct list_head lh;
>>> +    int iftype;
>>> +    int devnum;
>>> +    lbaint_t start;
>>> +    lbaint_t blkcnt;
>>> +    unsigned long blksz;
>>> +    char *cache;
>>> +};
>>> +
>>> +static LIST_HEAD(block_cache);
>>> +
>>> +static struct block_cache_stats _stats = {
>>> +    .max_blocks_per_entry = 2,
>>> +    .max_entries = 32
>>> +};
>>
>> Now is a good time to mention another reason why I don't like using a
>> dynamically allocated linked list for this: Memory fragmentation. By
>> dynamically allocating the cache, we could easily run into a situation
>> where the user runs a command that allocates memory and also adds to the
>> block cache, then most of that memory gets freed when U-Boot returns to
>> the command prompt, then the user runs the command again but it fails
>> since it can't allocate the memory due to fragmentation of the heap.
>> This is a real problem I've seen e.g. with the "ums" and "dfu" commands,
>> since they might initialize the USB controller the first time they're
>> run, which allocates some new memory. Statically allocation would avoid
>> this.
>
> We're going to allocate a block or set of blocks every time we allocate
> a new node for the list, so having the list in an array doesn't fix the
> problem.

We could allocate the data storage for the block cache at the top of RAM 
before relocation, like many other things are allocated, and hence not 
use malloc() for that.

> While re-working the code, I also thought more about using an array and
> still don't see how the implementation doesn't get more complex.
>
> The key bit is that the list is implemented in MRU order so
> invalidating the oldest is trivial.

Yes, the MRU logic would make it more complex. Is that particularly 
useful, i.e. is it an intrinsic part of the speedup?
Eric Nelson March 31, 2016, 8:24 p.m. UTC | #7
Hi Stephen,

On 03/30/2016 02:57 PM, Stephen Warren wrote:
> On 03/30/2016 11:34 AM, Eric Nelson wrote:
>> Thanks again for the detailed review, Stephen.
>>
>> On 03/30/2016 07:36 AM, Stephen Warren wrote:
>>> On 03/28/2016 11:05 AM, Eric Nelson wrote:
>>>> Add a block device cache to speed up repeated reads of block devices by
>>>> various filesystems.
>>>> diff --git a/disk/part.c b/disk/part.c
>>>
>>>> @@ -268,6 +268,8 @@ void part_init(struct blk_desc *dev_desc)
>>>>        const int n_ents = ll_entry_count(struct part_driver,
>>>> part_driver);
>>>>        struct part_driver *entry;
>>>>
>>>> +    blkcache_invalidate(dev_desc->if_type, dev_desc->devnum);
>>>
>>> Doesn't this invalidate the cache far too often? I expect that function
>>> is called for command the user executes from the command-line, whereas
>>> it'd be nice if the cache persisted across commands. I suppose this is a
>>> reasonable (and very safe) first implementation though, and saves having
>>> to go through each storage provider type and find out the right place to
>>> detect media changes.
>>
>> I'm not sure it does. I traced through the mmc initialization and it's
>> only called when the card itself is initialized.
> 
> I don't believe U-Boot caches the partition structure across user
> commands. Doesn't each user command (e.g. part list, ls, load, save)
> first look up the block device, then scan the partition table, then
> "mount" the filesystem, then perform its action, then throw all that
> state away? Conversely, "mmc rescan" only happens under explicit user
> control. Still as I said, the current implementation is probably fine to
> start with, and at least is safe.
> 

At least for MMC, this isn't the case. Various filesystem commands
operate without calling part_init.

>>>> diff --git a/drivers/block/blkcache.c b/drivers/block/blkcache.c
>>>
>>>> +struct block_cache_node {
>>>> +    struct list_head lh;
>>>> +    int iftype;
>>>> +    int devnum;
>>>> +    lbaint_t start;
>>>> +    lbaint_t blkcnt;
>>>> +    unsigned long blksz;
>>>> +    char *cache;
>>>> +};
>>>> +
>>>> +static LIST_HEAD(block_cache);
>>>> +
>>>> +static struct block_cache_stats _stats = {
>>>> +    .max_blocks_per_entry = 2,
>>>> +    .max_entries = 32
>>>> +};
>>>
>>> Now is a good time to mention another reason why I don't like using a
>>> dynamically allocated linked list for this: Memory fragmentation. By
>>> dynamically allocating the cache, we could easily run into a situation
>>> where the user runs a command that allocates memory and also adds to the
>>> block cache, then most of that memory gets freed when U-Boot returns to
>>> the command prompt, then the user runs the command again but it fails
>>> since it can't allocate the memory due to fragmentation of the heap.
>>> This is a real problem I've seen e.g. with the "ums" and "dfu" commands,
>>> since they might initialize the USB controller the first time they're
>>> run, which allocates some new memory. Statically allocation would avoid
>>> this.
>>
>> We're going to allocate a block or set of blocks every time we allocate
>> a new node for the list, so having the list in an array doesn't fix the
>> problem.
> 
> We could allocate the data storage for the block cache at the top of RAM
> before relocation, like many other things are allocated, and hence not
> use malloc() for that.
> 

Hmmm. We seem to have gone from a discussion about data structures to
type of allocation.

I'm interested in seeing how that works. Can you provide hints about
what's doing this now?

>> While re-working the code, I also thought more about using an array and
>> still don't see how the implementation doesn't get more complex.
>>
>> The key bit is that the list is implemented in MRU order so
>> invalidating the oldest is trivial.
> 
> Yes, the MRU logic would make it more complex. Is that particularly
> useful, i.e. is it an intrinsic part of the speedup?

It's not a question of speed with small numbers of entries. The code
to handle eviction would just be more complex.

Given that the command "blkcache configure 0 0" will discard all
cache and since both dfu and ums should properly have the cache
disabled, I'd like to proceed as-is with the list and heap approach.

A follow-up change to use another form of allocation is unlikely to
change the primary interfaces, though I can't be sure until I
understand how these allocation(s) would occur.

I have a V3 prepped that addresses your other comments.

To reiterate the impact of this code, I have use cases where file
loading takes minutes when it should take seconds and suspect that
others have been seeing the same for quite some time.

Let me know your thoughts.

Regards,


Eric
Stephen Warren April 1, 2016, 10:57 p.m. UTC | #8
On 03/31/2016 02:24 PM, Eric Nelson wrote:
> Hi Stephen,
>
> On 03/30/2016 02:57 PM, Stephen Warren wrote:
>> On 03/30/2016 11:34 AM, Eric Nelson wrote:
>>> Thanks again for the detailed review, Stephen.
>>>
>>> On 03/30/2016 07:36 AM, Stephen Warren wrote:
>>>> On 03/28/2016 11:05 AM, Eric Nelson wrote:
>>>>> Add a block device cache to speed up repeated reads of block devices by
>>>>> various filesystems.
>>>>> diff --git a/disk/part.c b/disk/part.c
>>>>
>>>>> @@ -268,6 +268,8 @@ void part_init(struct blk_desc *dev_desc)
>>>>>         const int n_ents = ll_entry_count(struct part_driver,
>>>>> part_driver);
>>>>>         struct part_driver *entry;
>>>>>
>>>>> +    blkcache_invalidate(dev_desc->if_type, dev_desc->devnum);
>>>>
>>>> Doesn't this invalidate the cache far too often? I expect that function
>>>> is called for command the user executes from the command-line, whereas
>>>> it'd be nice if the cache persisted across commands. I suppose this is a
>>>> reasonable (and very safe) first implementation though, and saves having
>>>> to go through each storage provider type and find out the right place to
>>>> detect media changes.
>>>
>>> I'm not sure it does. I traced through the mmc initialization and it's
>>> only called when the card itself is initialized.
>>
>> I don't believe U-Boot caches the partition structure across user
>> commands. Doesn't each user command (e.g. part list, ls, load, save)
>> first look up the block device, then scan the partition table, then
>> "mount" the filesystem, then perform its action, then throw all that
>> state away? Conversely, "mmc rescan" only happens under explicit user
>> control. Still as I said, the current implementation is probably fine to
>> start with, and at least is safe.
>>
>
> At least for MMC, this isn't the case. Various filesystem commands
> operate without calling part_init.

Interesting; that step is indeed only performed when the device is first 
probed for MMC and USB.

>>>>> diff --git a/drivers/block/blkcache.c b/drivers/block/blkcache.c
>>>>
>>>>> +struct block_cache_node {
>>>>> +    struct list_head lh;
>>>>> +    int iftype;
>>>>> +    int devnum;
>>>>> +    lbaint_t start;
>>>>> +    lbaint_t blkcnt;
>>>>> +    unsigned long blksz;
>>>>> +    char *cache;
>>>>> +};
>>>>> +
>>>>> +static LIST_HEAD(block_cache);
>>>>> +
>>>>> +static struct block_cache_stats _stats = {
>>>>> +    .max_blocks_per_entry = 2,
>>>>> +    .max_entries = 32
>>>>> +};
>>>>
>>>> Now is a good time to mention another reason why I don't like using a
>>>> dynamically allocated linked list for this: Memory fragmentation. By
>>>> dynamically allocating the cache, we could easily run into a situation
>>>> where the user runs a command that allocates memory and also adds to the
>>>> block cache, then most of that memory gets freed when U-Boot returns to
>>>> the command prompt, then the user runs the command again but it fails
>>>> since it can't allocate the memory due to fragmentation of the heap.
>>>> This is a real problem I've seen e.g. with the "ums" and "dfu" commands,
>>>> since they might initialize the USB controller the first time they're
>>>> run, which allocates some new memory. Static allocation would avoid
>>>> this.
>>>
>>> We're going to allocate a block or set of blocks every time we allocate
>>> a new node for the list, so having the list in an array doesn't fix the
>>> problem.
>>
>> We could allocate the data storage for the block cache at the top of RAM
>> before relocation, like many other things are allocated, and hence not
>> use malloc() for that.
>
> Hmmm. We seem to have gone from a discussion about data structures to
> type of allocation.
>
> I'm interested in seeing how that works. Can you provide hints about
> what's doing this now?

Something like common/board_f.c:reserve_mmu() and many other functions 
there. relocaddr starts at approximately the top of RAM, continually 
gets adjusted down as many static allocations are reserved, and 
eventually becomes the address that U-Boot is relocated to. Simply 
adding another entry into init_sequence_f[] for the disk cache might work.

>>> While re-working the code, I also thought more about using an array and
>>> still don't see how the implementation doesn't get more complex.
>>>
>>> The key bit is that the list is implemented in MRU order so
>>> invalidating the oldest is trivial.
>>
>> Yes, the MRU logic would make it more complex. Is that particularly
>> useful, i.e. is it an intrinsic part of the speedup?
>
> It's not a question of speed with small numbers of entries. The code
> to handle eviction would just be more complex.

My thought was that if the eviction algorithm wasn't important (i.e. 
most of the speedup comes from having some (any) kind of cache, but the 
eviction algorithm makes little difference to the gain from having the 
cache), we could just drop MRU completely. If that's not possible, then 
indeed a list would make implementing MRU easier.

You could still do a list with a statically allocated set of list nodes, 
especially since the length of the list is bounded.

> Given that the command "blkcache configure 0 0" will discard all
> cache and since both dfu and ums should properly have the cache
> disabled, I'd like to proceed as-is with the list and heap approach.

I don't understand "since both dfu and ums should properly have the 
cache disabled"; I didn't see anything that did that. Perhaps you're 
referring to the fact that writes invalidate the cache?

Eventually it seems better to keep the cache enabled for at least DFU to 
a filesystem (rather than raw block device) since presumably parsing the 
directory structure to write to a file for DFU would benefit from the 
cache just like anything else.
Eric Nelson April 1, 2016, 11:16 p.m. UTC | #9
Hi Stephen,

On 04/01/2016 03:57 PM, Stephen Warren wrote:
> On 03/31/2016 02:24 PM, Eric Nelson wrote:
>> On 03/30/2016 02:57 PM, Stephen Warren wrote:
>>> On 03/30/2016 11:34 AM, Eric Nelson wrote:
>>>> On 03/30/2016 07:36 AM, Stephen Warren wrote:
>>>>> On 03/28/2016 11:05 AM, Eric Nelson wrote:

<snip>

>>>
>>> We could allocate the data storage for the block cache at the top of RAM
>>> before relocation, like many other things are allocated, and hence not
>>> use malloc() for that.
>>
>> Hmmm. We seem to have gone from a discussion about data structures to
>> type of allocation.
>>
>> I'm interested in seeing how that works. Can you provide hints about
>> what's doing this now?
> 
> Something like common/board_f.c:reserve_mmu() and many other functions
> there. relocaddr starts at approximately the top of RAM, continually
> gets adjusted down as many static allocations are reserved, and
> eventually becomes the address that U-Boot is relocated to. Simply
> adding another entry into init_sequence_f[] for the disk cache might work.
> 

Thanks for the pointer. I'll review that when time permits.

This would remove the opportunity to re-configure the cache though, right?

I'm not sure how important this feature is, and I think
only time and use will tell.

I'd prefer to keep that ability at least for a cycle or two so that
I and others can test.

>>>> While re-working the code, I also thought more about using an array and
>>>> still don't see how the implementation doesn't get more complex.
>>>>
>>>> The key bit is that the list is implemented in MRU order so
>>>> invalidating the oldest is trivial.
>>>
>>> Yes, the MRU logic would make it more complex. Is that particularly
>>> useful, i.e. is it an intrinsic part of the speedup?
>>
>> It's not a question of speed with small numbers of entries. The code
>> to handle eviction would just be more complex.
> 
> My thought was that if the eviction algorithm wasn't important (i.e.
> most of the speedup comes from having some (any) kind of cache, but the
> eviction algorithm makes little difference to the gain from having the
> cache), we could just drop MRU completely. If that's not possible, then
> indeed a list would make implementing MRU easier.
> 

How would we decide which block to discard? I haven't traced enough
to know what algorithm(s) might be best, but I can say that there's
a preponderance of repeated accesses to the last-accessed block,
especially in ext4.

> You could still do a list with a statically allocated set of list nodes,
> especially since the length of the list is bounded.
> 

Sure. A pooled allocator (pool of free nodes) works well with
array-based allocation.

Having a fixed upper limit on the number of blocks would require
additional checking unless we just sized it for (max entries * max
blocks/entry).

>> Given that the command "blkcache configure 0 0" will discard all
>> cache and since both dfu and ums should properly have the cache
>> disabled, I'd like to proceed as-is with the list and heap approach.
>
> I don't understand "since both dfu and ums should properly have the
> cache disabled"; I didn't see anything that did that. Perhaps you're
> referring to the fact that writes invalidate the cache?
> 

Yes, but also that the host will cache blocks in the ums case, so
having the cache enabled will only slow things down slightly by
lots of memcpy's to cached blocks that won't be helpful.

I think I was a bit flippant by including dfu in this statement,
since I haven't used it to access anything except SPI-NOR.

> Eventually it seems better to keep the cache enabled for at least DFU to
> a filesystem (rather than raw block device) since presumably parsing the
> directory structure to write to a file for DFU would benefit from the
> cache just like anything else.

I'm not in a position to comment about dfu.

Regards,


Eric
Tom Rini April 1, 2016, 11:41 p.m. UTC | #10
On Fri, Apr 01, 2016 at 04:16:42PM -0700, Eric Nelson wrote:
> Hi Stephen,
> 
> On 04/01/2016 03:57 PM, Stephen Warren wrote:
> > On 03/31/2016 02:24 PM, Eric Nelson wrote:
> >> On 03/30/2016 02:57 PM, Stephen Warren wrote:
> >>> On 03/30/2016 11:34 AM, Eric Nelson wrote:
> >>>> On 03/30/2016 07:36 AM, Stephen Warren wrote:
> >>>>> On 03/28/2016 11:05 AM, Eric Nelson wrote:
> 
> <snip>
> 
> >>>
> >>> We could allocate the data storage for the block cache at the top of RAM
> >>> before relocation, like many other things are allocated, and hence not
> >>> use malloc() for that.
> >>
> >> Hmmm. We seem to have gone from a discussion about data structures to
> >> type of allocation.
> >>
> >> I'm interested in seeing how that works. Can you provide hints about
> >> what's doing this now?
> > 
> > Something like common/board_f.c:reserve_mmu() and many other functions
> > there. relocaddr starts at approximately the top of RAM, continually
> > gets adjusted down as many static allocations are reserved, and
> > eventually becomes the address that U-Boot is relocated to. Simply
> > adding another entry into init_sequence_f[] for the disk cache might work.
> > 
> 
> Thanks for the pointer. I'll review that when time permits.
> 
> This would remove the opportunity to re-configure the cache though, right?
> 
> I'm not sure how important this feature is, and I think
> only time and use will tell.
> 
> I'd prefer to keep that ability at least for a cycle or two so that
> I and others can test.
> 
> >>>> While re-working the code, I also thought more about using an array and
> >>>> still don't see how the implementation doesn't get more complex.
> >>>>
> >>>> The key bit is that the list is implemented in MRU order so
> >>>> invalidating the oldest is trivial.
> >>>
> >>> Yes, the MRU logic would make it more complex. Is that particularly
> >>> useful, i.e. is it an intrinsic part of the speedup?
> >>
> >> It's not a question of speed with small numbers of entries. The code
> >> to handle eviction would just be more complex.
> > 
> > My thought was that if the eviction algorithm wasn't important (i.e.
> > most of the speedup comes from having some (any) kind of cache, but the
> > eviction algorithm makes little difference to the gain from having the
> > cache), we could just drop MRU completely. If that's not possible, then
> > indeed a list would make implementing MRU easier.
> > 
> 
> How would we decide which block to discard? I haven't traced enough
> to know what algorithm(s) might be best, but I can say that there's
> a preponderance of repeated accesses to the last-accessed block,
> especially in ext4.
> 
> > You could still do a list with a statically allocated set of list nodes,
> > especially since the length of the list is bounded.
> > 
> 
> Sure. A pooled allocator (pool of free nodes) works well with
> array-based allocation.
> 
> Having a fixed upper limit on the number of blocks would require
> additional checking unless we just sized it for (max entries * max
> blocks/entry).
> 
> >> Given that the command "blkcache configure 0 0" will discard all
> >> cache and since both dfu and ums should properly have the cache
> >> disabled, I'd like to proceed as-is with the list and heap approach.
> >
> > I don't understand "since both dfu and ums should properly have the
> > cache disabled"; I didn't see anything that did that. Perhaps you're
> > referring to the fact that writes invalidate the cache?
> > 
> 
> Yes, but also that the host will cache blocks in the ums case, so
> having the cache enabled will only slow things down slightly by
> lots of memcpy's to cached blocks that won't be helpful.
> 
> I think I was a bit flippant by including dfu in this statement,
> since I haven't used it to access anything except SPI-NOR.
> 
> > Eventually it seems better to keep the cache enabled for at least DFU to
> > a filesystem (rather than raw block device) since presumably parsing the
> > directory structure to write to a file for DFU would benefit from the
> > cache just like anything else.
> 
> I'm not in a position to comment about dfu.

For the record, I think this discussion is good and important, but not a
blocker for getting the initial functionality in.
Tom Rini April 2, 2016, 1:59 a.m. UTC | #11
On Mon, Mar 28, 2016 at 10:05:44AM -0700, Eric Nelson wrote:

> Add a block device cache to speed up repeated reads of block devices by
> various filesystems.
> 
> This small amount of cache can dramatically speed up filesystem
> operations by skipping repeated reads of common areas of a block
> device (typically directory structures).
> 
> This has shown to have some benefit on FAT filesystem operations of
> loading a kernel and RAM disk, but more dramatic benefits on ext4
> filesystems when the kernel and/or RAM disk are spread across
> multiple extent header structures as described in commit fc0fc50.
> 
> The cache is implemented through a minimal list (block_cache) maintained
> in most-recently-used order and count of the current number of entries
> (cache_count). It uses a maximum block count setting to prevent copies
> of large block reads and an upper bound on the number of cached areas.
> 
> The maximum number of entries in the cache defaults to 32 and the maximum
> number of blocks per cache entry has a default of 2, which has shown to
> produce the best results on testing of ext4 and FAT filesystems.
> 
> The 'blkcache' command (enabled through CONFIG_CMD_BLOCK_CACHE) allows
> changing these values and can be used to tune for a particular filesystem
> layout.
> 
> Signed-off-by: Eric Nelson <eric@nelint.com>

Applied to u-boot/master, thanks!
Stephen Warren April 2, 2016, 2:07 a.m. UTC | #12
On 04/01/2016 05:16 PM, Eric Nelson wrote:
> Hi Stephen,
>
> On 04/01/2016 03:57 PM, Stephen Warren wrote:
>> On 03/31/2016 02:24 PM, Eric Nelson wrote:
>>> On 03/30/2016 02:57 PM, Stephen Warren wrote:
>>>> On 03/30/2016 11:34 AM, Eric Nelson wrote:
>>>>> On 03/30/2016 07:36 AM, Stephen Warren wrote:
>>>>>> On 03/28/2016 11:05 AM, Eric Nelson wrote:
>
> <snip>
>
>>>>
>>>> We could allocate the data storage for the block cache at the top of RAM
>>>> before relocation, like many other things are allocated, and hence not
>>>> use malloc() for that.
>>>
>>> Hmmm. We seem to have gone from a discussion about data structures to
>>> type of allocation.
>>>
>>> I'm interested in seeing how that works. Can you provide hints about
>>> what's doing this now?
>>
>> Something like common/board_f.c:reserve_mmu() and many other functions
>> there. relocaddr starts at approximately the top of RAM, continually
>> gets adjusted down as many static allocations are reserved, and
>> eventually becomes the address that U-Boot is relocated to. Simply
>> adding another entry into init_sequence_f[] for the disk cache might work.
>>
>
> Thanks for the pointer. I'll review that when time permits.
>
> This would remove the opportunity to re-configure the cache though, right?

Well, it would make it impossible to use less RAM. One could use more by 
having a mix of the initial static allocation plus some additional 
dynamic allocation, but that might get a bit painful to manage.

It might be interesting to use the MMU more and allow de-fragmentation 
of VA space. That is, assuming there's much more VA space than RAM, such 
as is true on current 64-bit architectures. Then I wouldn't dislike 
dynamic allocation so much:-)

> I'm not sure how important this feature is, and I think
> only time and use will tell.
>
> I'd prefer to keep that ability at least for a cycle or two so that
> I and others can test.
>
>>>>> While re-working the code, I also thought more about using an array and
>>>>> still don't see how the implementation doesn't get more complex.
>>>>>
>>>>> The key bit is that the list is implemented in MRU order so
>>>>> invalidating the oldest is trivial.
>>>>
>>>> Yes, the MRU logic would make it more complex. Is that particularly
>>>> useful, i.e. is it an intrinsic part of the speedup?
>>>
>>> It's not a question of speed with small numbers of entries. The code
>>> to handle eviction would just be more complex.
>>
>> My thought was that if the eviction algorithm wasn't important (i.e.
>> most of the speedup comes from having some (any) kind of cache, but the
>> eviction algorithm makes little difference to the gain from having the
>> cache), we could just drop MRU completely. If that's not possible, then
>> indeed a list would make implementing MRU easier.
>>
>
> How would we decide which block to discard? I haven't traced enough
> to know what algorithm(s) might be best, but I can say that there's
> a preponderance of repeated accesses to the last-accessed block,
> especially in ext4.

Perhaps just keep an index into the array, use that index any time 
something is written to the cache, and then increment it each time. 
Probably not anywhere near as optimal as MRU/LRU though.
Eric Nelson April 2, 2016, 2:17 p.m. UTC | #13
On 04/01/2016 04:41 PM, Tom Rini wrote:
> On Fri, Apr 01, 2016 at 04:16:42PM -0700, Eric Nelson wrote:
>> Hi Stephen,
>>
>> On 04/01/2016 03:57 PM, Stephen Warren wrote:
>>> On 03/31/2016 02:24 PM, Eric Nelson wrote:
>>>> On 03/30/2016 02:57 PM, Stephen Warren wrote:
>>>>> On 03/30/2016 11:34 AM, Eric Nelson wrote:
>>>>>> On 03/30/2016 07:36 AM, Stephen Warren wrote:
>>>>>>> On 03/28/2016 11:05 AM, Eric Nelson wrote:

<snip>

>> Yes, but also that the host will cache blocks in the ums case, so
>> having the cache enabled will only slow things down slightly by
>> lots of memcpy's to cached blocks that won't be helpful.
>>
>> I think I was a bit flippant by including dfu in this statement,
>> since I haven't used it to access anything except SPI-NOR.
>>
>>> Eventually it seems better to keep the cache enabled for at least DFU to
>>> a filesystem (rather than raw block device) since presumably parsing the
>>> directory structure to write to a file for DFU would benefit from the
>>> cache just like anything else.
>>
>> I'm not in a position to comment about dfu.
> 
> For the record, I think this discussion is good and important, but not a
> blocker for getting the initial functionality in.
> 

I wholeheartedly agree.

I'm learning a lot and the code's better because of Stephen's feedback.

Regards,


Eric
Eric Nelson April 2, 2016, 2:19 p.m. UTC | #14
Hi Tom,

On 04/01/2016 06:59 PM, Tom Rini wrote:
> On Mon, Mar 28, 2016 at 10:05:44AM -0700, Eric Nelson wrote:
> 
>> Add a block device cache to speed up repeated reads of block devices by
>> various filesystems.
>>
>> This small amount of cache can dramatically speed up filesystem
>> operations by skipping repeated reads of common areas of a block
>> device (typically directory structures).
>>
>> This has shown to have some benefit on FAT filesystem operations of
>> loading a kernel and RAM disk, but more dramatic benefits on ext4
>> filesystems when the kernel and/or RAM disk are spread across
>> multiple extent header structures as described in commit fc0fc50.
>>
>> The cache is implemented through a minimal list (block_cache) maintained
>> in most-recently-used order and count of the current number of entries
>> (cache_count). It uses a maximum block count setting to prevent copies
>> of large block reads and an upper bound on the number of cached areas.
>>
>> The maximum number of entries in the cache defaults to 32 and the maximum
>> number of blocks per cache entry has a default of 2, which has shown to
>> produce the best results on testing of ext4 and FAT filesystems.
>>
>> The 'blkcache' command (enabled through CONFIG_CMD_BLOCK_CACHE) allows
>> changing these values and can be used to tune for a particular filesystem
>> layout.
>>
>> Signed-off-by: Eric Nelson <eric@nelint.com>
> 
> Applied to u-boot/master, thanks!
> 

Whoops. I have a couple of minor updates from Stephen's last review that
I'll forward shortly.
Eric Nelson April 2, 2016, 2:24 p.m. UTC | #15
Hi Stephen,

On 04/01/2016 07:07 PM, Stephen Warren wrote:
> On 04/01/2016 05:16 PM, Eric Nelson wrote:
>> On 04/01/2016 03:57 PM, Stephen Warren wrote:
>>> On 03/31/2016 02:24 PM, Eric Nelson wrote:
>>>> On 03/30/2016 02:57 PM, Stephen Warren wrote:
>>>>> On 03/30/2016 11:34 AM, Eric Nelson wrote:
>>>>>> On 03/30/2016 07:36 AM, Stephen Warren wrote:
>>>>>>> On 03/28/2016 11:05 AM, Eric Nelson wrote:
>>
>> <snip>
>>
>>>>>
>>>>> We could allocate the data storage for the block cache at the top
>>>>> of RAM
>>>>> before relocation, like many other things are allocated, and hence not
>>>>> use malloc() for that.
>>>>
>>>> Hmmm. We seem to have gone from a discussion about data structures to
>>>> type of allocation.
>>>>
>>>> I'm interested in seeing how that works. Can you provide hints about
>>>> what's doing this now?
>>>
>>> Something like common/board_f.c:reserve_mmu() and many other functions
>>> there. relocaddr starts at approximately the top of RAM, continually
>>> gets adjusted down as many static allocations are reserved, and
>>> eventually becomes the address that U-Boot is relocated to. Simply
>>> adding another entry into init_sequence_f[] for the disk cache might
>>> work.
>>>
>>
>> Thanks for the pointer. I'll review that when time permits.
>>
>> This would remove the opportunity to re-configure the cache though,
>> right?
> 
> Well, it would make it impossible to use less RAM. One could use more by
> having a mix of the initial static allocation plus some additional
> dynamic allocation, but that might get a bit painful to manage.
> 

This might not be too bad though. Even if we allocated 4x the current
defaults, we're only at ~64k.

> It might be interesting to use the MMU more and allow de-fragmentation
> of VA space. That is, assuming there's much more VA space than RAM, such
> as is true on current 64-bit architectures. Then I wouldn't dislike
> dynamic allocation so much:-)
> 

That's interesting, but probably more invasive than this patch set.

>> I'm not sure how important this feature is, and I think
>> only time and use will tell.
>>
>> I'd prefer to keep that ability at least for a cycle or two so that
>> I and others can test.
>>
>>>>>> While re-working the code, I also thought more about using an
>>>>>> array and
>>>>>> still don't see how the implementation doesn't get more complex.
>>>>>>
>>>>>> The key bit is that the list is implemented in MRU order so
>>>>>> invalidating the oldest is trivial.
>>>>>
>>>>> Yes, the MRU logic would make it more complex. Is that particularly
>>>>> useful, i.e. is it an intrinsic part of the speedup?
>>>>
>>>> It's not a question of speed with small numbers of entries. The code
>>>> to handle eviction would just be more complex.
>>>
>>> My thought was that if the eviction algorithm wasn't important (i.e.
>>> most of the speedup comes from having some (any) kind of cache, but the
>>> eviction algorithm makes little difference to the gain from having the
>>> cache), we could just drop MRU completely. If that's not possible, then
>>> indeed a list would make implementing MRU easier.
>>>
>>
>> How would we decide which block to discard? I haven't traced enough
>> to know what algorithm(s) might be best, but I can say that there's
>> a preponderance of repeated accesses to the last-accessed block,
>> especially in ext4.
> 
> Perhaps just keep an index into the array, use that index any time
> something is written to the cache, and then increment it each time.
> Probably not anywhere near as optimal as MRU/LRU though.

I see that Tom just applied V3, so I'd be interested in seeing
patches on top of that.

Regards,


Eric
Eric Nelson April 2, 2016, 2:37 p.m. UTC | #16
This set of updates addresses a number of small coding style
issues caught by Stephen Warren's review of the block cache
patch set.

Eric Nelson (3):
  cmd: blkcache: remove indentation from output of 'show'
  cmd: blkcache: simplify sub-command handling
  drivers: block: formatting: fix placement of function parameters

 cmd/blkcache.c | 16 +++++++---------
 include/blk.h  | 34 ++++++++++++++--------------------
 2 files changed, 21 insertions(+), 29 deletions(-)
diff mbox

Patch

diff --git a/cmd/Kconfig b/cmd/Kconfig
index 7cdff04..11106af 100644
--- a/cmd/Kconfig
+++ b/cmd/Kconfig
@@ -485,6 +485,17 @@  config SYS_AMBAPP_PRINT_ON_STARTUP
 	help
 	  Show AMBA Plug-n-Play information on startup.
 
+config CMD_BLOCK_CACHE
+	bool "blkcache - control and stats for block cache"
+	depends on BLOCK_CACHE
+	default y if BLOCK_CACHE
+	help
+	  Enable the blkcache command, which can be used to control the
+	  operation of the cache functions.
+	  This is most useful when fine-tuning the operation of the cache
+	  during development, but also allows the cache to be disabled when
+	  it might hurt performance (e.g. when using the ums command).
+
 config CMD_TIME
 	bool "time"
 	help
diff --git a/cmd/Makefile b/cmd/Makefile
index 7604621..ba04197 100644
--- a/cmd/Makefile
+++ b/cmd/Makefile
@@ -20,6 +20,7 @@  obj-$(CONFIG_SOURCE) += source.o
 obj-$(CONFIG_CMD_SOURCE) += source.o
 obj-$(CONFIG_CMD_BDI) += bdinfo.o
 obj-$(CONFIG_CMD_BEDBUG) += bedbug.o
+obj-$(CONFIG_CMD_BLOCK_CACHE) += blkcache.o
 obj-$(CONFIG_CMD_BMP) += bmp.o
 obj-$(CONFIG_CMD_BOOTEFI) += bootefi.o
 obj-$(CONFIG_CMD_BOOTMENU) += bootmenu.o
diff --git a/cmd/blkcache.c b/cmd/blkcache.c
new file mode 100644
index 0000000..725163a
--- /dev/null
+++ b/cmd/blkcache.c
@@ -0,0 +1,90 @@ 
+/*
+ * Copyright (C) Nelson Integration, LLC 2016
+ * Author: Eric Nelson<eric@nelint.com>
+ *
+ * SPDX-License-Identifier:	GPL-2.0+
+ *
+ */
+#include <config.h>
+#include <common.h>
+#include <malloc.h>
+#include <part.h>
+
+static int blkc_show(cmd_tbl_t *cmdtp, int flag,
+		     int argc, char * const argv[])
+{
+	struct block_cache_stats stats;
+	blkcache_stats(&stats);
+
+	printf("    hits: %u\n"
+	       "    misses: %u\n"
+	       "    entries: %u\n"
+	       "    max blocks/entry: %u\n"
+	       "    max cache entries: %u\n",
+	       stats.hits, stats.misses, stats.entries,
+	       stats.max_blocks_per_entry, stats.max_entries);
+	return 0;
+}
+
+static int blkc_configure(cmd_tbl_t *cmdtp, int flag,
+			  int argc, char * const argv[])
+{
+	unsigned blocks_per_entry, max_entries;
+	if (argc != 3)
+		return CMD_RET_USAGE;
+
+	blocks_per_entry = simple_strtoul(argv[1], 0, 0);
+	max_entries = simple_strtoul(argv[2], 0, 0);
+	blkcache_configure(blocks_per_entry, max_entries);
+	printf("changed to max of %u entries of %u blocks each\n",
+	       max_entries, blocks_per_entry);
+	return 0;
+}
+
+static cmd_tbl_t cmd_blkc_sub[] = {
+	U_BOOT_CMD_MKENT(show, 0, 0, blkc_show, "", ""),
+	U_BOOT_CMD_MKENT(configure, 3, 0, blkc_configure, "", ""),
+};
+
+static __maybe_unused void blkc_reloc(void)
+{
+	static int relocated;
+
+	if (!relocated) {
+		fixup_cmdtable(cmd_blkc_sub, ARRAY_SIZE(cmd_blkc_sub));
+		relocated = 1;
+	};
+}
+
+static int do_blkcache(cmd_tbl_t *cmdtp, int flag,
+		       int argc, char * const argv[])
+{
+	cmd_tbl_t *c;
+
+#ifdef CONFIG_NEEDS_MANUAL_RELOC
+	blkc_reloc();
+#endif
+	if (argc < 2)
+		return CMD_RET_USAGE;
+
+	/* Strip off leading argument */
+	argc--;
+	argv++;
+
+	c = find_cmd_tbl(argv[0], &cmd_blkc_sub[0], ARRAY_SIZE(cmd_blkc_sub));
+
+	if (c)
+		return c->cmd(cmdtp, flag, argc, argv);
+	else
+		return CMD_RET_USAGE;
+
+	return 0;
+}
+
+U_BOOT_CMD(
+	blkcache, 4, 0, do_blkcache,
+	"block cache diagnostics and control",
+	"show - show and reset statistics\n"
+	"blkcache configure blocks entries\n"
+);
+
diff --git a/disk/part.c b/disk/part.c
index 67d98fe..0aff954 100644
--- a/disk/part.c
+++ b/disk/part.c
@@ -268,6 +268,8 @@  void part_init(struct blk_desc *dev_desc)
 	const int n_ents = ll_entry_count(struct part_driver, part_driver);
 	struct part_driver *entry;
 
+	blkcache_invalidate(dev_desc->if_type, dev_desc->devnum);
+
 	dev_desc->part_type = PART_TYPE_UNKNOWN;
 	for (entry = drv; entry != drv + n_ents; entry++) {
 		int ret;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f35c4d4..fcc9ccd 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -18,3 +18,12 @@  config DISK
 	  types can use this, such as AHCI/SATA. It does not provide any standard
 	  operations at present. The block device interface has not been converted
 	  to driver model.
+
+config BLOCK_CACHE
+	bool "Use block device cache"
+	default n
+	help
+	  This option enables a disk-block cache for all block devices.
+	  This is most useful when accessing filesystems under U-Boot since
+	  it will prevent repeated reads from directory structures and other
+	  filesystem data structures.
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index b5c7ae1..b4cbb09 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -24,3 +24,4 @@  obj-$(CONFIG_IDE_SIL680) += sil680.o
 obj-$(CONFIG_SANDBOX) += sandbox.o
 obj-$(CONFIG_SCSI_SYM53C8XX) += sym53c8xx.o
 obj-$(CONFIG_SYSTEMACE) += systemace.o
+obj-$(CONFIG_BLOCK_CACHE) += blkcache.o
diff --git a/drivers/block/blk-uclass.c b/drivers/block/blk-uclass.c
index 49df2a6..617db22 100644
--- a/drivers/block/blk-uclass.c
+++ b/drivers/block/blk-uclass.c
@@ -80,11 +80,20 @@  unsigned long blk_dread(struct blk_desc *block_dev, lbaint_t start,
 {
 	struct udevice *dev = block_dev->bdev;
 	const struct blk_ops *ops = blk_get_ops(dev);
+	ulong blks_read;
 
 	if (!ops->read)
 		return -ENOSYS;
 
-	return ops->read(dev, start, blkcnt, buffer);
+	if (blkcache_read(block_dev->if_type, block_dev->devnum,
+			  start, blkcnt, block_dev->blksz, buffer))
+		return blkcnt;
+	blks_read = ops->read(dev, start, blkcnt, buffer);
+	if (blks_read == blkcnt)
+		blkcache_fill(block_dev->if_type, block_dev->devnum,
+			      start, blkcnt, block_dev->blksz, buffer);
+
+	return blks_read;
 }
 
 unsigned long blk_dwrite(struct blk_desc *block_dev, lbaint_t start,
@@ -96,6 +105,7 @@  unsigned long blk_dwrite(struct blk_desc *block_dev, lbaint_t start,
 	if (!ops->write)
 		return -ENOSYS;
 
+	blkcache_invalidate(block_dev->if_type, block_dev->devnum);
 	return ops->write(dev, start, blkcnt, buffer);
 }
 
@@ -108,6 +118,7 @@  unsigned long blk_derase(struct blk_desc *block_dev, lbaint_t start,
 	if (!ops->erase)
 		return -ENOSYS;
 
+	blkcache_invalidate(block_dev->if_type, block_dev->devnum);
 	return ops->erase(dev, start, blkcnt);
 }
 
diff --git a/drivers/block/blkcache.c b/drivers/block/blkcache.c
new file mode 100644
index 0000000..46a6059
--- /dev/null
+++ b/drivers/block/blkcache.c
@@ -0,0 +1,173 @@ 
+/*
+ * Copyright (C) Nelson Integration, LLC 2016
+ * Author: Eric Nelson <eric@nelint.com>
+ *
+ * SPDX-License-Identifier:	GPL-2.0+
+ *
+ */
+#include <config.h>
+#include <common.h>
+#include <malloc.h>
+#include <part.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+
+struct block_cache_node {
+	struct list_head lh;
+	int iftype;
+	int devnum;
+	lbaint_t start;
+	lbaint_t blkcnt;
+	unsigned long blksz;
+	char *cache;
+};
+
+static LIST_HEAD(block_cache);
+
+static struct block_cache_stats _stats = {
+	.max_blocks_per_entry = 2,
+	.max_entries = 32
+};
+
+static struct block_cache_node *cache_find(int iftype, int devnum,
+					   lbaint_t start, lbaint_t blkcnt,
+					   unsigned long blksz)
+{
+	struct block_cache_node *node;
+
+	list_for_each_entry(node, &block_cache, lh)
+		if ((node->iftype == iftype) &&
+		    (node->devnum == devnum) &&
+		    (node->blksz == blksz) &&
+		    (node->start <= start) &&
+		    (node->start + node->blkcnt >= start + blkcnt)) {
+			if (block_cache.next != &node->lh) {
+				/* maintain MRU ordering */
+				list_del(&node->lh);
+				list_add(&node->lh, &block_cache);
+			}
+			return node;
+		}
+	return 0;
+}
+
+int blkcache_read(int iftype, int devnum,
+		  lbaint_t start, lbaint_t blkcnt,
+		  unsigned long blksz, void *buffer)
+{
+	struct block_cache_node *node = cache_find(iftype, devnum, start,
+						   blkcnt, blksz);
+	if (node) {
+		const char *src = node->cache + (start - node->start) * blksz;
+		memcpy(buffer, src, blksz * blkcnt);
+		debug("hit: start " LBAF ", count " LBAFU "\n",
+		      start, blkcnt);
+		++_stats.hits;
+		return 1;
+	}
+
+	debug("miss: start " LBAF ", count " LBAFU "\n",
+	      start, blkcnt);
+	++_stats.misses;
+	return 0;
+}
+
+void blkcache_fill(int iftype, int devnum,
+		   lbaint_t start, lbaint_t blkcnt,
+		   unsigned long blksz, void const *buffer)
+{
+	lbaint_t bytes;
+	struct block_cache_node *node;
+
+	/* don't cache big stuff */
+	if (blkcnt > _stats.max_blocks_per_entry)
+		return;
+
+	if (_stats.max_entries == 0)
+		return;
+
+	bytes = blksz * blkcnt;
+	if (_stats.max_entries <= _stats.entries) {
+		/* pop LRU */
+		node = (struct block_cache_node *)block_cache.prev;
+		list_del(&node->lh);
+		_stats.entries--;
+		debug("drop: start " LBAF ", count " LBAFU "\n",
+		      node->start, node->blkcnt);
+		if (node->blkcnt * node->blksz < bytes) {
+			free(node->cache);
+			node->cache = 0;
+		}
+	} else {
+		node = malloc(sizeof(*node));
+		if (!node)
+			return;
+		node->cache = 0;
+	}
+
+	if (!node->cache) {
+		node->cache = malloc(bytes);
+		if (!node->cache) {
+			free(node);
+			return;
+		}
+	}
+
+	debug("fill: start " LBAF ", count " LBAFU "\n",
+	      start, blkcnt);
+
+	node->iftype = iftype;
+	node->devnum = devnum;
+	node->start = start;
+	node->blkcnt = blkcnt;
+	node->blksz = blksz;
+	memcpy(node->cache, buffer, bytes);
+	list_add(&node->lh, &block_cache);
+	_stats.entries++;
+}
+
+void blkcache_invalidate(int iftype, int devnum)
+{
+	struct list_head *entry, *n;
+	struct block_cache_node *node;
+
+	list_for_each_safe(entry, n, &block_cache) {
+		node = (struct block_cache_node *)entry;
+		if ((node->iftype == iftype) &&
+		    (node->devnum == devnum)) {
+			list_del(entry);
+			free(node->cache);
+			free(node);
+			--_stats.entries;
+		}
+	}
+}
+
+void blkcache_configure(unsigned blocks, unsigned entries)
+{
+	struct block_cache_node *node;
+	if ((blocks != _stats.max_blocks_per_entry) ||
+	    (entries != _stats.max_entries)) {
+		/* invalidate cache */
+		while (!list_empty(&block_cache)) {
+			node = (struct block_cache_node *)block_cache.next;
+			list_del(&node->lh);
+			free(node->cache);
+			free(node);
+		}
+		_stats.entries = 0;
+	}
+
+	_stats.max_blocks_per_entry = blocks;
+	_stats.max_entries = entries;
+
+	_stats.hits = 0;
+	_stats.misses = 0;
+}
+
+void blkcache_stats(struct block_cache_stats *stats)
+{
+	memcpy(stats, &_stats, sizeof(*stats));
+	_stats.hits = 0;
+	_stats.misses = 0;
+}
diff --git a/include/blk.h b/include/blk.h
index e83c144..263a791 100644
--- a/include/blk.h
+++ b/include/blk.h
@@ -83,6 +83,97 @@  struct blk_desc {
 #define PAD_TO_BLOCKSIZE(size, blk_desc) \
 	(PAD_SIZE(size, blk_desc->blksz))
 
+#ifdef CONFIG_BLOCK_CACHE
+/**
+ * blkcache_read() - attempt to read a set of blocks from cache
+ *
+ * @param iftype - IF_TYPE_x for type of device
+ * @param dev - device index of particular type
+ * @param start - starting block number
+ * @param blkcnt - number of blocks to read
+ * @param blksz - size in bytes of each block
+ * @param buffer - buffer to contain cached data
+ *
+ * @return - '1' if block returned from cache, '0' otherwise.
+ */
+int blkcache_read
+	(int iftype, int dev,
+	 lbaint_t start, lbaint_t blkcnt,
+	 unsigned long blksz, void *buffer);
+
+/**
+ * blkcache_fill() - make data read from a block device available
+ * to the block cache
+ *
+ * @param iftype - IF_TYPE_x for type of device
+ * @param dev - device index of particular type
+ * @param start - starting block number
+ * @param blkcnt - number of blocks available
+ * @param blksz - size in bytes of each block
+ * @param buffer - buffer containing data to cache
+ *
+ */
+void blkcache_fill
+	(int iftype, int dev,
+	 lbaint_t start, lbaint_t blkcnt,
+	 unsigned long blksz, void const *buffer);
+
+/**
+ * blkcache_invalidate() - discard the cache for a set of blocks
+ * because of a write or device (re)initialization.
+ *
+ * @param iftype - IF_TYPE_x for type of device
+ * @param dev - device index of particular type
+ */
+void blkcache_invalidate
+	(int iftype, int dev);
+
+/**
+ * blkcache_configure() - configure block cache
+ *
+ * @param blocks - maximum blocks per entry
+ * @param entries - maximum entries in cache
+ */
+void blkcache_configure(unsigned blocks, unsigned entries);
+
+/*
+ * statistics of the block cache
+ */
+struct block_cache_stats {
+	unsigned hits;
+	unsigned misses;
+	unsigned entries; /* current entry count */
+	unsigned max_blocks_per_entry;
+	unsigned max_entries;
+};
+
+/**
+ * blkcache_stats() - return statistics and reset
+ *
+ * @param stats - statistics are copied here
+ */
+void blkcache_stats(struct block_cache_stats *stats);
+
+#else
+
+static inline int blkcache_read
+	(int iftype, int dev,
+	 lbaint_t start, lbaint_t blkcnt,
+	 unsigned long blksz, void *buffer)
+{
+	return 0;
+}
+
+static inline void blkcache_fill
+	(int iftype, int dev,
+	 lbaint_t start, lbaint_t blkcnt,
+	 unsigned long blksz, void const *buffer) {}
+
+static inline void blkcache_invalidate
+	(int iftype, int dev) {}
+
+#endif
+
 #ifdef CONFIG_BLK
 struct udevice;
 
@@ -224,23 +315,35 @@  int blk_unbind_all(int if_type);
 static inline ulong blk_dread(struct blk_desc *block_dev, lbaint_t start,
 			      lbaint_t blkcnt, void *buffer)
 {
+	ulong blks_read;
+	if (blkcache_read(block_dev->if_type, block_dev->devnum,
+			  start, blkcnt, block_dev->blksz, buffer))
+		return blkcnt;
+
 	/*
 	 * We could check if block_read is NULL and return -ENOSYS. But this
 	 * bloats the code slightly (cause some board to fail to build), and
 	 * it would be an error to try an operation that does not exist.
 	 */
-	return block_dev->block_read(block_dev, start, blkcnt, buffer);
+	blks_read = block_dev->block_read(block_dev, start, blkcnt, buffer);
+	if (blks_read == blkcnt)
+		blkcache_fill(block_dev->if_type, block_dev->devnum,
+			      start, blkcnt, block_dev->blksz, buffer);
+
+	return blks_read;
 }
 
 static inline ulong blk_dwrite(struct blk_desc *block_dev, lbaint_t start,
 			       lbaint_t blkcnt, const void *buffer)
 {
+	blkcache_invalidate(block_dev->if_type, block_dev->devnum);
 	return block_dev->block_write(block_dev, start, blkcnt, buffer);
 }
 
 static inline ulong blk_derase(struct blk_desc *block_dev, lbaint_t start,
 			       lbaint_t blkcnt)
 {
+	blkcache_invalidate(block_dev->if_type, block_dev->devnum);
 	return block_dev->block_erase(block_dev, start, blkcnt);
 }
 #endif /* !CONFIG_BLK */