Patchwork [RFC,v2,05/10] vfs: introduce one hash table

login
register
mail settings
Submitter Zhiyong Wu
Date Sept. 23, 2012, 12:56 p.m.
Message ID <1348404995-14372-6-git-send-email-zwu.kernel@gmail.com>
Download mbox | patch
Permalink /patch/186224/
State Not Applicable
Headers show

Comments

Zhiyong Wu - Sept. 23, 2012, 12:56 p.m.
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Adds a hash table structure which contains
a lot of hash list and is used to efficiently
look up the data temperature of a file or its
ranges.
  In each hash list of hash table, the hash node
will keep track of temperature info.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/hot_tracking.c            |   77 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/hot_tracking.h |   35 +++++++++++++++++++
 2 files changed, 110 insertions(+), 2 deletions(-)
Ram Pai - Sept. 25, 2012, 9:54 a.m.
On Sun, Sep 23, 2012 at 08:56:30PM +0800, zwu.kernel@gmail.com wrote:
> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> 
>   Adds a hash table structure which contains
> a lot of hash list and is used to efficiently
> look up the data temperature of a file or its
> ranges.
>   In each hash list of hash table, the hash node
> will keep track of temperature info.
> 
> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> ---
>  fs/hot_tracking.c            |   77 ++++++++++++++++++++++++++++++++++++++++-
>  include/linux/hot_tracking.h |   35 +++++++++++++++++++
>  2 files changed, 110 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
> index fa89f70..5f96442 100644
> --- a/fs/hot_tracking.c
> +++ b/fs/hot_tracking.c
> @@ -16,6 +16,7 @@
>  #include <linux/module.h>
>  #include <linux/spinlock.h>
>  #include <linux/hardirq.h>
> +#include <linux/hash.h>
>  #include <linux/fs.h>
>  #include <linux/blkdev.h>
>  #include <linux/types.h>
> @@ -24,6 +25,9 @@

...snip...

> +/* Hash list heads for hot hash table */
> +struct hot_hash_head {
> +	struct hlist_head hashhead;
> +	rwlock_t rwlock;
> +	u32 temperature;
> +};
> +
> +/* Nodes stored in each hash list of hash table */
> +struct hot_hash_node {
> +	struct hlist_node hashnode;
> +	struct list_head node;
> +	struct hot_freq_data *hot_freq_data;
> +	struct hot_hash_head *hlist;
> +	spinlock_t lock; /* protects hlist */
> +
> +	/*
> +	 * number of references to this node
> +	 * equals 1 (hashlist entry)
> +	 */
> +	struct kref refs;
> +};

Dont see why you need yet another datastructure to hash the inode_item
and the range_item into a hash list.  You can just add another
hlist_node in the inode_item and range_item. This field can be then used
to link into the corresponding hash list.

You can use the container_of() get to the inode_item or the range_item
using the hlist_node field. 

You can thus eliminate a lot of code.

> +
>  /* An item representing an inode and its access frequency */
>  struct hot_inode_item {
>  	/* node for hot_inode_tree rb_tree */
> @@ -68,6 +93,8 @@ struct hot_inode_item {
>  	spinlock_t lock;
>  	/* prevents kfree */
>  	struct kref refs;
> +	/* hashlist node for this inode */
> +	struct hot_hash_node *heat_node;

this can be just
	struct hlist_node head_node; /* lookup hot_inode hash list */

Use this field to link it into the corresponding hashlist.

>  };
> 
this can be just 
>  /*
> @@ -91,6 +118,8 @@ struct hot_range_item {
>  	spinlock_t lock;
>  	/* prevents kfree */
>  	struct kref refs;
> +	/* hashlist node for this range */
> +	struct hot_hash_node *heat_node;

this can be just 
	struct hlist_node head_node; /* lookup hot_range hash list */


>  };
> 
>  struct hot_info {
> @@ -98,6 +127,12 @@ struct hot_info {
> 
>  	/* red-black tree that keeps track of fs-wide hot data */
>  	struct hot_inode_tree hot_inode_tree;
> +
> +	/* hash map of inode temperature */
> +	struct hot_hash_head heat_inode_hl[HEAT_HASH_SIZE];
> +
> +	/* hash map of range temperature */
> +	struct hot_hash_head heat_range_hl[HEAT_HASH_SIZE];
>  };
> 
>  #endif  /* _LINUX_HOTTRACK_H */

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhiyong Wu - Sept. 26, 2012, 4:08 a.m.
On Tue, Sep 25, 2012 at 5:54 PM, Ram Pai <linuxram@us.ibm.com> wrote:
> On Sun, Sep 23, 2012 at 08:56:30PM +0800, zwu.kernel@gmail.com wrote:
>> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>
>>   Adds a hash table structure which contains
>> a lot of hash list and is used to efficiently
>> look up the data temperature of a file or its
>> ranges.
>>   In each hash list of hash table, the hash node
>> will keep track of temperature info.
>>
>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>> ---
>>  fs/hot_tracking.c            |   77 ++++++++++++++++++++++++++++++++++++++++-
>>  include/linux/hot_tracking.h |   35 +++++++++++++++++++
>>  2 files changed, 110 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
>> index fa89f70..5f96442 100644
>> --- a/fs/hot_tracking.c
>> +++ b/fs/hot_tracking.c
>> @@ -16,6 +16,7 @@
>>  #include <linux/module.h>
>>  #include <linux/spinlock.h>
>>  #include <linux/hardirq.h>
>> +#include <linux/hash.h>
>>  #include <linux/fs.h>
>>  #include <linux/blkdev.h>
>>  #include <linux/types.h>
>> @@ -24,6 +25,9 @@
>
> ...snip...
>
>> +/* Hash list heads for hot hash table */
>> +struct hot_hash_head {
>> +     struct hlist_head hashhead;
>> +     rwlock_t rwlock;
>> +     u32 temperature;
>> +};
>> +
>> +/* Nodes stored in each hash list of hash table */
>> +struct hot_hash_node {
>> +     struct hlist_node hashnode;
>> +     struct list_head node;
>> +     struct hot_freq_data *hot_freq_data;
>> +     struct hot_hash_head *hlist;
>> +     spinlock_t lock; /* protects hlist */
>> +
>> +     /*
>> +      * number of references to this node
>> +      * equals 1 (hashlist entry)
>> +      */
>> +     struct kref refs;
>> +};
>
> Dont see why you need yet another datastructure to hash the inode_item
> and the range_item into a hash list.  You can just add another
If there's such one structure, we can rapidly know which hash bucket
one inode_item is currently linking to via its hlist field.
This is useful if one inode_item corresponding to one file need to
move from one bucket to another bucket when this file get cold or
hotter.
Does it make sense to you?

> hlist_node in the inode_item and range_item. This field can be then used
> to link into the corresponding hash list.
>
> You can use the container_of() get to the inode_item or the range_item
> using the hlist_node field.
>
> You can thus eliminate a lot of code.
As what i said above, i don't think it can eliminate a lo of code.
>
>> +
>>  /* An item representing an inode and its access frequency */
>>  struct hot_inode_item {
>>       /* node for hot_inode_tree rb_tree */
>> @@ -68,6 +93,8 @@ struct hot_inode_item {
>>       spinlock_t lock;
>>       /* prevents kfree */
>>       struct kref refs;
>> +     /* hashlist node for this inode */
>> +     struct hot_hash_node *heat_node;
>
> this can be just
>         struct hlist_node head_node; /* lookup hot_inode hash list */
>
> Use this field to link it into the corresponding hashlist.
>
>>  };
>>
> this can be just
>>  /*
>> @@ -91,6 +118,8 @@ struct hot_range_item {
>>       spinlock_t lock;
>>       /* prevents kfree */
>>       struct kref refs;
>> +     /* hashlist node for this range */
>> +     struct hot_hash_node *heat_node;
>
> this can be just
>         struct hlist_node head_node; /* lookup hot_range hash list */
>
>
>>  };
>>
>>  struct hot_info {
>> @@ -98,6 +127,12 @@ struct hot_info {
>>
>>       /* red-black tree that keeps track of fs-wide hot data */
>>       struct hot_inode_tree hot_inode_tree;
>> +
>> +     /* hash map of inode temperature */
>> +     struct hot_hash_head heat_inode_hl[HEAT_HASH_SIZE];
>> +
>> +     /* hash map of range temperature */
>> +     struct hot_hash_head heat_range_hl[HEAT_HASH_SIZE];
>>  };
>>
>>  #endif  /* _LINUX_HOTTRACK_H */
>
Dave Chinner - Sept. 27, 2012, 3:43 a.m.
On Sun, Sep 23, 2012 at 08:56:30PM +0800, zwu.kernel@gmail.com wrote:
> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> 
>   Adds a hash table structure which contains
> a lot of hash list and is used to efficiently
> look up the data temperature of a file or its
> ranges.
>   In each hash list of hash table, the hash node
> will keep track of temperature info.

So, let me see if I've got the relationship straight:

- sb->s_hot_info.hot_inode_tree indexes hot_inode_items, one per inode

- hot_inode_item contains access frequency data for that inode

- hot_inode_item holds a heat hash node to index the access
  frequency data for that inode

- hot_inode_item.hot_range_tree indexes hot_range_items for that inode

- hot_range_item contains access frequency data for that range

- hot_range_item holds a heat hash node to index the access
  frequency data for that range

- sb->s_hot_info.heat_inode_hl indexes per-inode heat hash nodes

- sb->s_hot_info.heat_range_hl indexes per-range heat hash nodes

How about some ascii art? :) Just looking at the hot inode item case
(the range item case is the same pattern, though), we have:


heat_inode_hl		hot_inode_tree
    |			      |
    |			      V
    |		+-------hot_inode_item-------+
+---+		|	frequency data       |
|		V            ^               V
| ...<--hot_inode_item-->... |	  ...<--hot_inode_item-->....
|	frequency data	     |		frequency data
|		^	     |		     ^
|		|	     |		     |
|		|	     |		     |
+------>hot_hash_node-->hot_hash_node-->hot_hash_node-->....


There's no actual data stored in the hot_hash_node, just pointer
back to the frequency data, a hlist_node and a pointer to the
hashlist head. IOWs, I agree with Ram that this does not need to
exist and just embedding a hlist_node inside the hot_inode_item is
all that is needed. i.e:

heat_inode_hl		hot_inode_tree
    |			      |
    |			      V
    |		+-------hot_inode_item-------+
    |		|	frequency data       |
+---+		|	hlist_node           |
|		V            ^ |             V
| ...<--hot_inode_item-->... | |  ...<--hot_inode_item-->....
|	frequency data	     | |	frequency data
+------>hlist_node-----------+ +------->hlist_node--->.....

There's no need for separate allocations, initialisations, locks and
reference counting - all that is already in the hot_inode_item. The
items have the same lifecycle limitations - a hot_hash_node must be
torn down before the frequency data it points to is freed. Finally,
there's no difference in how you move it between lists.

Indeed, calling it a hash is wrong - there's not hashing at all
- it keeping an array of list where each entry corresponds to a
specific temperature. It is a *heat map*, not a hash list. i.e.
inode_heat_map, not heat_inode_hl. HEAT_MAP_SIZE, not HASH_SIZE.

As it is, there aren't any users of the heat maps that are generated
in this patch set - it's not even exported to userspace or to
debugfs, so I'm not sure how it will be used yet. How are these heat
maps going to be used by filesystems, Zhi?

Cheers,

Dave.
Zhiyong Wu - Sept. 27, 2012, 6:23 a.m.
On Thu, Sep 27, 2012 at 11:43 AM, Dave Chinner <david@fromorbit.com> wrote:
> On Sun, Sep 23, 2012 at 08:56:30PM +0800, zwu.kernel@gmail.com wrote:
>> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>
>>   Adds a hash table structure which contains
>> a lot of hash list and is used to efficiently
>> look up the data temperature of a file or its
>> ranges.
>>   In each hash list of hash table, the hash node
>> will keep track of temperature info.
>
> So, let me see if I've got the relationship straight:
>
> - sb->s_hot_info.hot_inode_tree indexes hot_inode_items, one per inode
>
> - hot_inode_item contains access frequency data for that inode
>
> - hot_inode_item holds a heat hash node to index the access
>   frequency data for that inode
>
> - hot_inode_item.hot_range_tree indexes hot_range_items for that inode
>
> - hot_range_item contains access frequency data for that range
>
> - hot_range_item holds a heat hash node to index the access
>   frequency data for that range
>
> - sb->s_hot_info.heat_inode_hl indexes per-inode heat hash nodes
>
> - sb->s_hot_info.heat_range_hl indexes per-range heat hash nodes
Correct.
>
> How about some ascii art? :) Just looking at the hot inode item case
> (the range item case is the same pattern, though), we have:
>
>
> heat_inode_hl           hot_inode_tree
>     |                         |
>     |                         V
>     |           +-------hot_inode_item-------+
> +---+           |       frequency data       |
> |               V            ^               V
> | ...<--hot_inode_item-->... |    ...<--hot_inode_item-->....
> |       frequency data       |          frequency data
> |               ^            |               ^
> |               |            |               |
> |               |            |               |
> +------>hot_hash_node-->hot_hash_node-->hot_hash_node-->....
Great, can we put them in hot_tracking.txt in Documentation?
>
>
> There's no actual data stored in the hot_hash_node, just pointer
> back to the frequency data, a hlist_node and a pointer to the
> hashlist head. IOWs, I agree with Ram that this does not need to
> exist and just embedding a hlist_node inside the hot_inode_item is
> all that is needed. i.e:
>
> heat_inode_hl           hot_inode_tree
>     |                         |
>     |                         V
>     |           +-------hot_inode_item-------+
>     |           |       frequency data       |
> +---+           |       hlist_node           |
> |               V            ^ |             V
> | ...<--hot_inode_item-->... | |  ...<--hot_inode_item-->....
> |       frequency data       | |        frequency data
> +------>hlist_node-----------+ +------->hlist_node--->.....
>
> There's no need for separate allocations, initialisations, locks and
> reference counting - all that is already in the hot_inode_item. The
> items have the same lifecycle limitations - a hot_hash_node must be
> torn down before the frequency data it points to is freed. Finally,
> there's no difference in how you move it between lists.
How will you know if one hot_inode_item should be moved between lists
when its freq data is changed?
>
> Indeed, calling it a hash is wrong - there's not hashing at all
> - it keeping an array of list where each entry corresponds to a
> specific temperature. It is a *heat map*, not a hash list. i.e.
> inode_heat_map, not heat_inode_hl. HEAT_MAP_SIZE, not HASH_SIZE.
OK.
>
> As it is, there aren't any users of the heat maps that are generated
> in this patch set - it's not even exported to userspace or to
> debugfs, so I'm not sure how it will be used yet. How are these heat
> maps going to be used by filesystems, Zhi?
In hot_hash_calc_temperature(), you can see that one hot_inode or
hot_range's freq data will be distilled into one temperature value,
then it will be inserted to the heat map based on its temperature.
When the file corresponding to the inode or range got hotter or cold,
its location will be changed in the heat map based on its new
temperature in hot_hash_update_hash_table().

And the user will retrieve those freq data and temperature info via
debugfs or ioctl interfaces.
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
Dave Chinner - Sept. 27, 2012, 6:57 a.m.
On Thu, Sep 27, 2012 at 02:23:16PM +0800, Zhi Yong Wu wrote:
> On Thu, Sep 27, 2012 at 11:43 AM, Dave Chinner <david@fromorbit.com> wrote:
> > On Sun, Sep 23, 2012 at 08:56:30PM +0800, zwu.kernel@gmail.com wrote:
> >> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> >>
> >>   Adds a hash table structure which contains
> >> a lot of hash list and is used to efficiently
> >> look up the data temperature of a file or its
> >> ranges.
> >>   In each hash list of hash table, the hash node
> >> will keep track of temperature info.
> >
> > So, let me see if I've got the relationship straight:
> >
> > - sb->s_hot_info.hot_inode_tree indexes hot_inode_items, one per inode
> >
> > - hot_inode_item contains access frequency data for that inode
> >
> > - hot_inode_item holds a heat hash node to index the access
> >   frequency data for that inode
> >
> > - hot_inode_item.hot_range_tree indexes hot_range_items for that inode
> >
> > - hot_range_item contains access frequency data for that range
> >
> > - hot_range_item holds a heat hash node to index the access
> >   frequency data for that range
> >
> > - sb->s_hot_info.heat_inode_hl indexes per-inode heat hash nodes
> >
> > - sb->s_hot_info.heat_range_hl indexes per-range heat hash nodes
> Correct.
> >
> > How about some ascii art? :) Just looking at the hot inode item case
> > (the range item case is the same pattern, though), we have:
> >
> >
> > heat_inode_hl           hot_inode_tree
> >     |                         |
> >     |                         V
> >     |           +-------hot_inode_item-------+
> > +---+           |       frequency data       |
> > |               V            ^               V
> > | ...<--hot_inode_item-->... |    ...<--hot_inode_item-->....
> > |       frequency data       |          frequency data
> > |               ^            |               ^
> > |               |            |               |
> > |               |            |               |
> > +------>hot_hash_node-->hot_hash_node-->hot_hash_node-->....
> Great, can we put them in hot_tracking.txt in Documentation?
> >
> >
> > There's no actual data stored in the hot_hash_node, just pointer
> > back to the frequency data, a hlist_node and a pointer to the
> > hashlist head. IOWs, I agree with Ram that this does not need to
> > exist and just embedding a hlist_node inside the hot_inode_item is
> > all that is needed. i.e:
> >
> > heat_inode_hl           hot_inode_tree
> >     |                         |
> >     |                         V
> >     |           +-------hot_inode_item-------+
> >     |           |       frequency data       |
> > +---+           |       hlist_node           |
> > |               V            ^ |             V
> > | ...<--hot_inode_item-->... | |  ...<--hot_inode_item-->....
> > |       frequency data       | |        frequency data
> > +------>hlist_node-----------+ +------->hlist_node--->.....
> >
> > There's no need for separate allocations, initialisations, locks and
> > reference counting - all that is already in the hot_inode_item. The
> > items have the same lifecycle limitations - a hot_hash_node must be
> > torn down before the frequency data it points to is freed. Finally,
> > there's no difference in how you move it between lists.
> How will you know if one hot_inode_item should be moved between lists
> when its freq data is changed?

Record the current temperature in the frequency data, and if it
changes, change the list it is on.

> > Indeed, calling it a hash is wrong - there's not hashing at all
> > - it keeping an array of list where each entry corresponds to a
> > specific temperature. It is a *heat map*, not a hash list. i.e.
> > inode_heat_map, not heat_inode_hl. HEAT_MAP_SIZE, not HASH_SIZE.
> OK.
> >
> > As it is, there aren't any users of the heat maps that are generated
> > in this patch set - it's not even exported to userspace or to
> > debugfs, so I'm not sure how it will be used yet. How are these heat
> > maps going to be used by filesystems, Zhi?
> In hot_hash_calc_temperature(), you can see that one hot_inode or
> hot_range's freq data will be distilled into one temperature value,
> then it will be inserted to the heat map based on its temperature.
> When the file corresponding to the inode or range got hotter or cold,
> its location will be changed in the heat map based on its new
> temperature in hot_hash_update_hash_table().

Yes, but a hot_inode_item or hot_range_item can only have one
location in the heat map, right? So it doesn't need external
structure to point to the frequency data to track this....

> And the user will retrieve those freq data and temperature info via
> debugfs or ioctl interfaces.

Right - but that data is only extracted after an initial
hot_inode_tree lookup - The heat map itself is never directly used
for lookups. If it's not used for lookups based on temperature, why
is it needed?

Cheers,

Dave.
Zhiyong Wu - Sept. 27, 2012, 7:10 a.m.
On Thu, Sep 27, 2012 at 2:57 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Thu, Sep 27, 2012 at 02:23:16PM +0800, Zhi Yong Wu wrote:
>> On Thu, Sep 27, 2012 at 11:43 AM, Dave Chinner <david@fromorbit.com> wrote:
>> > On Sun, Sep 23, 2012 at 08:56:30PM +0800, zwu.kernel@gmail.com wrote:
>> >> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>> >>
>> >>   Adds a hash table structure which contains
>> >> a lot of hash list and is used to efficiently
>> >> look up the data temperature of a file or its
>> >> ranges.
>> >>   In each hash list of hash table, the hash node
>> >> will keep track of temperature info.
>> >
>> > So, let me see if I've got the relationship straight:
>> >
>> > - sb->s_hot_info.hot_inode_tree indexes hot_inode_items, one per inode
>> >
>> > - hot_inode_item contains access frequency data for that inode
>> >
>> > - hot_inode_item holds a heat hash node to index the access
>> >   frequency data for that inode
>> >
>> > - hot_inode_item.hot_range_tree indexes hot_range_items for that inode
>> >
>> > - hot_range_item contains access frequency data for that range
>> >
>> > - hot_range_item holds a heat hash node to index the access
>> >   frequency data for that range
>> >
>> > - sb->s_hot_info.heat_inode_hl indexes per-inode heat hash nodes
>> >
>> > - sb->s_hot_info.heat_range_hl indexes per-range heat hash nodes
>> Correct.
>> >
>> > How about some ascii art? :) Just looking at the hot inode item case
>> > (the range item case is the same pattern, though), we have:
>> >
>> >
>> > heat_inode_hl           hot_inode_tree
>> >     |                         |
>> >     |                         V
>> >     |           +-------hot_inode_item-------+
>> > +---+           |       frequency data       |
>> > |               V            ^               V
>> > | ...<--hot_inode_item-->... |    ...<--hot_inode_item-->....
>> > |       frequency data       |          frequency data
>> > |               ^            |               ^
>> > |               |            |               |
>> > |               |            |               |
>> > +------>hot_hash_node-->hot_hash_node-->hot_hash_node-->....
>> Great, can we put them in hot_tracking.txt in Documentation?
>> >
>> >
>> > There's no actual data stored in the hot_hash_node, just pointer
>> > back to the frequency data, a hlist_node and a pointer to the
>> > hashlist head. IOWs, I agree with Ram that this does not need to
>> > exist and just embedding a hlist_node inside the hot_inode_item is
>> > all that is needed. i.e:
>> >
>> > heat_inode_hl           hot_inode_tree
>> >     |                         |
>> >     |                         V
>> >     |           +-------hot_inode_item-------+
>> >     |           |       frequency data       |
>> > +---+           |       hlist_node           |
>> > |               V            ^ |             V
>> > | ...<--hot_inode_item-->... | |  ...<--hot_inode_item-->....
>> > |       frequency data       | |        frequency data
>> > +------>hlist_node-----------+ +------->hlist_node--->.....
>> >
>> > There's no need for separate allocations, initialisations, locks and
>> > reference counting - all that is already in the hot_inode_item. The
>> > items have the same lifecycle limitations - a hot_hash_node must be
>> > torn down before the frequency data it points to is freed. Finally,
>> > there's no difference in how you move it between lists.
>> How will you know if one hot_inode_item should be moved between lists
>> when its freq data is changed?
>
> Record the current temperature in the frequency data, and if it
I know how to do it, thanks.
> changes, change the list it is on.
>
>> > Indeed, calling it a hash is wrong - there's not hashing at all
>> > - it keeping an array of list where each entry corresponds to a
>> > specific temperature. It is a *heat map*, not a hash list. i.e.
>> > inode_heat_map, not heat_inode_hl. HEAT_MAP_SIZE, not HASH_SIZE.
>> OK.
>> >
>> > As it is, there aren't any users of the heat maps that are generated
>> > in this patch set - it's not even exported to userspace or to
>> > debugfs, so I'm not sure how it will be used yet. How are these heat
>> > maps going to be used by filesystems, Zhi?
>> In hot_hash_calc_temperature(), you can see that one hot_inode or
>> hot_range's freq data will be distilled into one temperature value,
>> then it will be inserted to the heat map based on its temperature.
>> When the file corresponding to the inode or range got hotter or cold,
>> its location will be changed in the heat map based on its new
>> temperature in hot_hash_update_hash_table().
>
> Yes, but a hot_inode_item or hot_range_item can only have one
> location in the heat map, right? So it doesn't need external
Yes.
> structure to point to the frequency data to track this....
OK.
>
>> And the user will retrieve those freq data and temperature info via
>> debugfs or ioctl interfaces.
>
> Right - but that data is only extracted after an initial
> hot_inode_tree lookup - The heat map itself is never directly used
> for lookups. If it's not used for lookups based on temperature, why
> is it needed?
You mean we don't need hot_inode_tree? You know, after those hook
functions collect the freq data for inode, they will store those raw
info in hot_inode_tree. One private kthread will iterate this tree to
distill those raw freq data into one temperatue value in [0 ~ 255],
then link the corresponding hot_inode_item in heat map.

>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com

Patch

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index fa89f70..5f96442 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -16,6 +16,7 @@ 
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
+#include <linux/hash.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/types.h>
@@ -24,6 +25,9 @@ 
 /* kmem_cache pointers for slab caches */
 static struct kmem_cache *hot_inode_item_cache;
 static struct kmem_cache *hot_range_item_cache;
+static struct kmem_cache *hot_hash_node_cache;
+
+static void hot_hash_node_init(void *_node);
 
 /*
  * Initialize the inode tree. Should be called for each new inode
@@ -57,6 +61,10 @@  void hot_rb_inode_item_init(void *_item)
 	memset(he, 0, sizeof(*he));
 	kref_init(&he->refs);
 	spin_lock_init(&he->lock);
+	he->heat_node = kmem_cache_alloc(hot_hash_node_cache,
+					GFP_KERNEL | GFP_NOFS);
+	hot_hash_node_init(he->heat_node);
+	he->heat_node->hot_freq_data = &he->hot_freq_data;
 	he->hot_freq_data.avg_delta_reads = (u64) -1;
 	he->hot_freq_data.avg_delta_writes = (u64) -1;
 	he->hot_freq_data.flags = FREQ_DATA_TYPE_INODE;
@@ -75,6 +83,10 @@  static void hot_rb_range_item_init(void *_item)
 	memset(hr, 0, sizeof(*hr));
 	kref_init(&hr->refs);
 	spin_lock_init(&hr->lock);
+	hr->heat_node = kmem_cache_alloc(hot_hash_node_cache,
+				GFP_KERNEL | GFP_NOFS);
+	hot_hash_node_init(hr->heat_node);
+	hr->heat_node->hot_freq_data = &hr->hot_freq_data;
 	hr->hot_freq_data.avg_delta_reads = (u64) -1;
 	hr->hot_freq_data.avg_delta_writes = (u64) -1;
 	hr->hot_freq_data.flags = FREQ_DATA_TYPE_RANGE;
@@ -105,6 +117,18 @@  inode_err:
 	return -ENOMEM;
 }
 
+static void hot_rb_inode_item_exit(void)
+{
+	if (hot_inode_item_cache)
+		kmem_cache_destroy(hot_inode_item_cache);
+}
+
+static void hot_rb_range_item_exit(void)
+{
+	if (hot_range_item_cache)
+		kmem_cache_destroy(hot_range_item_cache);
+}
+
 /*
  * Drops the reference out on hot_inode_item by one and free the structure
  * if the reference count hits zero
@@ -510,6 +534,48 @@  void hot_rb_update_freqs(struct inode *inode, u64 start,
 }
 
 /*
+ * Initialize hash node.
+ */
+static void hot_hash_node_init(void *_node)
+{
+	struct hot_hash_node *node = _node;
+
+	memset(node, 0, sizeof(*node));
+	INIT_HLIST_NODE(&node->hashnode);
+	node->hot_freq_data = NULL;
+	node->hlist = NULL;
+	spin_lock_init(&node->lock);
+	kref_init(&node->refs);
+}
+
+static int __init hot_hash_node_cache_init(void)
+{
+	hot_hash_node_cache = kmem_cache_create("hot_hash_node",
+				sizeof(struct hot_hash_node),
+				0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				hot_hash_node_init);
+	if (!hot_hash_node_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Initialize inode/range hash lists.
+ */
+static void hot_hash_table_init(struct hot_info *root)
+{
+	int i;
+	for (i = 0; i < HEAT_HASH_SIZE; i++) {
+		root->heat_inode_hl[i].temperature = i;
+		root->heat_range_hl[i].temperature = i;
+		rwlock_init(&root->heat_inode_hl[i].rwlock);
+		rwlock_init(&root->heat_range_hl[i].rwlock);
+	}
+}
+
+/*
  * Regular mount options parser for -hottrack option.
  * return false if no -hottrack is specified;
  * otherwise return true. And the -hottrack will be
@@ -544,13 +610,18 @@  bool hot_track_parse_options(char *options)
 }
 
 /*
- * Initialize kmem cache for hot_inode_item
- * and hot_range_item
+ * Initialize kmem cache for hot_inode_item,
+ * hot_range_item and hot_hash_node
  */
 void __init hot_track_cache_init(void)
 {
 	if (hot_rb_item_cache_init())
 		return;
+
+	if (hot_hash_node_cache_init()) {
+		hot_rb_inode_item_exit();
+		hot_rb_range_item_exit();
+	}
 }
 
 /*
@@ -560,10 +631,12 @@  void hot_track_init(struct super_block *sb, const char *name)
 {
 	sb->s_hotinfo.mount_opt |= HOT_MOUNT_HOT_TRACK;
 	hot_rb_inode_tree_init(&sb->s_hotinfo.hot_inode_tree);
+	hot_hash_table_init(&sb->s_hotinfo);
 }
 
 void hot_track_exit(struct super_block *sb)
 {
 	sb->s_hotinfo.mount_opt &= ~HOT_MOUNT_HOT_TRACK;
+	hot_hash_table_free(&sb->s_hotinfo);
 	hot_rb_inode_tree_free(&sb->s_hotinfo);
 }
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index bb2a41c..635ffb6 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -20,6 +20,9 @@ 
 #include <linux/rbtree.h>
 #include <linux/kref.h>
 
+#define HEAT_HASH_BITS 8
+#define HEAT_HASH_SIZE (1 << HEAT_HASH_BITS)
+
 /*
  * Flags for hot data tracking mount options.
  */
@@ -52,6 +55,28 @@  struct hot_freq_data {
 	u32 last_temperature;
 };
 
+/* Hash list heads for hot hash table */
+struct hot_hash_head {
+	struct hlist_head hashhead;
+	rwlock_t rwlock;
+	u32 temperature;
+};
+
+/* Nodes stored in each hash list of hash table */
+struct hot_hash_node {
+	struct hlist_node hashnode;
+	struct list_head node;
+	struct hot_freq_data *hot_freq_data;
+	struct hot_hash_head *hlist;
+	spinlock_t lock; /* protects hlist */
+
+	/*
+	 * number of references to this node
+	 * equals 1 (hashlist entry)
+	 */
+	struct kref refs;
+};
+
 /* An item representing an inode and its access frequency */
 struct hot_inode_item {
 	/* node for hot_inode_tree rb_tree */
@@ -68,6 +93,8 @@  struct hot_inode_item {
 	spinlock_t lock;
 	/* prevents kfree */
 	struct kref refs;
+	/* hashlist node for this inode */
+	struct hot_hash_node *heat_node;
 };
 
 /*
@@ -91,6 +118,8 @@  struct hot_range_item {
 	spinlock_t lock;
 	/* prevents kfree */
 	struct kref refs;
+	/* hashlist node for this range */
+	struct hot_hash_node *heat_node;
 };
 
 struct hot_info {
@@ -98,6 +127,12 @@  struct hot_info {
 
 	/* red-black tree that keeps track of fs-wide hot data */
 	struct hot_inode_tree hot_inode_tree;
+
+	/* hash map of inode temperature */
+	struct hot_hash_head heat_inode_hl[HEAT_HASH_SIZE];
+
+	/* hash map of range temperature */
+	struct hot_hash_head heat_range_hl[HEAT_HASH_SIZE];
 };
 
 #endif  /* _LINUX_HOTTRACK_H */