Patchwork [RFC,v3,12/13] vfs: add debugfs support

login
register
mail settings
Submitter Zhiyong Wu
Date Oct. 10, 2012, 10:07 a.m.
Message ID <1349863655-29320-13-git-send-email-zwu.kernel@gmail.com>
Download mbox | patch
Permalink /patch/190593/
State Not Applicable
Headers show

Comments

Zhiyong Wu - Oct. 10, 2012, 10:07 a.m.
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Add a /sys/kernel/debug/hot_track/<device_name>/ directory for each
volume that contains two files. The first, `inode_data', contains the
heat information for inodes that have been brought into the hot data map
structures. The second, `range_data', contains similar information for
subfile ranges.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/hot_tracking.c |  462 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/hot_tracking.h |   43 +++++
 2 files changed, 505 insertions(+), 0 deletions(-)
David Sterba - Oct. 10, 2012, 4:53 p.m.
On Wed, Oct 10, 2012 at 06:07:34PM +0800, zwu.kernel@gmail.com wrote:
> +static int hot_debugfs_log_init(struct debugfs_vol_data *data)
> +{
> +	int err = 0;
> +	struct lstring *debugfs_log = data->debugfs_log;
> +
> +	spin_lock(&data->log_lock);
> +	debugfs_log->str = vmalloc(INIT_LOG_ALLOC_SIZE);

vmalloc __might_sleep(), do the allocation outside of the lock and assign
the value inside. Also, you may use vzalloc and drop the following memset.

dmesg:

vfs: turning on hot data tracking
BUG: sleeping function called from invalid context at mm/slab.c:3220
in_atomic(): 1, irqs_disabled(): 0, pid: 3103, name: mc
1 lock held by mc/3103:
 #0:  (&(&inode_data->log_lock)->rlock){+.+.+.}, at: [<ffffffff8118c656>] __hot_debugfs_comm_read+0x216/0x280
Pid: 3103, comm: mc Tainted: G        W    3.6.0hottrack-default+ #208
Call Trace:
 [<ffffffff8108068c>] __might_sleep+0xfc/0x130
 [<ffffffff8114f7c1>] kmem_cache_alloc_trace+0xe1/0x270
 [<ffffffff81142005>] __get_vm_area_node+0x95/0x1a0
 [<ffffffff8108630f>] ? local_clock+0x6f/0x80
 [<ffffffff8118c660>] ? __hot_debugfs_comm_read+0x220/0x280
 [<ffffffff8114283d>] __vmalloc_node_range+0x6d/0x200
 [<ffffffff8118c660>] ? __hot_debugfs_comm_read+0x220/0x280
 [<ffffffff8118b810>] ? hot_debugfs_log+0xe0/0xe0
 [<ffffffff81142a05>] __vmalloc_node+0x35/0x40
 [<ffffffff8118c660>] ? __hot_debugfs_comm_read+0x220/0x280
 [<ffffffff81142bcc>] vmalloc+0x2c/0x30
 [<ffffffff8118c660>] __hot_debugfs_comm_read+0x220/0x280
 [<ffffffff8191fd58>] ? __do_page_fault+0x238/0x590
 [<ffffffff810ab825>] ? trace_hardirqs_on_caller+0x155/0x1d0
 [<ffffffff8138d01e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
 [<ffffffff8118c6d5>] __hot_debugfs_inode_read+0x15/0x20
 [<ffffffff8115a19b>] vfs_read+0xcb/0x190
 [<ffffffff8115a2c2>] sys_read+0x62/0xb0
 [<ffffffff8138d01e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
 [<ffffffff81924979>] system_call_fastpath+0x16/0x1b

> +	if (debugfs_log->str) {
> +		memset(debugfs_log->str, 0, INIT_LOG_ALLOC_SIZE);
> +		data->log_alloc_size = INIT_LOG_ALLOC_SIZE;
> +	} else {
> +		err = -ENOMEM;
> +	}
> +	spin_unlock(&data->log_lock);
> +
> +	return err;
> +}

I'm now playing with it, and haven't gone through the code yet,
david
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba - Oct. 10, 2012, 9:05 p.m.
On Wed, Oct 10, 2012 at 06:07:34PM +0800, zwu.kernel@gmail.com wrote:
> +static int hot_debugfs_copy(struct debugfs_vol_data *data, char *msg, int len)
> +{
> +	struct lstring *debugfs_log = data->debugfs_log;
> +	uint new_log_alloc_size;
> +	char *new_log;
> +	static char err_msg[] = "No more memory!\n";
> +
> +	if (len >= data->log_alloc_size - debugfs_log->len) {
> +		/*
> +		 * Not enough room in the log buffer for the new message.
> +		 * Allocate a bigger buffer.
> +		 */
> +		new_log_alloc_size = data->log_alloc_size + LOG_PAGE_SIZE;
> +		new_log = vmalloc(new_log_alloc_size);

This is also called with a spinlock from hot_debugfs_log, and it is a
frequent call. I found my testbox inaccessible after an hour of md5sums
on a partition when I tried to print contents of the /sys/debug files.

Serial console log filled with

[ 4886.141736] BUG: scheduling while atomic: mc/3176/0x00000004
[ 4886.148443] INFO: lockdep is turned off.
[ 4886.153424] Modules linked in: aoe dm_crypt loop btrfs
[ 4886.159705] Pid: 3176, comm: mc Tainted: G        W    3.6.0hottrack-default+ #209
[ 4886.168346] Call Trace:
[ 4886.171842]  [<ffffffff8107e528>] __schedule_bug+0x68/0x90
[ 4886.178427]  [<ffffffff81919b6c>] __schedule+0x73c/0x810
[ 4886.184809]  [<ffffffff81919cf9>] schedule+0x29/0x70
[ 4886.190838]  [<ffffffff8191729c>] schedule_timeout+0x17c/0x2f0
[ 4886.197732]  [<ffffffff8105c260>] ? del_timer+0x100/0x100
[ 4886.204198]  [<ffffffff8191b59b>] ? _raw_spin_unlock+0x2b/0x50
[ 4886.211099]  [<ffffffff8191742e>] schedule_timeout_uninterruptible+0x1e/0x20
[ 4886.219211]  [<ffffffff811150a9>] __alloc_pages_nodemask+0x839/0x9f0
[ 4886.226624]  [<ffffffff811428c3>] __vmalloc_node_range+0xf3/0x200
[ 4886.233788]  [<ffffffff8118b659>] ? hot_debugfs_copy+0x59/0x130
[ 4886.240774]  [<ffffffff81142a05>] __vmalloc_node+0x35/0x40
[ 4886.247335]  [<ffffffff8118b659>] ? hot_debugfs_copy+0x59/0x130
[ 4886.254331]  [<ffffffff81142bcc>] vmalloc+0x2c/0x30
[ 4886.260299]  [<ffffffff8118b659>] hot_debugfs_copy+0x59/0x130
[ 4886.267130]  [<ffffffff8118b7c6>] hot_debugfs_log+0x96/0xe0
[ 4886.273772]  [<ffffffff8118b86f>] __hot_debugfs_print_inode_freq_data+0x5f/0x80
[ 4886.282149]  [<ffffffff8118c58c>] __hot_debugfs_comm_read+0x14c/0x280
[ 4886.289653]  [<ffffffff8118b810>] ? hot_debugfs_log+0xe0/0xe0
[ 4886.296461]  [<ffffffff8118c6d5>] __hot_debugfs_inode_read+0x15/0x20
[ 4886.303853]  [<ffffffff8115a19b>] vfs_read+0xcb/0x190
[ 4886.309934]  [<ffffffff8115a2c2>] sys_read+0x62/0xb0
[ 4886.315927]  [<ffffffff8138d01e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 4886.323399]  [<ffffffff81924979>] system_call_fastpath+0x16/0x1b


david
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner - Oct. 15, 2012, 7:55 a.m.
On Wed, Oct 10, 2012 at 06:07:34PM +0800, zwu.kernel@gmail.com wrote:
> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> 
>   Add a /sys/kernel/debug/hot_track/<device_name>/ directory for each
> volume that contains two files. The first, `inode_data', contains the
> heat information for inodes that have been brought into the hot data map
> structures. The second, `range_data', contains similar information for
> subfile ranges.
....
> +	/* create debugfs range_data file */
> +	debugfs_range_entry = debugfs_create_file("range_data",
> +				S_IFREG | S_IRUSR | S_IWUSR | S_IRUGO,
> +				debugfs_volume_entry,
> +				(void *) range_data,
> +				&hot_debugfs_range_fops);

These should not be world readable. 0600 is probably the correct
permissions for them as we do not want random users to be able to
infer what files users are accessing from this information.

Cheers,

Dave.
Dave Chinner - Oct. 15, 2012, 8:04 a.m.
On Wed, Oct 10, 2012 at 06:07:34PM +0800, zwu.kernel@gmail.com wrote:
> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> 
>   Add a /sys/kernel/debug/hot_track/<device_name>/ directory for each
> volume that contains two files. The first, `inode_data', contains the
> heat information for inodes that have been brought into the hot data map
> structures. The second, `range_data', contains similar information for
> subfile ranges.
> 
> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> ---
>  fs/hot_tracking.c |  462 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/hot_tracking.h |   43 +++++
>  2 files changed, 505 insertions(+), 0 deletions(-)
.....
> +static int hot_debugfs_copy(struct debugfs_vol_data *data, char *msg, int len)
> +{
> +	struct lstring *debugfs_log = data->debugfs_log;
> +	uint new_log_alloc_size;
> +	char *new_log;
> +	static char err_msg[] = "No more memory!\n";
> +
> +	if (len >= data->log_alloc_size - debugfs_log->len) {
......
> +	}
> +
> +	memcpy(debugfs_log->str + debugfs_log->len, data->log_work_buff, len);
> +	debugfs_log->len += (unsigned long) len;
> +
> +	return len;
> +}
> +
> +/* Returns the number of bytes written to the log. */
> +static int hot_debugfs_log(struct debugfs_vol_data *data, const char *fmt, ...)
> +{
> +	struct lstring *debugfs_log = data->debugfs_log;
> +	va_list args;
> +	int len;
> +	static char trunc_msg[] =
> +			"The next message has been truncated.\n";
> +
> +	if (debugfs_log->str == NULL)
> +		return -1;
> +
> +	spin_lock(&data->log_lock);
> +
> +	va_start(args, fmt);
> +	len = vsnprintf(data->log_work_buff,
> +			sizeof(data->log_work_buff), fmt, args);
> +	va_end(args);
> +
> +	if (len >= sizeof(data->log_work_buff)) {
> +		hot_debugfs_copy(data, trunc_msg, sizeof(trunc_msg));
> +	}
> +
> +	len = hot_debugfs_copy(data, data->log_work_buff, len);
> +	spin_unlock(&data->log_lock);
> +
> +	return len;
> +}

Aren't you just recreating seq_printf() here? i.e. can't you replace
all this complexity with generic seq_file/seq_operations constructs?

Cheers,

Dave.
Zhiyong Wu - Oct. 15, 2012, 8:15 a.m.
On Mon, Oct 15, 2012 at 3:55 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Wed, Oct 10, 2012 at 06:07:34PM +0800, zwu.kernel@gmail.com wrote:
>> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>
>>   Add a /sys/kernel/debug/hot_track/<device_name>/ directory for each
>> volume that contains two files. The first, `inode_data', contains the
>> heat information for inodes that have been brought into the hot data map
>> structures. The second, `range_data', contains similar information for
>> subfile ranges.
> ....
>> +     /* create debugfs range_data file */
>> +     debugfs_range_entry = debugfs_create_file("range_data",
>> +                             S_IFREG | S_IRUSR | S_IWUSR | S_IRUGO,
>> +                             debugfs_volume_entry,
>> +                             (void *) range_data,
>> +                             &hot_debugfs_range_fops);
>
> These should not be world readable. 0600 is probably the correct
> permissions for them as we do not want random users to be able to
> infer what files users are accessing from this information.
Good catch, its mode should be S_IFREG | S_IRUSR | S_IWUSR

>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
Zhiyong Wu - Oct. 15, 2012, 8:47 a.m.
On Mon, Oct 15, 2012 at 4:04 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Wed, Oct 10, 2012 at 06:07:34PM +0800, zwu.kernel@gmail.com wrote:
>> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>
>>   Add a /sys/kernel/debug/hot_track/<device_name>/ directory for each
>> volume that contains two files. The first, `inode_data', contains the
>> heat information for inodes that have been brought into the hot data map
>> structures. The second, `range_data', contains similar information for
>> subfile ranges.
>>
>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>> ---
>>  fs/hot_tracking.c |  462 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/hot_tracking.h |   43 +++++
>>  2 files changed, 505 insertions(+), 0 deletions(-)
> .....
>> +static int hot_debugfs_copy(struct debugfs_vol_data *data, char *msg, int len)
>> +{
>> +     struct lstring *debugfs_log = data->debugfs_log;
>> +     uint new_log_alloc_size;
>> +     char *new_log;
>> +     static char err_msg[] = "No more memory!\n";
>> +
>> +     if (len >= data->log_alloc_size - debugfs_log->len) {
> ......
>> +     }
>> +
>> +     memcpy(debugfs_log->str + debugfs_log->len, data->log_work_buff, len);
>> +     debugfs_log->len += (unsigned long) len;
>> +
>> +     return len;
>> +}
>> +
>> +/* Returns the number of bytes written to the log. */
>> +static int hot_debugfs_log(struct debugfs_vol_data *data, const char *fmt, ...)
>> +{
>> +     struct lstring *debugfs_log = data->debugfs_log;
>> +     va_list args;
>> +     int len;
>> +     static char trunc_msg[] =
>> +                     "The next message has been truncated.\n";
>> +
>> +     if (debugfs_log->str == NULL)
>> +             return -1;
>> +
>> +     spin_lock(&data->log_lock);
>> +
>> +     va_start(args, fmt);
>> +     len = vsnprintf(data->log_work_buff,
>> +                     sizeof(data->log_work_buff), fmt, args);
>> +     va_end(args);
>> +
>> +     if (len >= sizeof(data->log_work_buff)) {
>> +             hot_debugfs_copy(data, trunc_msg, sizeof(trunc_msg));
>> +     }
>> +
>> +     len = hot_debugfs_copy(data, data->log_work_buff, len);
>> +     spin_unlock(&data->log_lock);
>> +
>> +     return len;
>> +}
>
> Aren't you just recreating seq_printf() here? i.e. can't you replace
> all this complexity with generic seq_file/seq_operations constructs?
It seems to be a good suggestion, let me try it. thanks.

>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com

Patch

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index fcde55e..60e93e6 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -20,6 +20,8 @@ 
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/types.h>
+#include <linux/debugfs.h> 
+#include <linux/vmalloc.h> 
 #include <linux/limits.h>
 #include "hot_tracking.h"
 
@@ -29,6 +31,13 @@  struct hot_info *global_hot_tracking_info;
 static struct kmem_cache *hot_inode_item_cachep;
 static struct kmem_cache *hot_range_item_cachep;
 
+/* list to keep track of each mounted volume's debugfs_vol_data */
+static struct list_head hot_debugfs_vol_data_list;
+/* lock for debugfs_vol_data_list */
+static spinlock_t hot_debugfs_data_list_lock;
+/* pointer to top level debugfs dentry */
+static struct dentry *hot_debugfs_root_dentry;
+
 /*
  * Initialize the inode tree. Should be called for each new inode
  * access or other user of the hot_inode interface.
@@ -706,6 +715,451 @@  static void hot_wq_exit(struct workqueue_struct *wq)
 	destroy_workqueue(wq);
 }
 
+/*
+ * Append @len bytes of @msg to the debugfs log of @data.
+ *
+ * When the free room in the log buffer is not larger than @len, the buffer
+ * is replaced by one that is LOG_PAGE_SIZE bytes bigger. If that allocation
+ * fails, as much of err_msg as still fits is appended instead.
+ *
+ * Returns @len on success, 0 if the buffer could not be grown.
+ *
+ * NOTE(review): hot_debugfs_log() calls this with data->log_lock held and
+ * vmalloc() may sleep; growing the buffer has to move out of atomic context.
+ */
+static int hot_debugfs_copy(struct debugfs_vol_data *data, char *msg, int len)
+{
+	struct lstring *debugfs_log = data->debugfs_log;
+	uint new_log_alloc_size;
+	char *new_log;
+	size_t room;
+	static char err_msg[] = "No more memory!\n";
+
+	if (len >= data->log_alloc_size - debugfs_log->len) {
+		/*
+		 * Not enough room in the log buffer for the new message.
+		 * Allocate a bigger buffer.
+		 */
+		new_log_alloc_size = data->log_alloc_size + LOG_PAGE_SIZE;
+		new_log = vmalloc(new_log_alloc_size);
+
+		if (new_log) {
+			memcpy(new_log, debugfs_log->str, debugfs_log->len);
+			memset(new_log + debugfs_log->len, 0,
+				new_log_alloc_size - debugfs_log->len);
+			vfree(debugfs_log->str);
+			debugfs_log->str = new_log;
+			data->log_alloc_size = new_log_alloc_size;
+		} else {
+			WARN_ON(1);
+			room = data->log_alloc_size - debugfs_log->len;
+			if (room) {
+				strlcpy(debugfs_log->str + debugfs_log->len,
+					err_msg, room);
+				/*
+				 * strlcpy() stores at most room - 1 characters
+				 * plus a NUL; only the characters belong to
+				 * the log length.
+				 */
+				debugfs_log->len += min_t(size_t,
+						sizeof(err_msg) - 1, room - 1);
+			}
+			return 0;
+		}
+	}
+
+	/* copy the caller's message, not unconditionally the work buffer */
+	memcpy(debugfs_log->str + debugfs_log->len, msg, len);
+	debugfs_log->len += (unsigned long) len;
+
+	return len;
+}
+
+/*
+ * Format a message into data->log_work_buff and append it to the debugfs
+ * log of @data. A message that does not fit into the work buffer is
+ * truncated and preceded by trunc_msg in the log.
+ *
+ * Returns the number of bytes written to the log for the message itself,
+ * or -1 if the log buffer has not been allocated.
+ */
+static int hot_debugfs_log(struct debugfs_vol_data *data, const char *fmt, ...)
+{
+	struct lstring *debugfs_log = data->debugfs_log;
+	va_list args;
+	int len;
+	static char trunc_msg[] =
+			"The next message has been truncated.\n";
+
+	if (debugfs_log->str == NULL)
+		return -1;
+
+	spin_lock(&data->log_lock);
+
+	va_start(args, fmt);
+	len = vsnprintf(data->log_work_buff,
+			sizeof(data->log_work_buff), fmt, args);
+	va_end(args);
+
+	if (len >= sizeof(data->log_work_buff)) {
+		/* sizeof() - 1: do not put the terminating NUL into the log */
+		hot_debugfs_copy(data, trunc_msg, sizeof(trunc_msg) - 1);
+		/*
+		 * vsnprintf() returns the length the full message would have
+		 * had; the work buffer only holds sizeof() - 1 characters, so
+		 * clamp before copying out of it.
+		 */
+		len = sizeof(data->log_work_buff) - 1;
+	}
+
+	len = hot_debugfs_copy(data, data->log_work_buff, len);
+	spin_unlock(&data->log_lock);
+
+	return len;
+}
+
+/*
+ * Initialize the log corresponding to a fs volume: allocate a zeroed
+ * buffer of INIT_LOG_ALLOC_SIZE bytes and attach it to data->debugfs_log.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer could not be allocated (the
+ * log is left untouched in that case).
+ */
+static int hot_debugfs_log_init(struct debugfs_vol_data *data)
+{
+	struct lstring *debugfs_log = data->debugfs_log;
+	char *buf;
+
+	/* vzalloc() may sleep, so allocate before taking the spinlock */
+	buf = vzalloc(INIT_LOG_ALLOC_SIZE);
+	if (!buf)
+		return -ENOMEM;
+
+	spin_lock(&data->log_lock);
+	debugfs_log->str = buf;
+	data->log_alloc_size = INIT_LOG_ALLOC_SIZE;
+	spin_unlock(&data->log_lock);
+
+	return 0;
+}
+
+/*
+ * Free the log buffer corresponding to a fs volume and reset the log to
+ * empty. The struct lstring itself is not freed here.
+ */
+static void hot_debugfs_log_exit(struct debugfs_vol_data *data)
+{
+	struct lstring *debugfs_log = data->debugfs_log;
+	char *buf;
+
+	/* detach the buffer under the lock ... */
+	spin_lock(&data->log_lock);
+	buf = debugfs_log->str;
+	debugfs_log->str = NULL;
+	debugfs_log->len = 0;
+	spin_unlock(&data->log_lock);
+
+	/* ... and release it afterwards, outside of atomic context */
+	vfree(buf);
+}
+
+/*
+ * debugfs open handler: hand the per-file data stored in the inode
+ * (inode->i_private) over to file->private_data. Always returns 0.
+ */
+static int __hot_debugfs_open(struct inode *inode, struct file *file)
+{
+	void *priv = inode->i_private;
+
+	if (priv != NULL)
+		file->private_data = priv;
+
+	return 0;
+}
+
+/*
+ * Append one line describing range @hr of inode @he (start, length, read and
+ * write counts, average deltas, last temperature) to the debugfs log of
+ * @data. @root is not used here.
+ */
+static void __hot_debugfs_print_range_freq_data(
+			struct hot_inode_item *he,
+			struct hot_range_item *hr,
+			struct debugfs_vol_data *data,
+			struct hot_info *root)
+{
+	struct hot_freq_data *freq_data;
+
+	freq_data = &hr->hot_range.hot_freq_data;
+
+	/* Always lock hot_inode_item first, then the range item */
+	spin_lock(&he->hot_inode.lock);
+	spin_lock(&hr->hot_range.lock);
+	/*
+	 * NOTE(review): hot_debugfs_log() can reach vmalloc() through
+	 * hot_debugfs_copy() while both spinlocks are held here.
+	 */
+	hot_debugfs_log(data, "inode #%lu, range start " \
+			"%llu (range len %llu) reads %u, writes %u, "
+			"avg read time %llu, avg write time %llu, temp %u\n",
+			he->i_ino,
+			hr->start,
+			hr->len,
+			freq_data->nr_reads,
+			freq_data->nr_writes,
+			freq_data->avg_delta_reads,
+			freq_data->avg_delta_writes,
+			freq_data->last_temperature);
+	spin_unlock(&hr->hot_range.lock);
+	spin_unlock(&he->hot_inode.lock);
+}
+
+/*
+ * Take the inode item @he, find the ranges in its hot_range_tree and log
+ * each range's frequency data to @data. Ranges are fetched in batches of
+ * ARRAY_SIZE(hr_nodes) with radix_tree_gang_lookup() under he->lock.
+ */
+static void __hot_debugfs_walk_range_tree(struct hot_inode_item *he,
+				struct debugfs_vol_data *data,
+				struct hot_info *root)
+{
+	struct hot_range_item *hr_nodes[8];
+	/*
+	 * NOTE(review): start is u32 while hr->start is printed with %llu
+	 * elsewhere; confirm the index cannot be truncated here.
+	 */
+	u32 start = 0;
+	int i, n;
+
+	/* Walk the hot_range_tree for inode */
+	while (1) {
+		spin_lock(&he->lock);
+		n = radix_tree_gang_lookup(&he->hot_range_tree,
+					   (void **)hr_nodes, start,
+					   ARRAY_SIZE(hr_nodes));
+		if (!n) {
+			/* no more ranges at or after start: done */
+			spin_unlock(&he->lock);
+			break;
+		}
+
+		/* the next batch begins right after the last range found */
+		start = hr_nodes[n - 1]->start + 1;
+		for (i = 0; i < n; i++) {
+			/* hold a reference while the range is being printed */
+			kref_get(&hr_nodes[i]->hot_range.refs);
+			__hot_debugfs_print_range_freq_data(he,
+						hr_nodes[i], data, root);
+			hot_range_item_put(hr_nodes[i]);
+		}
+		spin_unlock(&he->lock);
+	}
+}
+
+/*
+ * Append one line with the frequency data of inode item @he (read and write
+ * counts, average deltas, last temperature) to the debugfs log of @data.
+ * @root is not used here.
+ */
+static void __hot_debugfs_print_inode_freq_data(
+				struct hot_inode_item *he,
+				struct debugfs_vol_data *data,
+				struct hot_info *root)
+{
+	struct hot_freq_data *freq_data = &he->hot_inode.hot_freq_data;
+
+	/* keep the counters stable while they are formatted */
+	spin_lock(&he->hot_inode.lock);
+	hot_debugfs_log(data, "inode #%lu, reads %u, writes %u, " \
+		"avg read time %llu, avg write time %llu, temp %u\n",
+		he->i_ino,
+		freq_data->nr_reads,
+		freq_data->nr_writes,
+		freq_data->avg_delta_reads,
+		freq_data->avg_delta_writes,
+		freq_data->last_temperature);
+	spin_unlock(&he->hot_inode.lock);
+}
+
+/*
+ * Common read handler behind the debugfs inode_data and range_data files.
+ *
+ * A read at offset 0 walks the hot inode tree and calls @private_walk for
+ * every inode item so that it appends its text to the per-file log; the log
+ * is then copied to userspace with simple_read_from_buffer(). A read at a
+ * non-zero offset continues from the existing log without walking the tree.
+ * Once the offset has reached the end of the log, the log is freed and 0 is
+ * returned.
+ *
+ * Returns the number of bytes copied, 0 at end of file, or a negative errno
+ * (-ENOMEM if the log could not be allocated).
+ *
+ * NOTE(review): @private_walk runs under root->lock and ends up in
+ * hot_debugfs_log(), whose buffer growth path calls vmalloc().
+ */
+static ssize_t __hot_debugfs_comm_read(struct file *file, char __user *user,
+					size_t count, loff_t *ppos,
+					hot_debugfs_walk_t private_walk)
+{
+	ssize_t ret = 0;
+	int err;
+	struct hot_info *root;
+	struct debugfs_vol_data *data;
+	struct lstring *debugfs_log;
+	struct hot_inode_item *hi_nodes[8];
+	u64 ino = 0;
+	int i, n;
+
+	data = (struct debugfs_vol_data *) file->private_data;
+	root = global_hot_tracking_info;
+
+	if (!data->debugfs_log) {
+		/* initialize debugfs log corresponding to this volume */
+		debugfs_log = kmalloc(sizeof(*debugfs_log), GFP_KERNEL);
+		if (!debugfs_log)
+			return -ENOMEM;
+
+		debugfs_log->str = NULL;
+		debugfs_log->len = 0;
+		data->debugfs_log = debugfs_log;
+
+		err = hot_debugfs_log_init(data);
+		if (err) {
+			/* do not leave a log without a buffer behind */
+			data->debugfs_log = NULL;
+			kfree(debugfs_log);
+			return err;
+		}
+	}
+
+	if ((unsigned long) *ppos > 0) {
+		/* caller is continuing a previous read, don't walk tree */
+		if ((unsigned long) *ppos >= data->debugfs_log->len)
+			goto clean_up;
+
+		goto print_to_user;
+	}
+
+	/* walk the inode tree */
+	while (1) {
+		spin_lock(&root->lock);
+		n = radix_tree_gang_lookup(&root->hot_inode_tree,
+					   (void **)hi_nodes, ino,
+					   ARRAY_SIZE(hi_nodes));
+		if (!n) {
+			spin_unlock(&root->lock);
+			break;
+		}
+
+		/* the next batch begins right after the last inode found */
+		ino = hi_nodes[n - 1]->i_ino + 1;
+		for (i = 0; i < n; i++) {
+			kref_get(&hi_nodes[i]->hot_inode.refs);
+			/* walk ranges, print data to debugfs log */
+			private_walk(hi_nodes[i], data, root);
+			hot_inode_item_put(hi_nodes[i]);
+		}
+		spin_unlock(&root->lock);
+	}
+
+print_to_user:
+	if (data->debugfs_log->len) {
+		ret = simple_read_from_buffer(user, count, ppos,
+					data->debugfs_log->str,
+					data->debugfs_log->len);
+	}
+
+	return ret;
+
+clean_up:
+	/* reader has finished the file, clean up */
+	hot_debugfs_log_exit(data);
+	kfree(data->debugfs_log);
+	data->debugfs_log = NULL;
+
+	return 0;
+}
+
+/* debugfs read handler for range_data: log every range of every inode */
+static ssize_t __hot_debugfs_range_read(struct file *file, char __user *user,
+					size_t count, loff_t *ppos)
+{
+	hot_debugfs_walk_t walk = __hot_debugfs_walk_range_tree;
+
+	return __hot_debugfs_comm_read(file, user, count, ppos, walk);
+}
+
+/* debugfs read handler for inode_data: log the frequency data per inode */
+static ssize_t __hot_debugfs_inode_read(struct file *file, char __user *user,
+					size_t count, loff_t *ppos)
+{
+	hot_debugfs_walk_t walk = __hot_debugfs_print_inode_freq_data;
+
+	return __hot_debugfs_comm_read(file, user, count, ppos, walk);
+}
+
+/*
+ * File operations of the debugfs range_data file: open stores the per-file
+ * debugfs_vol_data in file->private_data, read prints the range data.
+ */
+static const struct file_operations hot_debugfs_range_fops = {
+	.read = __hot_debugfs_range_read,
+	.open = __hot_debugfs_open,
+};
+
+/*
+ * File operations of the debugfs inode_data file: open stores the per-file
+ * debugfs_vol_data in file->private_data, read prints the inode data.
+ */
+static const struct file_operations hot_debugfs_inode_fops = {
+	.read = __hot_debugfs_inode_read,
+	.open = __hot_debugfs_open,
+};
+
+/*
+ * On each volume mount, create the debugfs directory named after sb->s_id
+ * with its range_data and inode_data files, and the debugfs_vol_data that
+ * backs each file. The per-file log is not allocated here; that happens on
+ * the first read.
+ *
+ * Returns 0 on success, -ENOMEM if a debugfs_vol_data could not be
+ * allocated, or -EIO if a debugfs entry could not be created. On failure
+ * nothing is left behind: the volume directory is removed again and no
+ * entry is added to hot_debugfs_vol_data_list.
+ */
+static int hot_debugfs_volume_init(struct super_block *sb)
+{
+	struct dentry *debugfs_volume_entry;
+	struct debugfs_vol_data *range_data;
+	struct debugfs_vol_data *inode_data;
+	int err = -EIO;
+
+	if (!hot_debugfs_root_dentry)
+		return -EIO;
+
+	/* create debugfs folder for this volume by mounted dev name */
+	debugfs_volume_entry = debugfs_create_dir(sb->s_id,
+						hot_debugfs_root_dentry);
+	if (!debugfs_volume_entry)
+		return -EIO;
+
+	/* zeroed allocations: debugfs_log and log_alloc_size start at 0 */
+	range_data = kzalloc(sizeof(*range_data), GFP_KERNEL);
+	inode_data = kzalloc(sizeof(*inode_data), GFP_KERNEL);
+	if (!range_data || !inode_data) {
+		err = -ENOMEM;
+		goto debugfs_error;
+	}
+
+	range_data->sb = sb;
+	range_data->de = debugfs_volume_entry;
+	spin_lock_init(&range_data->log_lock);
+
+	inode_data->sb = sb;
+	inode_data->de = debugfs_volume_entry;
+	spin_lock_init(&inode_data->log_lock);
+
+	/*
+	 * The files expose which inodes and ranges are being accessed, so
+	 * they are readable and writable by the owner only (0600).
+	 */
+
+	/* create debugfs range_data file */
+	if (!debugfs_create_file("range_data",
+				S_IFREG | S_IRUSR | S_IWUSR,
+				debugfs_volume_entry,
+				(void *) range_data,
+				&hot_debugfs_range_fops))
+		goto debugfs_error;
+
+	/* create debugfs inode_data file */
+	if (!debugfs_create_file("inode_data",
+				S_IFREG | S_IRUSR | S_IWUSR,
+				debugfs_volume_entry,
+				(void *) inode_data,
+				&hot_debugfs_inode_fops))
+		goto debugfs_error;
+
+	/*
+	 * Add the debugfs_vol_data for inode data and range data to the
+	 * list only now that nothing can fail any more, so the error path
+	 * never frees an entry that is still linked.
+	 */
+	spin_lock(&hot_debugfs_data_list_lock);
+	list_add(&range_data->node, &hot_debugfs_vol_data_list);
+	list_add(&inode_data->node, &hot_debugfs_vol_data_list);
+	spin_unlock(&hot_debugfs_data_list_lock);
+
+	return 0;
+
+debugfs_error:
+	/* removes the directory and any file already created inside it */
+	debugfs_remove_recursive(debugfs_volume_entry);
+	kfree(range_data);
+	kfree(inode_data);
+
+	return err;
+}
+
+/*
+ * Find the debugfs_vol_data entries of the volume mounted on @sb (matched
+ * by superblock), remove the volume's debugfs directory and free the
+ * entries together with any log still attached to them.
+ */
+static void hot_debugfs_volume_exit(struct super_block *sb)
+{
+	struct debugfs_vol_data *data;
+	struct debugfs_vol_data *next;
+	struct dentry *removed = NULL;
+	LIST_HEAD(doomed);
+
+	/*
+	 * Only unlink under the spinlock; removing debugfs entries may
+	 * sleep, so the actual teardown happens on a private list.
+	 */
+	spin_lock(&hot_debugfs_data_list_lock);
+	list_for_each_entry_safe(data, next, &hot_debugfs_vol_data_list, node) {
+		if (data->sb == sb)
+			list_move(&data->node, &doomed);
+	}
+	spin_unlock(&hot_debugfs_data_list_lock);
+
+	/* must clean up memory associated with superblock */
+	list_for_each_entry_safe(data, next, &doomed, node) {
+		/*
+		 * range_data and inode_data of a volume share one directory
+		 * dentry; remove it only once.
+		 */
+		if (data->de != removed) {
+			removed = data->de;
+			debugfs_remove_recursive(removed);
+		}
+
+		/* a reader that never reached EOF leaves its log behind */
+		if (data->debugfs_log) {
+			vfree(data->debugfs_log->str);
+			kfree(data->debugfs_log);
+		}
+
+		list_del(&data->node);
+		kfree(data);
+	}
+}
+
+/*
+ * Initialize debugfs support: set up the list of debugfs_vol_data and its
+ * lock, create the top level DEBUGFS_ROOT_NAME directory and the entries
+ * for the volume mounted on @sb.
+ *
+ * Returns 0 on success, -EIO if the top level directory could not be
+ * created, or the error of hot_debugfs_volume_init(); in the latter case
+ * the top level directory is removed again.
+ */
+static int hot_debugfs_init(struct super_block *sb)
+{
+	int err;
+
+	/* init list of debugfs data and the lock protecting it */
+	INIT_LIST_HEAD(&hot_debugfs_vol_data_list);
+	spin_lock_init(&hot_debugfs_data_list_lock);
+
+	hot_debugfs_root_dentry = debugfs_create_dir(DEBUGFS_ROOT_NAME, NULL);
+	if (!hot_debugfs_root_dentry)
+		return -EIO;
+
+	err = hot_debugfs_volume_init(sb);
+	if (err) {
+		debugfs_remove_recursive(hot_debugfs_root_dentry);
+		/* nothing left for a later exit path to remove */
+		hot_debugfs_root_dentry = NULL;
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Clean up memory and remove dentries for debugfs: tear down the entries
+ * of the volume mounted on @sb, remove the whole debugfs tree below the
+ * top level directory and free every debugfs_vol_data still on the list.
+ */
+static void hot_debugfs_exit(struct super_block *sb)
+{
+	struct debugfs_vol_data *data;
+	struct debugfs_vol_data *next;
+
+	hot_debugfs_volume_exit(sb);
+
+	/*
+	 * Remove all debugfs entries recursively from the root first, so
+	 * that no file referring to a debugfs_vol_data exists any more when
+	 * the data is freed below.
+	 */
+	debugfs_remove_recursive(hot_debugfs_root_dentry);
+	hot_debugfs_root_dentry = NULL;
+
+	/* unlink every remaining entry before freeing it */
+	spin_lock(&hot_debugfs_data_list_lock);
+	list_for_each_entry_safe(data, next, &hot_debugfs_vol_data_list, node) {
+		list_del(&data->node);
+		kfree(data);
+	}
+	spin_unlock(&hot_debugfs_data_list_lock);
+}
+
 /*
  * Initialize kmem cache for hot_inode_item and hot_range_item.
  */
@@ -832,6 +1286,13 @@  void hot_track_init(struct super_block *sb)
 	root->hot_shrink.seeks = DEFAULT_SEEKS;
 	register_shrinker(&root->hot_shrink);
 
+	err = hot_debugfs_init(sb);
+	if (err) {
+		printk(KERN_ERR "%s: hot_debugfs_init error: %d\n",
+				__func__, err);
+		return;
+	}
+
 	printk(KERN_INFO "vfs: turning on hot data tracking\n");
 
 	return;
@@ -855,5 +1316,6 @@  void hot_track_exit(struct super_block *sb)
 	hot_inode_tree_exit(root);
 	sb->hot_flags &= ~MS_HOT_TRACKING;
 	hot_cache_exit();
+	hot_debugfs_exit(sb);
 	kfree(root);
 }
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index 7a79a6d..76d7469 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -92,6 +92,49 @@ 
 #define AVW_DIVIDER_POWER 40
 #define AVW_COEFF_POWER 0
 
+/* size of log to vmalloc */
+#define INIT_LOG_ALLOC_SIZE (PAGE_SIZE * 10)
+#define LOG_PAGE_SIZE (PAGE_SIZE * 10)
+
+/*
+ * number of chars of the device name to chop off
+ * when making the debugfs folder, e.g. /dev/sda -> sda
+ */
+#define DEV_NAME_CHOP 5
+
+/*
+ * Name for VFS data in debugfs directory
+ * e.g. /sys/kernel/debug/hot_track
+ */
+#define DEBUGFS_ROOT_NAME "hot_track"
+
+/*
+ * Log that is output to userspace through the debugfs files: a vmalloc'ed
+ * text buffer together with the number of bytes used in it.
+ */
+struct lstring {
+	/* log buffer; NULL while no buffer is allocated */
+	char *str;
+	/* number of bytes of str currently in use */
+	unsigned long len;
+};
+
+/*
+ * debugfs_vol_data is the per-file state that is passed to debugfs as
+ * private data; one is created for the range_data file and one for the
+ * inode_data file of each volume.
+ */
+struct debugfs_vol_data {
+	/* entry in hot_debugfs_vol_data_list, protected by hot_debugfs_data_list_lock */
+	struct list_head node;
+	/* log handed out on read; NULL until the first read allocates it */
+	struct lstring *debugfs_log;
+	/* superblock of the volume this data belongs to */
+	struct super_block *sb;
+	/* debugfs directory of the volume */
+	struct dentry *de;
+	/* protects debugfs_log */
+	spinlock_t log_lock;
+	/* scratch buffer a message is formatted into before it is appended */
+	char log_work_buff[1024];
+	/* number of bytes allocated for debugfs_log->str */
+	uint log_alloc_size;
+};
+
+/*
+ * Callback invoked by the common debugfs read handler for every inode item
+ * found in the hot inode tree; it appends its output to the log of @data.
+ */
+typedef void (*hot_debugfs_walk_t)(
+			struct hot_inode_item *hot_inode,
+			struct debugfs_vol_data *data,
+			struct hot_info *root);
+
 struct hot_update_work {
 	struct work_struct work;
 	struct hot_info *hot_info;