diff mbox

ext4: add lazyinit stats support

Message ID 1463456488-93466-1-git-send-email-wangshilong1991@gmail.com
State Rejected, archived
Headers show

Commit Message

Wang Shilong May 17, 2016, 3:41 a.m. UTC
From: Wang Shilong <wshilong@ddn.com>

Sometimes we need to figure out the progress of lazyinit
running in the background; this patch adds stats support
for it. The output is something like:

$ cat /sys/fs/ext4/vda/lazyinit_stats
groups_finished: 80
groups_total: 80

Signed-off-by: Wang Shilong <wshilong@ddn.com>
---
Sorry if you received twice, first one is blocked by
mail list.
---
 fs/ext4/ext4.h  |    4 ++++
 fs/ext4/super.c |   19 ++++++++++++++++---
 fs/ext4/sysfs.c |   20 ++++++++++++++++++++
 3 files changed, 40 insertions(+), 3 deletions(-)

Comments

Eric Sandeen May 17, 2016, 3:59 a.m. UTC | #1
On 5/16/16 10:41 PM, Wang Shilong wrote:
> From: Wang Shilong <wshilong@ddn.com>
> 
> Somtimes, we need figure out progress of Lazyinit
> in the background, this patch try to add stats support
> for it, output is something like:
> 
> $ cat /sys/fs/ext4/vda/lazyinit_stats
> groups_finished: 80
> groups_total: 80

A few thoughts:

If this goes in, it should be documented in
Documentation/fs/ext4.txt, as well.

I suppose it might be nice, but when do you think this
might be needed in real life?

Also: sysfs is technically supposed to be one value per
file, and (I think) just a number, no text.  At least,
that's what every other ext4 device sysfs file has today,
so this should follow that precedent.

And as far as I can tell, "groups total" is the total
uninit groups at mount time, not total  in the filesystem,
so this would change on a remount? I think that's unexpected,
and not very useful.

Simply printing the remaining uninitialized block group
count might be enough, i.e.:

$ cat /sys/fs/ext4/vda/lazyinit_remaining
42

-Eric

> Signed-off-by: Wang Shilong <wshilong@ddn.com>
> ---
> Sorry if you received twice, first one is blocked by
> mail list.
> ---
>  fs/ext4/ext4.h  |    4 ++++
>  fs/ext4/super.c |   19 ++++++++++++++++---
>  fs/ext4/sysfs.c |   20 ++++++++++++++++++++
>  3 files changed, 40 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 349afeb..9aaae81 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1505,6 +1505,10 @@ struct ext4_sb_info {
>  	struct ratelimit_state s_err_ratelimit_state;
>  	struct ratelimit_state s_warning_ratelimit_state;
>  	struct ratelimit_state s_msg_ratelimit_state;
> +
> +	/* for lazyinit stats */
> +	unsigned long lazyinit_finished_cnt;
> +	unsigned long lazyinit_total_cnt;
>  };
>  
>  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 304c712..b95e622 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -2660,6 +2660,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
>  		}
>  		elr->lr_next_sched = jiffies + elr->lr_timeout;
>  		elr->lr_next_group = group + 1;
> +		EXT4_SB(sb)->lazyinit_finished_cnt++;
>  	}
>  	sb_end_write(sb);
>  
> @@ -2864,10 +2865,12 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
>  {
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
>  	struct ext4_li_request *elr;
> +	ext4_group_t group, ngroups;
> +	struct ext4_group_desc *gdp = NULL;
>  
>  	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
>  	if (!elr)
> -		return NULL;
> +		return ERR_PTR(-ENOMEM);
>  
>  	elr->lr_super = sb;
>  	elr->lr_sbi = sbi;
> @@ -2880,6 +2883,16 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
>  	 */
>  	elr->lr_next_sched = jiffies + (prandom_u32() %
>  				(EXT4_DEF_LI_MAX_START_DELAY * HZ));
> +	ngroups = EXT4_SB(sb)->s_groups_count;
> +	 for (group = elr->lr_next_group; group < ngroups; group++) {
	^whitespace issue here
> +		gdp = ext4_get_group_desc(sb, group, NULL);
> +		if (!gdp) {
> +			elr = ERR_PTR(-EIO);
> +			break;
> +		}
> +		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
> +			sbi->lazyinit_total_cnt++;
> +	}
>  	return elr;
>  }
>  
> @@ -2907,8 +2920,8 @@ int ext4_register_li_request(struct super_block *sb,
>  		goto out;
>  
>  	elr = ext4_li_request_new(sb, first_not_zeroed);
> -	if (!elr) {
> -		ret = -ENOMEM;
> +	if (IS_ERR(elr)) {
> +		ret = PTR_ERR(elr);
>  		goto out;
>  	}
>  
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 1420a3c..8d0c199 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -20,6 +20,7 @@ typedef enum {
>  	attr_delayed_allocation_blocks,
>  	attr_session_write_kbytes,
>  	attr_lifetime_write_kbytes,
> +	attr_lazyinit_stats,
>  	attr_reserved_clusters,
>  	attr_inode_readahead,
>  	attr_trigger_test_error,
> @@ -72,6 +73,21 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
>  			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
>  }
>  
> +static ssize_t lazyinit_stats_show(struct ext4_attr *a,
> +				   struct ext4_sb_info *sbi, char *buf)
> +{
> +	int len = 0;
> +	unsigned long total = sbi->lazyinit_total_cnt;
> +	unsigned long finish = sbi->lazyinit_finished_cnt;
> +
> +	len += snprintf(buf + len, PAGE_SIZE,
> +			"groups_finished: %lu\n", finish);
> +	len += snprintf(buf + len, PAGE_SIZE,
> +			"groups_total: %lu\n", total);
> +
> +	return len;
> +}
> +
>  static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
>  					  struct ext4_sb_info *sbi,
>  					  const char *buf, size_t count)
> @@ -165,6 +181,7 @@ static struct ext4_attr ext4_attr_##_name = {			\
>  EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
>  EXT4_ATTR_FUNC(session_write_kbytes, 0444);
>  EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
> +EXT4_ATTR_FUNC(lazyinit_stats, 0444);
>  EXT4_ATTR_FUNC(reserved_clusters, 0644);
>  
>  EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
> @@ -195,6 +212,7 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(delayed_allocation_blocks),
>  	ATTR_LIST(session_write_kbytes),
>  	ATTR_LIST(lifetime_write_kbytes),
> +	ATTR_LIST(lazyinit_stats),
>  	ATTR_LIST(reserved_clusters),
>  	ATTR_LIST(inode_readahead_blks),
>  	ATTR_LIST(inode_goal),
> @@ -265,6 +283,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
>  		return session_write_kbytes_show(a, sbi, buf);
>  	case attr_lifetime_write_kbytes:
>  		return lifetime_write_kbytes_show(a, sbi, buf);
> +	case attr_lazyinit_stats:
> +		return lazyinit_stats_show(a, sbi, buf);
>  	case attr_reserved_clusters:
>  		return snprintf(buf, PAGE_SIZE, "%llu\n",
>  				(unsigned long long)
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o May 17, 2016, 4:05 a.m. UTC | #2
On Tue, May 17, 2016 at 11:41:28AM +0800, Wang Shilong wrote:
> From: Wang Shilong <wshilong@ddn.com>
> 
> Somtimes, we need figure out progress of Lazyinit
> in the background, this patch try to add stats support
> for it, output is something like:
> 
> $ cat /sys/fs/ext4/vda/lazyinit_stats
> groups_finished: 80
> groups_total: 80

That's more of a procfs style of output.  In general the sysfs idiom
would be to have two sysfs variables:

/sys/fs/ext4/vda/nr_block_groups
/sys/fs/ext4/vda/nr_lazyinit_bg_done

and each would return an integer value.

For example:

% cat /sys/block/sda/queue/discard_granularity
512
% cat /sys/block/sda/queue/discard_max_bytes
2147450880
% cat /sys/block/sda/queue/discard_zeroes_data
0

There are certainly some exceptions to this rule, but it certainly
simplifies the sysfs support code, as well as making it a bit easier
for userspace programs to parse the output from those files.

    	      	       	  	    - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wang Shilong May 17, 2016, 4:14 a.m. UTC | #3
On Tue, May 17, 2016 at 11:59 AM, Eric Sandeen <sandeen@redhat.com> wrote:
> On 5/16/16 10:41 PM, Wang Shilong wrote:
>> From: Wang Shilong <wshilong@ddn.com>
>>
>> Somtimes, we need figure out progress of Lazyinit
>> in the background, this patch try to add stats support
>> for it, output is something like:
>>
>> $ cat /sys/fs/ext4/vda/lazyinit_stats
>> groups_finished: 80
>> groups_total: 80
>
> A few thoughts:
>
> If this goes in, it should be documented in
> Documentation/fs/ext4.txt, as well.
>
> I suppose it might be nice, but when do you think this
> might be needed in real life?

In our performance benchmarking, we sometimes want to wait
for lazyinit to finish before starting the real benchmark
(since the lazyinit thread triggers some background writes).

But there is no way to see the progress of lazyinit;
we can only infer it has finished once the device's
write counters show no more activity...

>
> Also: sysfs is technically supposed to be one value per
> file, and (I think) just a number, no text.  At least,
> that's what every other ext4 device sysfs file has today,
> so this should follow that precedent.
>
> And as far as I can tell, "groups total" is the total
> uninit groups at mount time, not total  in the filesystem,
> so this would change on a remount? I think that's unexpected,
> and not very useful.

Yeah, for our usage we need to know the progress of lazyinit.
So after remounting, 'total' will change dynamically.

We could store 'total' in superblock etc, but IMO it is
a bit overhead here.

>
> Simply printing the remaining uninitialized block group
> count might be enough, i.e.:
>
> $ cat /sys/fs/ext4/vda/lazyinit_remaining
> 42

Yup, this works fine for me.

Thank you for your comments!

>
> -Eric
>
>> Signed-off-by: Wang Shilong <wshilong@ddn.com>
>> ---
>> Sorry if you received twice, first one is blocked by
>> mail list.
>> ---
>>  fs/ext4/ext4.h  |    4 ++++
>>  fs/ext4/super.c |   19 ++++++++++++++++---
>>  fs/ext4/sysfs.c |   20 ++++++++++++++++++++
>>  3 files changed, 40 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 349afeb..9aaae81 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1505,6 +1505,10 @@ struct ext4_sb_info {
>>       struct ratelimit_state s_err_ratelimit_state;
>>       struct ratelimit_state s_warning_ratelimit_state;
>>       struct ratelimit_state s_msg_ratelimit_state;
>> +
>> +     /* for lazyinit stats */
>> +     unsigned long lazyinit_finished_cnt;
>> +     unsigned long lazyinit_total_cnt;
>>  };
>>
>>  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 304c712..b95e622 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -2660,6 +2660,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
>>               }
>>               elr->lr_next_sched = jiffies + elr->lr_timeout;
>>               elr->lr_next_group = group + 1;
>> +             EXT4_SB(sb)->lazyinit_finished_cnt++;
>>       }
>>       sb_end_write(sb);
>>
>> @@ -2864,10 +2865,12 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
>>  {
>>       struct ext4_sb_info *sbi = EXT4_SB(sb);
>>       struct ext4_li_request *elr;
>> +     ext4_group_t group, ngroups;
>> +     struct ext4_group_desc *gdp = NULL;
>>
>>       elr = kzalloc(sizeof(*elr), GFP_KERNEL);
>>       if (!elr)
>> -             return NULL;
>> +             return ERR_PTR(-ENOMEM);
>>
>>       elr->lr_super = sb;
>>       elr->lr_sbi = sbi;
>> @@ -2880,6 +2883,16 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
>>        */
>>       elr->lr_next_sched = jiffies + (prandom_u32() %
>>                               (EXT4_DEF_LI_MAX_START_DELAY * HZ));
>> +     ngroups = EXT4_SB(sb)->s_groups_count;
>> +      for (group = elr->lr_next_group; group < ngroups; group++) {
>         ^whitespace issue here
>> +             gdp = ext4_get_group_desc(sb, group, NULL);
>> +             if (!gdp) {
>> +                     elr = ERR_PTR(-EIO);
>> +                     break;
>> +             }
>> +             if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
>> +                     sbi->lazyinit_total_cnt++;
>> +     }
>>       return elr;
>>  }
>>
>> @@ -2907,8 +2920,8 @@ int ext4_register_li_request(struct super_block *sb,
>>               goto out;
>>
>>       elr = ext4_li_request_new(sb, first_not_zeroed);
>> -     if (!elr) {
>> -             ret = -ENOMEM;
>> +     if (IS_ERR(elr)) {
>> +             ret = PTR_ERR(elr);
>>               goto out;
>>       }
>>
>> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
>> index 1420a3c..8d0c199 100644
>> --- a/fs/ext4/sysfs.c
>> +++ b/fs/ext4/sysfs.c
>> @@ -20,6 +20,7 @@ typedef enum {
>>       attr_delayed_allocation_blocks,
>>       attr_session_write_kbytes,
>>       attr_lifetime_write_kbytes,
>> +     attr_lazyinit_stats,
>>       attr_reserved_clusters,
>>       attr_inode_readahead,
>>       attr_trigger_test_error,
>> @@ -72,6 +73,21 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
>>                         EXT4_SB(sb)->s_sectors_written_start) >> 1)));
>>  }
>>
>> +static ssize_t lazyinit_stats_show(struct ext4_attr *a,
>> +                                struct ext4_sb_info *sbi, char *buf)
>> +{
>> +     int len = 0;
>> +     unsigned long total = sbi->lazyinit_total_cnt;
>> +     unsigned long finish = sbi->lazyinit_finished_cnt;
>> +
>> +     len += snprintf(buf + len, PAGE_SIZE,
>> +                     "groups_finished: %lu\n", finish);
>> +     len += snprintf(buf + len, PAGE_SIZE,
>> +                     "groups_total: %lu\n", total);
>> +
>> +     return len;
>> +}
>> +
>>  static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
>>                                         struct ext4_sb_info *sbi,
>>                                         const char *buf, size_t count)
>> @@ -165,6 +181,7 @@ static struct ext4_attr ext4_attr_##_name = {                     \
>>  EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
>>  EXT4_ATTR_FUNC(session_write_kbytes, 0444);
>>  EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
>> +EXT4_ATTR_FUNC(lazyinit_stats, 0444);
>>  EXT4_ATTR_FUNC(reserved_clusters, 0644);
>>
>>  EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
>> @@ -195,6 +212,7 @@ static struct attribute *ext4_attrs[] = {
>>       ATTR_LIST(delayed_allocation_blocks),
>>       ATTR_LIST(session_write_kbytes),
>>       ATTR_LIST(lifetime_write_kbytes),
>> +     ATTR_LIST(lazyinit_stats),
>>       ATTR_LIST(reserved_clusters),
>>       ATTR_LIST(inode_readahead_blks),
>>       ATTR_LIST(inode_goal),
>> @@ -265,6 +283,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
>>               return session_write_kbytes_show(a, sbi, buf);
>>       case attr_lifetime_write_kbytes:
>>               return lifetime_write_kbytes_show(a, sbi, buf);
>> +     case attr_lazyinit_stats:
>> +             return lazyinit_stats_show(a, sbi, buf);
>>       case attr_reserved_clusters:
>>               return snprintf(buf, PAGE_SIZE, "%llu\n",
>>                               (unsigned long long)
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Sandeen May 17, 2016, 4:22 a.m. UTC | #4
On 5/16/16 11:14 PM, Wang Shilong wrote:
> On Tue, May 17, 2016 at 11:59 AM, Eric Sandeen <sandeen@redhat.com> wrote:
>> On 5/16/16 10:41 PM, Wang Shilong wrote:
>>> From: Wang Shilong <wshilong@ddn.com>
>>>
>>> Somtimes, we need figure out progress of Lazyinit
>>> in the background, this patch try to add stats support
>>> for it, output is something like:
>>>
>>> $ cat /sys/fs/ext4/vda/lazyinit_stats
>>> groups_finished: 80
>>> groups_total: 80
>>
>> A few thoughts:
>>
>> If this goes in, it should be documented in
>> Documentation/fs/ext4.txt, as well.
>>
>> I suppose it might be nice, but when do you think this
>> might be needed in real life?
> 
> In our performances benchmarking, we want to wait
> Lazyinit finished before starting really benchmarking sometimes.
> (Since Lazyinit thread trigger some write background here)
> 
> but there is no way to see what is progress of lazyinit.
> and we can only make sure there is no write from Device
> counter...

In that case you should just specify no lazyinit at mkfs time :)

>> Also: sysfs is technically supposed to be one value per
>> file, and (I think) just a number, no text.  At least,
>> that's what every other ext4 device sysfs file has today,
>> so this should follow that precedent.
>>
>> And as far as I can tell, "groups total" is the total
>> uninit groups at mount time, not total  in the filesystem,
>> so this would change on a remount? I think that's unexpected,
>> and not very useful.
> 
> Yeah, From our usage, we need know progress of lazyiniting.
> So after remoutning, 'total' will be dynamically changed.
> 
> We could store 'total' in superblock etc, but IMO it is
> a bit overhead here.

Agreed.

>>
>> Simply printing the remaining uninitialized block group
>> count might be enough, i.e.:
>>
>> $ cat /sys/fs/ext4/vda/lazyinit_remaining
>> 42
> 
> Yup, this works fine for me.
> 
> Thank you for your coments!

Sure thing; I'm still on the fence about usefulness, because if
anyone really cares to wait for it to hit zero, they probably
should have just changed their mkfs options to disable lazyinit.

But if you simply print remaining groups I think it is a very
simple and low-risk patch, so I wouldn't NAK it, either.

-Eric

>>
>> -Eric

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o May 17, 2016, 4:45 a.m. UTC | #5
On Mon, May 16, 2016 at 11:22:35PM -0500, Eric Sandeen wrote:
> 
> Sure thing; I'm still on the fence about usefulness, because if
> anyone really cares to wait for it to hit zero, they probably
> should have just changed their mkfs options to disable lazyinit.

Indeed, if you're going to stall your test startup until lazyinit is
done, it will be faster to let mke2fs initialize the inode table
blocks, since lazyinit deliberately rate limits how much I/O it sends
down to the disk in order to avoid impacting user latencies.  So i'd
recommend "mke2fs -E lazy_itable_init=0,lazy_journal_init=0" instead
of patching the kernel and then waiting for lazy itable init to
complete.  I think you'll be happier with how long this takes.

BTW, if you care about 99.9 percentile (long tail) latency as a
performance metric, it's a good idea to disable lazy inode table init
and just let mke2fs take its own sweet time.  This is what we do
inside Google, since we care very much about long tail latencies on a
number of our workloads.

Cheers,

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shuichi Ihara May 17, 2016, 5:36 a.m. UTC | #6
U3VyZSwgZGlzYWJsaW5nIGxheXppbml0IGlzIGFuIG9wdGlvbiwgYnV0IG91ciBzaW5nbGUgZGlz
ayBzaXplIGlzIG1vcmUgdGhhbiA2MFRCIHRvIG1ha2Ugc2V2ZXJhbCBwZXRhYnl0ZSBzeXN0ZW0g
d2l0aCBMdXN0cmUuDQpJdCB0YWtlcyB0aW1lIGluIGEgbG9uZyB3aGlsZSB0byBiZSBjb21wbGV0
aW9uIG9mIG1rZnMgd2l0aG91dCBsYXl6aW5pdCBhbmQgd2UgY2FuJ3QgZG8gYW55dGhpbmcgdW50
aWwgbWtmcyBpcyBkb25lLg0KbGF5emluaXQgaXMgc3RpbGwgaGVscCwgd2UgY2FuIG1vdW50IGZp
bGVzeXN0ZW0gYW5kIGRvIHNvbWV0aGluZyAoY3JlYXRlIEx1c3RyZSBhbmQgdGVzdGluZyBmcm9t
IGNsaWVudCwgZXRjKSBpbiBwYXJhbGxlbCB1bmRlciBsYXp5aW5pdCBpcyBydW5uaW5nIGJlaGlu
ZC4NCg0KDQpTbywgd2Ugc3RpbGwgd2FudCB0byBzZWUgaG93IGxhenlpbml0IGlzIGdvaW5nIGFu
ZCBrbm93IGVzdGltYXRpb24gb2YgZmluaXNoIGl0IHRvIGRvIHJlYWxseSBwZXJmb3JtYW5jZSBp
bnRlbnNpdmUgdGVzdGluZyBhZnRlciBsYXp5aW5pdCBmaW5pc2hlZC4NCg0KVGhhbmtzDQpJaGFy
YSANCg0KT24gNS8xNy8xNiwgMTo0NSBQTSwgIlRoZW9kb3JlIFRzJ28iIDx0eXRzb0BtaXQuZWR1
PiB3cm90ZToNCg0KPk9uIE1vbiwgTWF5IDE2LCAyMDE2IGF0IDExOjIyOjM1UE0gLTA1MDAsIEVy
aWMgU2FuZGVlbiB3cm90ZToNCj4+IA0KPj4gU3VyZSB0aGluZzsgSSdtIHN0aWxsIG9uIHRoZSBm
ZW5jZSBhYm91dCB1c2VmdWxuZXNzLCBiZWNhdXNlIGlmDQo+PiBhbnlvbmUgcmVhbGx5IGNhcmVz
IHRvIHdhaXQgZm9yIGl0IHRvIGhpdCB6ZXJvLCB0aGV5IHByb2JhYmx5DQo+PiBzaG91bGQgaGF2
ZSBqdXN0IGNoYW5nZWQgdGhlaXIgbWtmcyBvcHRpb25zIHRvIGRpc2FibGUgbGF6eWluaXQuDQo+
DQo+SW5kZWVkLCBpZiB5b3UncmUgZ29pbmcgdG8gc3RhbGwgeW91ciB0ZXN0IHN0YXJ0dXAgdW50
aWwgbGF6eWluaXQgaXMNCj5kb25lLCBpdCB3aWxsIGJlIGZhc3RlciB0byBsZXQgbWtlMmZzIGlu
aXRpYWxpemUgdGhlIGlub2RlIHRhYmxlDQo+YmxvY2tzLCBzaW5jZSBsYXp5aW5pdCBkZWxpYmVy
YXRlbHkgcmF0ZSBsaW1pdHMgaG93IG11Y2ggSS9PIGl0IHNlbmRzDQo+ZG93biB0byB0aGUgZGlz
ayBpbiBvcmRlciB0byBhdm9pZCBpbXBhY3RpbmcgdXNlciBsYXRlbmNpZXMuICBTbyBpJ2QNCj5y
ZWNvbW1lbmQgIm1rZTJmcyAtRSBsYXp5X2l0YWJsZV9pbml0PTAsbGF6eV9qb3VybmFsX2luaXQ9
MCIgaW5zdGVhZA0KPm9mIHBhdGNoaW5nIHRoZSBrZXJuZWwgYW5kIHRoZW4gd2FpdGluZyBmb3Ig
bGF6eSBpdGFibGUgaW5pdCB0bw0KPmNvbXBsZXRlLiAgSSB0aGluayB5b3UnbGwgYmUgaGFwcGll
ciB3aXRoIGhvdyBsb25nIHRoaXMgdGFrZXMuDQo+DQo+QlRXLCBpZiB5b3UgY2FyZSBhYm91dCA5
OS45IHBlcmNlbnRpbGUgKGxvbmcgdGFpbCkgbGF0ZW5jeSBhcyBhDQo+cGVyZm9ybWFuY2UgbWV0
cmljLCBpdCdzIGEgZ29vZCBpZGVhIHRvIGRpc2FibGUgbGF6eSBpbm9kZSB0YWJsZSBpbml0DQo+
YW5kIGp1c3QgbGV0IG1rZTJmcyB0YWtlIGl0cyBvd24gc3dlZXQgdGltZS4gIFRoaXMgaXMgd2hh
dCB3ZSBkbw0KPmluc2lkZSBHb29nbGUsIHNpbmNlIHdlIGNhcmUgdmVyeSBtdWNoIGFib3V0IGxv
bmcgdGFpbCBsYXRlbmNpZXMgb24gYQ0KPm51bWJlciBvZiBvdXIgd29ya2xvYWRzLg0KPg0KPkNo
ZWVycywNCj4NCj4JCQkJCQktIFRlZA0K
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o May 17, 2016, 6:13 a.m. UTC | #7
On Tue, May 17, 2016 at 05:36:57AM +0000, Shuichi Ihara wrote:
> Sure, disabling layzinit is an option, but our single disk size is
> more than 60TB to make several petabyte system with Lustre.
> It takes time in a long while to be completion of mkfs without
> layzinit and we can't do anything until mkfs is done.  layzinit is
> still help, we can mount filesystem and do something (create Lustre
> and testing from client, etc) in parallel under lazyinit is running
> behind.

Ah, I'm used to doing single disk benchmarks where in the interests of
getting numbers which are as reproducible as possible, I always run
the performance benchmarks on a freshly initialized file system, and
so I don't want to do anything between when the file system is
initialized and when the workload is started --- or if I'm going to be
running a test on an aged file system, I'll have a standardized file
system aging tool with a fixed random seed, and/or a file system image
which I'll dd into place so that the starting point for the
performance evaluation is exactly the same for each benchmark run.

It didn't occur to me that you might want to be using the file system
while lazy init was taking place, and then later on, doing the
performance benchmarks on a used, randomly dirtied file system.  Hence
my assumption that the benmarking shell script[1] would not be doing
anything at all while it waited for lazy init to be finished.

[1] In the interests of keeping the results as consistent as possible,
I tend to have benchmarking scripts that do all of the workload setup
and test running and data collection done in an automated fashion, so
I can kick off a benchmarking run and come back the next day (or after
the weekend) with all of the results collected.

	    	       	    	  - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 349afeb..9aaae81 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1505,6 +1505,10 @@  struct ext4_sb_info {
 	struct ratelimit_state s_err_ratelimit_state;
 	struct ratelimit_state s_warning_ratelimit_state;
 	struct ratelimit_state s_msg_ratelimit_state;
+
+	/* for lazyinit stats */
+	unsigned long lazyinit_finished_cnt;
+	unsigned long lazyinit_total_cnt;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 304c712..b95e622 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2660,6 +2660,7 @@  static int ext4_run_li_request(struct ext4_li_request *elr)
 		}
 		elr->lr_next_sched = jiffies + elr->lr_timeout;
 		elr->lr_next_group = group + 1;
+		EXT4_SB(sb)->lazyinit_finished_cnt++;
 	}
 	sb_end_write(sb);
 
@@ -2864,10 +2865,12 @@  static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_li_request *elr;
+	ext4_group_t group, ngroups;
+	struct ext4_group_desc *gdp = NULL;
 
 	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
 	if (!elr)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	elr->lr_super = sb;
 	elr->lr_sbi = sbi;
@@ -2880,6 +2883,16 @@  static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
 	 */
 	elr->lr_next_sched = jiffies + (prandom_u32() %
 				(EXT4_DEF_LI_MAX_START_DELAY * HZ));
+	ngroups = EXT4_SB(sb)->s_groups_count;
+	for (group = elr->lr_next_group; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp) {
+			elr = ERR_PTR(-EIO);
+			break;
+		}
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			sbi->lazyinit_total_cnt++;
+	}
 	return elr;
 }
 
@@ -2907,8 +2920,8 @@  int ext4_register_li_request(struct super_block *sb,
 		goto out;
 
 	elr = ext4_li_request_new(sb, first_not_zeroed);
-	if (!elr) {
-		ret = -ENOMEM;
+	if (IS_ERR(elr)) {
+		ret = PTR_ERR(elr);
 		goto out;
 	}
 
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 1420a3c..8d0c199 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -20,6 +20,7 @@  typedef enum {
 	attr_delayed_allocation_blocks,
 	attr_session_write_kbytes,
 	attr_lifetime_write_kbytes,
+	attr_lazyinit_stats,
 	attr_reserved_clusters,
 	attr_inode_readahead,
 	attr_trigger_test_error,
@@ -72,6 +73,21 @@  static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
 			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 }
 
+/*
+ * Show lazyinit progress: number of block groups already zeroed and the
+ * number that were uninitialized at mount time.  Called for reads of
+ * /sys/fs/ext4/<dev>/lazyinit_stats; buf is a single page.
+ */
+static ssize_t lazyinit_stats_show(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi, char *buf)
+{
+	int len = 0;
+	unsigned long total = sbi->lazyinit_total_cnt;
+	unsigned long finish = sbi->lazyinit_finished_cnt;
+
+	/*
+	 * Bound each write by the space remaining in the page rather than
+	 * by PAGE_SIZE, or the second write could run past the end of buf.
+	 * Use scnprintf(), which returns the number of bytes actually
+	 * stored (not the would-be length), so len can never exceed
+	 * PAGE_SIZE.
+	 */
+	len += scnprintf(buf + len, PAGE_SIZE - len,
+			 "groups_finished: %lu\n", finish);
+	len += scnprintf(buf + len, PAGE_SIZE - len,
+			 "groups_total: %lu\n", total);
+
+	return len;
+}
+
 static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 					  struct ext4_sb_info *sbi,
 					  const char *buf, size_t count)
@@ -165,6 +181,7 @@  static struct ext4_attr ext4_attr_##_name = {			\
 EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
 EXT4_ATTR_FUNC(session_write_kbytes, 0444);
 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
+EXT4_ATTR_FUNC(lazyinit_stats, 0444);
 EXT4_ATTR_FUNC(reserved_clusters, 0644);
 
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
@@ -195,6 +212,7 @@  static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
 	ATTR_LIST(session_write_kbytes),
 	ATTR_LIST(lifetime_write_kbytes),
+	ATTR_LIST(lazyinit_stats),
 	ATTR_LIST(reserved_clusters),
 	ATTR_LIST(inode_readahead_blks),
 	ATTR_LIST(inode_goal),
@@ -265,6 +283,8 @@  static ssize_t ext4_attr_show(struct kobject *kobj,
 		return session_write_kbytes_show(a, sbi, buf);
 	case attr_lifetime_write_kbytes:
 		return lifetime_write_kbytes_show(a, sbi, buf);
+	case attr_lazyinit_stats:
+		return lazyinit_stats_show(a, sbi, buf);
 	case attr_reserved_clusters:
 		return snprintf(buf, PAGE_SIZE, "%llu\n",
 				(unsigned long long)