diff mbox series

[-next,v2,2/6] ext4: introduce last_check_time record previous check time

Message ID 20210911090059.1876456-3-yebin10@huawei.com
State Not Applicable
Headers show
Series Fix some issues about mmp | expand

Commit Message

yebin (H) Sept. 11, 2021, 9 a.m. UTC
kmmpd:
...
    diff = jiffies - last_update_time;
    if (diff > mmp_check_interval * HZ) {
...
As "mmp_check_interval = 2 * mmp_update_interval", 'diff' always little
than 'mmp_update_interval', so there will never trigger detection.
Introduce last_check_time record previous check time.

Signed-off-by: Ye Bin <yebin10@huawei.com>
---
 fs/ext4/mmp.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

Comments

Jan Kara Oct. 7, 2021, 12:31 p.m. UTC | #1
On Sat 11-09-21 17:00:55, Ye Bin wrote:
> kmmpd:
> ...
>     diff = jiffies - last_update_time;
>     if (diff > mmp_check_interval * HZ) {
> ...
> As "mmp_check_interval = 2 * mmp_update_interval", 'diff' always little
> than 'mmp_update_interval', so there will never trigger detection.
> Introduce last_check_time record previous check time.
> 
> Signed-off-by: Ye Bin <yebin10@huawei.com>

I think the check is there only for the case where write_mmp_block() +
sleep took longer than mmp_check_interval. I agree that should rarely
happen but on a really busy system it is possible and in that case we would
miss updating mmp block for too long and so another node could have started
using the filesystem. I actually don't see a reason why kmmpd should be
checking the block each mmp_check_interval as you do - mmp_check_interval
is just for ext4_multi_mount_protect() to know how long it should wait
before considering mmp block stale... Am I missing something?

								Honza

> ---
>  fs/ext4/mmp.c | 14 +++++++++-----
>  1 file changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
> index 12af6dc8457b..c781b09a78c9 100644
> --- a/fs/ext4/mmp.c
> +++ b/fs/ext4/mmp.c
> @@ -152,6 +152,7 @@ static int kmmpd(void *data)
>  	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
>  	unsigned mmp_check_interval;
>  	unsigned long last_update_time;
> +	unsigned long last_check_time;
>  	unsigned long diff;
>  	int retval = 0;
>  
> @@ -170,6 +171,7 @@ static int kmmpd(void *data)
>  
>  	memcpy(mmp->mmp_nodename, init_utsname()->nodename,
>  	       sizeof(mmp->mmp_nodename));
> +	last_check_time = jiffies;
>  
>  	while (!kthread_should_stop() && !sb_rdonly(sb)) {
>  		if (!ext4_has_feature_mmp(sb)) {
> @@ -198,17 +200,18 @@ static int kmmpd(void *data)
>  		}
>  
>  		diff = jiffies - last_update_time;
> -		if (diff < mmp_update_interval * HZ)
> +		if (diff < mmp_update_interval * HZ) {
>  			schedule_timeout_interruptible(mmp_update_interval *
>  						       HZ - diff);
> +			diff = jiffies - last_update_time;
> +		}
>  
>  		/*
>  		 * We need to make sure that more than mmp_check_interval
> -		 * seconds have not passed since writing. If that has happened
> -		 * we need to check if the MMP block is as we left it.
> +		 * seconds have not passed since check. If that has happened
> +		 * we need to check if the MMP block is as we write it.
>  		 */
> -		diff = jiffies - last_update_time;
> -		if (diff > mmp_check_interval * HZ) {
> +		if (jiffies - last_check_time > mmp_check_interval * HZ) {
>  			struct buffer_head *bh_check = NULL;
>  			struct mmp_struct *mmp_check;
>  
> @@ -234,6 +237,7 @@ static int kmmpd(void *data)
>  				goto wait_to_exit;
>  			}
>  			put_bh(bh_check);
> +			last_check_time = jiffies;
>  		}
>  
>  		 /*
> -- 
> 2.31.1
>
yebin (H) Oct. 8, 2021, 1:56 a.m. UTC | #2
On 2021/10/7 20:31, Jan Kara wrote:
> On Sat 11-09-21 17:00:55, Ye Bin wrote:
>> kmmpd:
>> ...
>>      diff = jiffies - last_update_time;
>>      if (diff > mmp_check_interval * HZ) {
>> ...
>> As "mmp_check_interval = 2 * mmp_update_interval", 'diff' always little
>> than 'mmp_update_interval', so there will never trigger detection.
>> Introduce last_check_time record previous check time.
>>
>> Signed-off-by: Ye Bin <yebin10@huawei.com>
> I think the check is there only for the case where write_mmp_block() +
> sleep took longer than mmp_check_interval. I agree that should rarely
> happen but on a really busy system it is possible and in that case we would
> miss updating mmp block for too long and so another node could have started
> using the filesystem. I actually don't see a reason why kmmpd should be
> checking the block each mmp_check_interval as you do - mmp_check_interval
> is just for ext4_multi_mount_protect() to know how long it should wait
> before considering mmp block stale... Am I missing something?
>
> 								Honza
I'm sorry, I didn't understand the detection mechanism here before. Now 
I understand
the detection mechanism here.
As you said, it's just an abnormal protection. There's really no problem.

>> ---
>>   fs/ext4/mmp.c | 14 +++++++++-----
>>   1 file changed, 9 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
>> index 12af6dc8457b..c781b09a78c9 100644
>> --- a/fs/ext4/mmp.c
>> +++ b/fs/ext4/mmp.c
>> @@ -152,6 +152,7 @@ static int kmmpd(void *data)
>>   	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
>>   	unsigned mmp_check_interval;
>>   	unsigned long last_update_time;
>> +	unsigned long last_check_time;
>>   	unsigned long diff;
>>   	int retval = 0;
>>   
>> @@ -170,6 +171,7 @@ static int kmmpd(void *data)
>>   
>>   	memcpy(mmp->mmp_nodename, init_utsname()->nodename,
>>   	       sizeof(mmp->mmp_nodename));
>> +	last_check_time = jiffies;
>>   
>>   	while (!kthread_should_stop() && !sb_rdonly(sb)) {
>>   		if (!ext4_has_feature_mmp(sb)) {
>> @@ -198,17 +200,18 @@ static int kmmpd(void *data)
>>   		}
>>   
>>   		diff = jiffies - last_update_time;
>> -		if (diff < mmp_update_interval * HZ)
>> +		if (diff < mmp_update_interval * HZ) {
>>   			schedule_timeout_interruptible(mmp_update_interval *
>>   						       HZ - diff);
>> +			diff = jiffies - last_update_time;
>> +		}
>>   
>>   		/*
>>   		 * We need to make sure that more than mmp_check_interval
>> -		 * seconds have not passed since writing. If that has happened
>> -		 * we need to check if the MMP block is as we left it.
>> +		 * seconds have not passed since check. If that has happened
>> +		 * we need to check if the MMP block is as we write it.
>>   		 */
>> -		diff = jiffies - last_update_time;
>> -		if (diff > mmp_check_interval * HZ) {
>> +		if (jiffies - last_check_time > mmp_check_interval * HZ) {
>>   			struct buffer_head *bh_check = NULL;
>>   			struct mmp_struct *mmp_check;
>>   
>> @@ -234,6 +237,7 @@ static int kmmpd(void *data)
>>   				goto wait_to_exit;
>>   			}
>>   			put_bh(bh_check);
>> +			last_check_time = jiffies;
>>   		}
>>   
>>   		 /*
>> -- 
>> 2.31.1
>>
yebin (H) Oct. 8, 2021, 2:38 a.m. UTC | #3
On 2021/10/8 9:56, yebin wrote:
>
>
> On 2021/10/7 20:31, Jan Kara wrote:
>> On Sat 11-09-21 17:00:55, Ye Bin wrote:
>>> kmmpd:
>>> ...
>>>      diff = jiffies - last_update_time;
>>>      if (diff > mmp_check_interval * HZ) {
>>> ...
>>> As "mmp_check_interval = 2 * mmp_update_interval", 'diff' always little
>>> than 'mmp_update_interval', so there will never trigger detection.
>>> Introduce last_check_time record previous check time.
>>>
>>> Signed-off-by: Ye Bin <yebin10@huawei.com>
>> I think the check is there only for the case where write_mmp_block() +
>> sleep took longer than mmp_check_interval. I agree that should rarely
>> happen but on a really busy system it is possible and in that case we 
>> would
>> miss updating mmp block for too long and so another node could have 
>> started
>> using the filesystem. I actually don't see a reason why kmmpd should be
>> checking the block each mmp_check_interval as you do - 
>> mmp_check_interval
>> is just for ext4_multi_mount_protect() to know how long it should wait
>> before considering mmp block stale... Am I missing something?
>>
>>                                 Honza
> I'm sorry, I didn't understand the detection mechanism here before. 
> Now I understand
> the detection mechanism here.
> As you said, it's just an abnormal protection. There's really no problem.
>
Yeah, i did test as following steps
hostA                        hostB
    mount
      ext4_multi_mount_protect  -> seq == EXT4_MMP_SEQ_CLEAN
         delay 5s after label "skip" so hostB will see seq is 
EXT4_MMP_SEQ_CLEAN
                        mount
                        ext4_multi_mount_protect -> seq == 
EXT4_MMP_SEQ_CLEAN
                                run  kmmpd
     run kmmpd

Actually,in this  situation kmmpd will not detect  confliction.
In  ext4_multi_mount_protect  function we write mmp data fisrt and wait 
'wait_time * HZ'  seconds,
read mmp data do check.Most of the time, If 'wait_time' is zero, it can 
pass check.
>>> ---
>>>   fs/ext4/mmp.c | 14 +++++++++-----
>>>   1 file changed, 9 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
>>> index 12af6dc8457b..c781b09a78c9 100644
>>> --- a/fs/ext4/mmp.c
>>> +++ b/fs/ext4/mmp.c
>>> @@ -152,6 +152,7 @@ static int kmmpd(void *data)
>>>       int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
>>>       unsigned mmp_check_interval;
>>>       unsigned long last_update_time;
>>> +    unsigned long last_check_time;
>>>       unsigned long diff;
>>>       int retval = 0;
>>>   @@ -170,6 +171,7 @@ static int kmmpd(void *data)
>>>         memcpy(mmp->mmp_nodename, init_utsname()->nodename,
>>>              sizeof(mmp->mmp_nodename));
>>> +    last_check_time = jiffies;
>>>         while (!kthread_should_stop() && !sb_rdonly(sb)) {
>>>           if (!ext4_has_feature_mmp(sb)) {
>>> @@ -198,17 +200,18 @@ static int kmmpd(void *data)
>>>           }
>>>             diff = jiffies - last_update_time;
>>> -        if (diff < mmp_update_interval * HZ)
>>> +        if (diff < mmp_update_interval * HZ) {
>>> schedule_timeout_interruptible(mmp_update_interval *
>>>                                  HZ - diff);
>>> +            diff = jiffies - last_update_time;
>>> +        }
>>>             /*
>>>            * We need to make sure that more than mmp_check_interval
>>> -         * seconds have not passed since writing. If that has happened
>>> -         * we need to check if the MMP block is as we left it.
>>> +         * seconds have not passed since check. If that has happened
>>> +         * we need to check if the MMP block is as we write it.
>>>            */
>>> -        diff = jiffies - last_update_time;
>>> -        if (diff > mmp_check_interval * HZ) {
>>> +        if (jiffies - last_check_time > mmp_check_interval * HZ) {
>>>               struct buffer_head *bh_check = NULL;
>>>               struct mmp_struct *mmp_check;
>>>   @@ -234,6 +237,7 @@ static int kmmpd(void *data)
>>>                   goto wait_to_exit;
>>>               }
>>>               put_bh(bh_check);
>>> +            last_check_time = jiffies;
>>>           }
>>>              /*
>>> -- 
>>> 2.31.1
>>>
>
Jan Kara Oct. 12, 2021, 8:47 a.m. UTC | #4
On Fri 08-10-21 10:38:31, yebin wrote:
> On 2021/10/8 9:56, yebin wrote:
> > On 2021/10/7 20:31, Jan Kara wrote:
> > > On Sat 11-09-21 17:00:55, Ye Bin wrote:
> > > > kmmpd:
> > > > ...
> > > >      diff = jiffies - last_update_time;
> > > >      if (diff > mmp_check_interval * HZ) {
> > > > ...
> > > > As "mmp_check_interval = 2 * mmp_update_interval", 'diff' always little
> > > > than 'mmp_update_interval', so there will never trigger detection.
> > > > Introduce last_check_time record previous check time.
> > > > 
> > > > Signed-off-by: Ye Bin <yebin10@huawei.com>
> > > I think the check is there only for the case where write_mmp_block() +
> > > sleep took longer than mmp_check_interval. I agree that should rarely
> > > happen but on a really busy system it is possible and in that case
> > > we would
> > > miss updating mmp block for too long and so another node could have
> > > started
> > > using the filesystem. I actually don't see a reason why kmmpd should be
> > > checking the block each mmp_check_interval as you do -
> > > mmp_check_interval
> > > is just for ext4_multi_mount_protect() to know how long it should wait
> > > before considering mmp block stale... Am I missing something?
> > > 
> > >                                 Honza
> > I'm sorry, I didn't understand the detection mechanism here before. Now
> > I understand
> > the detection mechanism here.
> > As you said, it's just an abnormal protection. There's really no problem.
> > 
> Yeah, i did test as following steps
> hostA                        hostB
>    mount
>      ext4_multi_mount_protect  -> seq == EXT4_MMP_SEQ_CLEAN
>         delay 5s after label "skip" so hostB will see seq is
> EXT4_MMP_SEQ_CLEAN
>                        mount
>                        ext4_multi_mount_protect -> seq == EXT4_MMP_SEQ_CLEAN
>                                run  kmmpd
>     run kmmpd
> 
> Actually,in this  situation kmmpd will not detect  confliction.
> In ext4_multi_mount_protect function we write mmp data first and wait
> 'wait_time * HZ'  seconds,
> read mmp data do check. Most of the time, If 'wait_time' is zero, it can pass
> check.

But how can be wait_time zero? As far as I'm reading the code, wait_time
must be at least EXT4_MMP_MIN_CHECK_INTERVAL...

								Honza
yebin (H) Oct. 12, 2021, 11:46 a.m. UTC | #5
On 2021/10/12 16:47, Jan Kara wrote:
> On Fri 08-10-21 10:38:31, yebin wrote:
>> On 2021/10/8 9:56, yebin wrote:
>>> On 2021/10/7 20:31, Jan Kara wrote:
>>>> On Sat 11-09-21 17:00:55, Ye Bin wrote:
>>>>> kmmpd:
>>>>> ...
>>>>>       diff = jiffies - last_update_time;
>>>>>       if (diff > mmp_check_interval * HZ) {
>>>>> ...
>>>>> As "mmp_check_interval = 2 * mmp_update_interval", 'diff' always little
>>>>> than 'mmp_update_interval', so there will never trigger detection.
>>>>> Introduce last_check_time record previous check time.
>>>>>
>>>>> Signed-off-by: Ye Bin <yebin10@huawei.com>
>>>> I think the check is there only for the case where write_mmp_block() +
>>>> sleep took longer than mmp_check_interval. I agree that should rarely
>>>> happen but on a really busy system it is possible and in that case
>>>> we would
>>>> miss updating mmp block for too long and so another node could have
>>>> started
>>>> using the filesystem. I actually don't see a reason why kmmpd should be
>>>> checking the block each mmp_check_interval as you do -
>>>> mmp_check_interval
>>>> is just for ext4_multi_mount_protect() to know how long it should wait
>>>> before considering mmp block stale... Am I missing something?
>>>>
>>>>                                  Honza
>>> I'm sorry, I didn't understand the detection mechanism here before. Now
>>> I understand
>>> the detection mechanism here.
>>> As you said, it's just an abnormal protection. There's really no problem.
>>>
>> Yeah, i did test as following steps
>> hostA                        hostB
>>     mount
>>       ext4_multi_mount_protect  -> seq == EXT4_MMP_SEQ_CLEAN
>>          delay 5s after label "skip" so hostB will see seq is
>> EXT4_MMP_SEQ_CLEAN
>>                         mount
>>                         ext4_multi_mount_protect -> seq == EXT4_MMP_SEQ_CLEAN
>>                                 run  kmmpd
>>      run kmmpd
>>
>> Actually,in this  situation kmmpd will not detect  confliction.
>> In ext4_multi_mount_protect function we write mmp data first and wait
>> 'wait_time * HZ'  seconds,
>> read mmp data do check. Most of the time, If 'wait_time' is zero, it can pass
>> check.
> But how can be wait_time zero? As far as I'm reading the code, wait_time
> must be at least EXT4_MMP_MIN_CHECK_INTERVAL...
>
> 								Honza
  int ext4_multi_mount_protect(struct super_block *sb,
                                      ext4_fsblk_t mmp_block)
  {
          struct ext4_super_block *es = EXT4_SB(sb)->s_es;
          struct buffer_head *bh = NULL;
          struct mmp_struct *mmp = NULL;
          u32 seq;
          unsigned int mmp_check_interval = 
le16_to_cpu(es->s_mmp_update_interval);
          unsigned int wait_time = 0;                    --> wait_time 
is equal with zero
          int retval;

          if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
              mmp_block >= ext4_blocks_count(es)) {
                  ext4_warning(sb, "Invalid MMP block in superblock");
                  goto failed;
          }

          retval = read_mmp_block(sb, &bh, mmp_block);
          if (retval)
                  goto failed;

          mmp = (struct mmp_struct *)(bh->b_data);

          if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
                  mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;

          /*
           * If check_interval in MMP block is larger, use that instead of
           * update_interval from the superblock.
           */
          if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
                  mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);

          seq = le32_to_cpu(mmp->mmp_seq);
          if (seq == EXT4_MMP_SEQ_CLEAN)   --> If hostA and hostB mount 
the same block device at the same time,
--> HostA and hostB  maybe get 'seq' with the same value 
EXT4_MMP_SEQ_CLEAN.
                  goto skip;
...
skip:
         /*
          * write a new random sequence number.
          */
         seq = mmp_new_seq();
         mmp->mmp_seq = cpu_to_le32(seq);

         retval = write_mmp_block(sb, bh);
         if (retval)
                 goto failed;

         /*
          * wait for MMP interval and check mmp_seq.
          */
         if (schedule_timeout_interruptible(HZ * wait_time) != 0) 
{        --> If seq is equal with EXT4_MMP_SEQ_CLEAN, wait_time is zero.
                 ext4_warning(sb, "MMP startup interrupted, failing mount");
                 goto failed;
         }

         retval = read_mmp_block(sb, &bh, mmp_block); -->We may get the 
same data with which we wrote, so we can't detect conflict at here.
         if (retval)
                 goto failed;
         mmp = (struct mmp_struct *)(bh->b_data);
         if (seq != le32_to_cpu(mmp->mmp_seq)) {
                 dump_mmp_msg(sb, mmp,
                              "Device is already active on another node.");
                 goto failed;
         }
...
}
Jan Kara Oct. 13, 2021, 9:38 a.m. UTC | #6
On Tue 12-10-21 19:46:24, yebin wrote:
> On 2021/10/12 16:47, Jan Kara wrote:
> > On Fri 08-10-21 10:38:31, yebin wrote:
> > > On 2021/10/8 9:56, yebin wrote:
> > > > On 2021/10/7 20:31, Jan Kara wrote:
> > > > > On Sat 11-09-21 17:00:55, Ye Bin wrote:
> > > > > > kmmpd:
> > > > > > ...
> > > > > >       diff = jiffies - last_update_time;
> > > > > >       if (diff > mmp_check_interval * HZ) {
> > > > > > ...
> > > > > > As "mmp_check_interval = 2 * mmp_update_interval", 'diff' always little
> > > > > > than 'mmp_update_interval', so there will never trigger detection.
> > > > > > Introduce last_check_time record previous check time.
> > > > > > 
> > > > > > Signed-off-by: Ye Bin <yebin10@huawei.com>
> > > > > I think the check is there only for the case where write_mmp_block() +
> > > > > sleep took longer than mmp_check_interval. I agree that should rarely
> > > > > happen but on a really busy system it is possible and in that case
> > > > > we would
> > > > > miss updating mmp block for too long and so another node could have
> > > > > started
> > > > > using the filesystem. I actually don't see a reason why kmmpd should be
> > > > > checking the block each mmp_check_interval as you do -
> > > > > mmp_check_interval
> > > > > is just for ext4_multi_mount_protect() to know how long it should wait
> > > > > before considering mmp block stale... Am I missing something?
> > > > > 
> > > > >                                  Honza
> > > > I'm sorry, I didn't understand the detection mechanism here before. Now
> > > > I understand
> > > > the detection mechanism here.
> > > > As you said, it's just an abnormal protection. There's really no problem.
> > > > 
> > > Yeah, i did test as following steps
> > > hostA                        hostB
> > >     mount
> > >       ext4_multi_mount_protect  -> seq == EXT4_MMP_SEQ_CLEAN
> > >          delay 5s after label "skip" so hostB will see seq is
> > > EXT4_MMP_SEQ_CLEAN
> > >                         mount
> > >                         ext4_multi_mount_protect -> seq == EXT4_MMP_SEQ_CLEAN
> > >                                 run  kmmpd
> > >      run kmmpd
> > > 
> > > Actually,in this  situation kmmpd will not detect  confliction.
> > > In ext4_multi_mount_protect function we write mmp data first and wait
> > > 'wait_time * HZ'  seconds,
> > > read mmp data do check. Most of the time, If 'wait_time' is zero, it can pass
> > > check.
> > But how can be wait_time zero? As far as I'm reading the code, wait_time
> > must be at least EXT4_MMP_MIN_CHECK_INTERVAL...
> > 
> > 								Honza
>  int ext4_multi_mount_protect(struct super_block *sb,
>                                      ext4_fsblk_t mmp_block)
>  {
>          struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>          struct buffer_head *bh = NULL;
>          struct mmp_struct *mmp = NULL;
>          u32 seq;
>          unsigned int mmp_check_interval =
> le16_to_cpu(es->s_mmp_update_interval);
>          unsigned int wait_time = 0;                    --> wait_time is
> equal with zero
>          int retval;
> 
>          if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
>              mmp_block >= ext4_blocks_count(es)) {
>                  ext4_warning(sb, "Invalid MMP block in superblock");
>                  goto failed;
>          }
> 
>          retval = read_mmp_block(sb, &bh, mmp_block);
>          if (retval)
>                  goto failed;
> 
>          mmp = (struct mmp_struct *)(bh->b_data);
> 
>          if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
>                  mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
> 
>          /*
>           * If check_interval in MMP block is larger, use that instead of
>           * update_interval from the superblock.
>           */
>          if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
>                  mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
> 
>          seq = le32_to_cpu(mmp->mmp_seq);
>          if (seq == EXT4_MMP_SEQ_CLEAN)   --> If hostA and hostB mount the
> same block device at the same time,
> --> HostA and hostB  maybe get 'seq' with the same value EXT4_MMP_SEQ_CLEAN.
>                  goto skip;

Oh, I see. Thanks for explanation. 

> ...
> skip:
>         /*
>          * write a new random sequence number.
>          */
>         seq = mmp_new_seq();
>         mmp->mmp_seq = cpu_to_le32(seq);
> 
>         retval = write_mmp_block(sb, bh);
>         if (retval)
>                 goto failed;
> 
>         /*
>          * wait for MMP interval and check mmp_seq.
>          */
>         if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
> --> If seq is equal with EXT4_MMP_SEQ_CLEAN, wait_time is zero.
>                 ext4_warning(sb, "MMP startup interrupted, failing mount");
>                 goto failed;
>         }
> 
>         retval = read_mmp_block(sb, &bh, mmp_block); -->We may get the same
> data with which we wrote, so we can't detect conflict at here.

OK, I see. So the race in ext4_multi_mount_protect() goes like:

hostA				hostB

read_mmp_block()		read_mmp_block()
- sees EXT4_MMP_SEQ_CLEAN	- sees EXT4_MMP_SEQ_CLEAN
write_mmp_block()
wait_time == 0 -> no wait
read_mmp_block()
  - all OK, mount
				write_mmp_block()
				wait_time == 0 -> no wait
				read_mmp_block()
				  - all OK, mount

Do I get it right? Actually, if we passed seq we wrote in
ext4_multi_mount_protect() to kmmpd (probably in sb), then kmmpd would
notice the conflict on its first invocation but still that would be a bit
late because there would be a time window where hostA and hostB would be
both using the fs.

We could reduce the likelyhood of this race by always waiting in
ext4_multi_mount_protect() between write & read but I guess that is
undesirable as it would slow down all clean mounts. Ted?

								Honza
yebin (H) Oct. 13, 2021, 12:33 p.m. UTC | #7
On 2021/10/13 17:38, Jan Kara wrote:
> On Tue 12-10-21 19:46:24, yebin wrote:
>> On 2021/10/12 16:47, Jan Kara wrote:
>>> On Fri 08-10-21 10:38:31, yebin wrote:
>>>> On 2021/10/8 9:56, yebin wrote:
>>>>> On 2021/10/7 20:31, Jan Kara wrote:
>>>>>> On Sat 11-09-21 17:00:55, Ye Bin wrote:
>>>>>>> kmmpd:
>>>>>>> ...
>>>>>>>        diff = jiffies - last_update_time;
>>>>>>>        if (diff > mmp_check_interval * HZ) {
>>>>>>> ...
>>>>>>> As "mmp_check_interval = 2 * mmp_update_interval", 'diff' always little
>>>>>>> than 'mmp_update_interval', so there will never trigger detection.
>>>>>>> Introduce last_check_time record previous check time.
>>>>>>>
>>>>>>> Signed-off-by: Ye Bin <yebin10@huawei.com>
>>>>>> I think the check is there only for the case where write_mmp_block() +
>>>>>> sleep took longer than mmp_check_interval. I agree that should rarely
>>>>>> happen but on a really busy system it is possible and in that case
>>>>>> we would
>>>>>> miss updating mmp block for too long and so another node could have
>>>>>> started
>>>>>> using the filesystem. I actually don't see a reason why kmmpd should be
>>>>>> checking the block each mmp_check_interval as you do -
>>>>>> mmp_check_interval
>>>>>> is just for ext4_multi_mount_protect() to know how long it should wait
>>>>>> before considering mmp block stale... Am I missing something?
>>>>>>
>>>>>>                                   Honza
>>>>> I'm sorry, I didn't understand the detection mechanism here before. Now
>>>>> I understand
>>>>> the detection mechanism here.
>>>>> As you said, it's just an abnormal protection. There's really no problem.
>>>>>
>>>> Yeah, i did test as following steps
>>>> hostA                        hostB
>>>>      mount
>>>>        ext4_multi_mount_protect  -> seq == EXT4_MMP_SEQ_CLEAN
>>>>           delay 5s after label "skip" so hostB will see seq is
>>>> EXT4_MMP_SEQ_CLEAN
>>>>                          mount
>>>>                          ext4_multi_mount_protect -> seq == EXT4_MMP_SEQ_CLEAN
>>>>                                  run  kmmpd
>>>>       run kmmpd
>>>>
>>>> Actually,in this  situation kmmpd will not detect  confliction.
>>>> In ext4_multi_mount_protect function we write mmp data first and wait
>>>> 'wait_time * HZ'  seconds,
>>>> read mmp data do check. Most of the time, If 'wait_time' is zero, it can pass
>>>> check.
>>> But how can be wait_time zero? As far as I'm reading the code, wait_time
>>> must be at least EXT4_MMP_MIN_CHECK_INTERVAL...
>>>
>>> 								Honza
>>   int ext4_multi_mount_protect(struct super_block *sb,
>>                                       ext4_fsblk_t mmp_block)
>>   {
>>           struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>>           struct buffer_head *bh = NULL;
>>           struct mmp_struct *mmp = NULL;
>>           u32 seq;
>>           unsigned int mmp_check_interval =
>> le16_to_cpu(es->s_mmp_update_interval);
>>           unsigned int wait_time = 0;                    --> wait_time is
>> equal with zero
>>           int retval;
>>
>>           if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
>>               mmp_block >= ext4_blocks_count(es)) {
>>                   ext4_warning(sb, "Invalid MMP block in superblock");
>>                   goto failed;
>>           }
>>
>>           retval = read_mmp_block(sb, &bh, mmp_block);
>>           if (retval)
>>                   goto failed;
>>
>>           mmp = (struct mmp_struct *)(bh->b_data);
>>
>>           if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
>>                   mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
>>
>>           /*
>>            * If check_interval in MMP block is larger, use that instead of
>>            * update_interval from the superblock.
>>            */
>>           if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
>>                   mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
>>
>>           seq = le32_to_cpu(mmp->mmp_seq);
>>           if (seq == EXT4_MMP_SEQ_CLEAN)   --> If hostA and hostB mount the
>> same block device at the same time,
>> --> HostA and hostB  maybe get 'seq' with the same value EXT4_MMP_SEQ_CLEAN.
>>                   goto skip;
> Oh, I see. Thanks for explanation.
>
>> ...
>> skip:
>>          /*
>>           * write a new random sequence number.
>>           */
>>          seq = mmp_new_seq();
>>          mmp->mmp_seq = cpu_to_le32(seq);
>>
>>          retval = write_mmp_block(sb, bh);
>>          if (retval)
>>                  goto failed;
>>
>>          /*
>>           * wait for MMP interval and check mmp_seq.
>>           */
>>          if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
>> --> If seq is equal with EXT4_MMP_SEQ_CLEAN, wait_time is zero.
>>                  ext4_warning(sb, "MMP startup interrupted, failing mount");
>>                  goto failed;
>>          }
>>
>>          retval = read_mmp_block(sb, &bh, mmp_block); -->We may get the same
>> data with which we wrote, so we can't detect conflict at here.
> OK, I see. So the race in ext4_multi_mount_protect() goes like:
>
> hostA				hostB
>
> read_mmp_block()		read_mmp_block()
> - sees EXT4_MMP_SEQ_CLEAN	- sees EXT4_MMP_SEQ_CLEAN
> write_mmp_block()
> wait_time == 0 -> no wait
> read_mmp_block()
>    - all OK, mount
> 				write_mmp_block()
> 				wait_time == 0 -> no wait
> 				read_mmp_block()
> 				  - all OK, mount
Yes, that's what i mean.
>
> Do I get it right? Actually, if we passed seq we wrote in
> ext4_multi_mount_protect() to kmmpd (probably in sb), then kmmpd would
> notice the conflict on its first invocation but still that would be a bit
> late because there would be a time window where hostA and hostB would be
> both using the fs.
>
> We could reduce the likelyhood of this race by always waiting in
> ext4_multi_mount_protect() between write & read but I guess that is
> undesirable as it would slow down all clean mounts. Ted?
>
> 								Honza
Theodore Ts'o Oct. 13, 2021, 9:41 p.m. UTC | #8
On Wed, Oct 13, 2021 at 11:38:47AM +0200, Jan Kara wrote:
> 
> OK, I see. So the race in ext4_multi_mount_protect() goes like:
> 
> hostA				hostB
> 
> read_mmp_block()		read_mmp_block()
> - sees EXT4_MMP_SEQ_CLEAN	- sees EXT4_MMP_SEQ_CLEAN
> write_mmp_block()
> wait_time == 0 -> no wait
> read_mmp_block()
>   - all OK, mount
> 				write_mmp_block()
> 				wait_time == 0 -> no wait
> 				read_mmp_block()
> 				  - all OK, mount
> 
> Do I get it right? Actually, if we passed seq we wrote in
> ext4_multi_mount_protect() to kmmpd (probably in sb), then kmmpd would
> notice the conflict on its first invocation but still that would be a bit
> late because there would be a time window where hostA and hostB would be
> both using the fs.
> 
> We could reduce the likelyhood of this race by always waiting in
> ext4_multi_mount_protect() between write & read but I guess that is
> undesirable as it would slow down all clean mounts. Ted?

I'd like Andreas to comment here.  My understanding is that MMP
originally intended as a safety mechanism which would be used as part
of a primary/backup high availability system, but not as the *primary*
system where you might try to have two servers simultaneously try to
mount the file system and use MMP as the "election" mechanism to
decide which server is going to be the primary system, and which would
be the backup system.

The cost of being able to handle this particular race is it would slow
down the mounts of cleanly unmounted systems.

There *are* better systems to implement leader elections[1] than using
MMP.  Most of these more efficient leader elections assume that you
have a working IP network, and so if you have a separate storage
network (including a shared SCSI bus) from your standard IP network,
then MMP is a useful failsafe in the face of a network partition of
your IP network.  The question is whether MMP should be useful for
more than that.  And if it isn't, then we should probably document
what MMP is and isn't good for, and give advice in the form of an
application note for how MMP should be used in the context of a larger
system.

[1] https://en.wikipedia.org/wiki/Leader_election

						- Ted
Andreas Dilger Oct. 15, 2021, 3:21 a.m. UTC | #9
On Oct 13, 2021, at 3:41 PM, Theodore Ts'o <tytso@mit.edu> wrote:
> 
> On Wed, Oct 13, 2021 at 11:38:47AM +0200, Jan Kara wrote:
>> 
>> OK, I see. So the race in ext4_multi_mount_protect() goes like:
>> 
>> hostA				hostB
>> 
>> read_mmp_block()		read_mmp_block()
>> - sees EXT4_MMP_SEQ_CLEAN	- sees EXT4_MMP_SEQ_CLEAN
>> write_mmp_block()
>> wait_time == 0 -> no wait
>> read_mmp_block()
>>  - all OK, mount
>> 				write_mmp_block()
>> 				wait_time == 0 -> no wait
>> 				read_mmp_block()
>> 				  - all OK, mount
>> 
>> Do I get it right? Actually, if we passed seq we wrote in
>> ext4_multi_mount_protect() to kmmpd (probably in sb), then kmmpd would
>> notice the conflict on its first invocation but still that would be a bit
>> late because there would be a time window where hostA and hostB would be
>> both using the fs.

It would be enough to have even a short delay between write and read to
detect this case.  I _thought_ there should be a delay in this case,
but maybe it was removed after the patch was originally submitted?

>> We could reduce the likelyhood of this race by always waiting in
>> ext4_multi_mount_protect() between write & read but I guess that is
>> undesirable as it would slow down all clean mounts. Ted?
> 
> I'd like Andreas to comment here.  My understanding is that MMP
> originally intended as a safety mechanism which would be used as part
> of a primary/backup high availability system, but not as the *primary*
> system where you might try to have two servers simultaneously try to
> mount the file system and use MMP as the "election" mechanism to
> decide which server is going to be the primary system, and which would
> be the backup system.
> 
> The cost of being able to handle this particular race is it would slow
> down the mounts of cleanly unmounted systems.

Ted's understanding is correct - MMP is intended to be a backup mechanism
to prevent filesystem corruption in the case where external HA methods
do the wrong thing.  This has avoided problems countless times on systems
with multi-port access to the same storage, and can also be useful in the
case of shared VM images accessed over the network, and similar.

When MMP was implemented for ZFS, a slightly different mechanism was used.
Rather than having the delay to detect concurrent mounts, it instead writes
to multiple different blocks in a random order, and then reads them all.
If two nodes try to mount the filesystem concurrently, they would pick
different block orders, and the chance of them having the same order (and
one clobbering all of the blocks of the other) would be 1/2^num_blocks.
The drawback is that this would consume more space in the filesystem, but
it wouldn't be a huge deal these days.

> There *are* better systems to implement leader elections[1] than using
> MMP.  Most of these more efficient leader elections assume that you
> have a working IP network, and so if you have a separate storage
> network (including a shared SCSI bus) from your standard IP network,
> then MMP is a useful failsafe in the face of a network partition of
> your IP network.  The question is whether MMP should be useful for
> more than that.  And if it isn't, then we should probably document
> what MMP is and isn't good for, and give advice in the form of an
> application note for how MMP should be used in the context of a larger
> system.

One of the existing failure cases with HA that MMP detects is loss of
network connection, so I wouldn't want to depend on that.

Cheers, Andreas
Andreas Dilger Oct. 15, 2021, 3:21 a.m. UTC | #10
On Oct 13, 2021, at 3:41 PM, Theodore Ts'o <tytso@mit.edu> wrote:
> 
> On Wed, Oct 13, 2021 at 11:38:47AM +0200, Jan Kara wrote:
>> 
>> OK, I see. So the race in ext4_multi_mount_protect() goes like:
>> 
>> hostA				hostB
>> 
>> read_mmp_block()		read_mmp_block()
>> - sees EXT4_MMP_SEQ_CLEAN	- sees EXT4_MMP_SEQ_CLEAN
>> write_mmp_block()
>> wait_time == 0 -> no wait
>> read_mmp_block()
>>  - all OK, mount
>> 				write_mmp_block()
>> 				wait_time == 0 -> no wait
>> 				read_mmp_block()
>> 				  - all OK, mount
>> 
>> Do I get it right? Actually, if we passed seq we wrote in
>> ext4_multi_mount_protect() to kmmpd (probably in sb), then kmmpd would
>> notice the conflict on its first invocation but still that would be a bit
>> late because there would be a time window where hostA and hostB would be
>> both using the fs.

It would be enough to have even a short delay between write and read to
detect this case.  I _thought_ there should be a delay in this case,
but maybe it was removed after the patch was originally submitted?

>> We could reduce the likelyhood of this race by always waiting in
>> ext4_multi_mount_protect() between write & read but I guess that is
>> undesirable as it would slow down all clean mounts. Ted?
> 
> I'd like Andreas to comment here.  My understanding is that MMP
> originally intended as a safety mechanism which would be used as part
> of a primary/backup high availability system, but not as the *primary*
> system where you might try to have two servers simultaneously try to
> mount the file system and use MMP as the "election" mechanism to
> decide which server is going to be the primary system, and which would
> be the backup system.
> 
> The cost of being able to handle this particular race is it would slow
> down the mounts of cleanly unmounted systems.

Ted's understanding is correct - MMP is intended to be a backup mechanism
to prevent filesystem corruption in the case where external HA methods
do the wrong thing.  This has avoided problems countless times on systems
with multi-port access to the same storage, and can also be useful in the
case of shared VM images accessed over the network, and similar.

When MMP was implemented for ZFS, a slightly different mechanism was used.
Rather than having the delay to detect concurrent mounts, it instead writes
to multiple different blocks in a random order, and then reads them all.
If two nodes try to mount the filesystem concurrently, they would pick
different block orders, and the chance of them having the same order (and
one clobbering all of the blocks of the other) would be 1/2^num_blocks.
The drawback is that this would consume more space in the filesystem, but
it wouldn't be a huge deal these days.

> There *are* better systems to implement leader elections[1] than using
> MMP.  Most of these more efficient leader elections assume that you
> have a working IP network, and so if you have a separate storage
> network (including a shared SCSI bus) from your standard IP network,
> then MMP is a useful failsafe in the face of a network partition of
> your IP network.  The question is whether MMP should be useful for
> more than that.  And if it isn't, then we should probably document
> what MMP is and isn't good for, and give advice in the form of an
> application note for how MMP should be used in the context of a larger
> system.

One of the existing failure cases with HA that MMP detects is loss of
network connection, so I wouldn't want to depend on that.

Cheers, Andreas
diff mbox series

Patch

diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 12af6dc8457b..c781b09a78c9 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -152,6 +152,7 @@  static int kmmpd(void *data)
 	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
 	unsigned mmp_check_interval;
 	unsigned long last_update_time;
+	unsigned long last_check_time;
 	unsigned long diff;
 	int retval = 0;
 
@@ -170,6 +171,7 @@  static int kmmpd(void *data)
 
 	memcpy(mmp->mmp_nodename, init_utsname()->nodename,
 	       sizeof(mmp->mmp_nodename));
+	last_check_time = jiffies;
 
 	while (!kthread_should_stop() && !sb_rdonly(sb)) {
 		if (!ext4_has_feature_mmp(sb)) {
@@ -198,17 +200,18 @@  static int kmmpd(void *data)
 		}
 
 		diff = jiffies - last_update_time;
-		if (diff < mmp_update_interval * HZ)
+		if (diff < mmp_update_interval * HZ) {
 			schedule_timeout_interruptible(mmp_update_interval *
 						       HZ - diff);
+			diff = jiffies - last_update_time;
+		}
 
 		/*
 		 * We need to make sure that more than mmp_check_interval
-		 * seconds have not passed since writing. If that has happened
-		 * we need to check if the MMP block is as we left it.
+		 * seconds have not passed since check. If that has happened
+		 * we need to check if the MMP block is as we write it.
 		 */
-		diff = jiffies - last_update_time;
-		if (diff > mmp_check_interval * HZ) {
+		if (jiffies - last_check_time > mmp_check_interval * HZ) {
 			struct buffer_head *bh_check = NULL;
 			struct mmp_struct *mmp_check;
 
@@ -234,6 +237,7 @@  static int kmmpd(void *data)
 				goto wait_to_exit;
 			}
 			put_bh(bh_check);
+			last_check_time = jiffies;
 		}
 
 		 /*