diff mbox series

[ovs-dev] northd: fix infinite loop in ovn_allocate_tnlid()

Message ID 20240401121510.758326-1-odivlad@gmail.com
State Accepted
Headers show
Series [ovs-dev] northd: fix infinite loop in ovn_allocate_tnlid() | expand

Checks

Context Check Description
ovsrobot/apply-robot warning apply and check: warning
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Vladislav Odintsov April 1, 2024, 12:15 p.m. UTC
In case if all tunnel ids are exhausted, ovn_allocate_tnlid() function
iterates over tnlids indefinitely when *hint is outside of [min, max].
This is because when tnlid reaches max, next tnlid is min and for-loop
never reaches exit condition for tnlid != *hint.

This patch fixes mentioned issue and adds a testcase.

Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
---
 lib/ovn-util.c      | 10 +++++++---
 tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)

Comments

Mark Michelson April 1, 2024, 2:27 p.m. UTC | #1
Thanks Vladislav,

Acked-by: Mark Michelson <mmichels@redhat.com>

On 4/1/24 08:15, Vladislav Odintsov wrote:
> In case if all tunnel ids are exhausted, ovn_allocate_tnlid() function
> iterates over tnlids indefinitely when *hint is outside of [min, max].
> This is because when tnlid reaches max, next tnlid is min and for-loop
> never reaches exit condition for tnlid != *hint.
> 
> This patch fixes mentioned issue and adds a testcase.
> 
> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
> ---
>   lib/ovn-util.c      | 10 +++++++---
>   tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
>   2 files changed, 33 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
> index ee5cbcdc3..9f97ae2ca 100644
> --- a/lib/ovn-util.c
> +++ b/lib/ovn-util.c
> @@ -693,13 +693,17 @@ uint32_t
>   ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
>                      uint32_t max, uint32_t *hint)
>   {
> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid != *hint;
> -         tnlid = next_tnlid(tnlid, min, max)) {
> +    /* Normalize hint, because it can be outside of [min, max]. */
> +    *hint = next_tnlid(*hint, min, max);
> +
> +    uint32_t tnlid = *hint;
> +    do {
>           if (ovn_add_tnlid(set, tnlid)) {
>               *hint = tnlid;
>               return tnlid;
>           }
> -    }
> +        tnlid = next_tnlid(tnlid, min, max);
> +    } while (tnlid != *hint);
>   
>       static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>       VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index cd53755b2..174dbacda 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 = 123])
>   AT_CLEANUP
>   ])
>   
> +OVN_FOR_EACH_NORTHD_NO_HV([
> +AT_SETUP([check tunnel ids exhaustion])
> +ovn_start
> +
> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key to 2^12
> +ovn-sbctl \
> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1" type="vxlan" \
> +    -- --id=@c create chassis name=hv1 encaps=@e
> +
> +cmd="ovn-nbctl --wait=sb"
> +
> +for i in {1..4097..1}; do
> +    cmd="${cmd} -- ls-add lsw-${i}"
> +done
> +
> +eval $cmd
> +
> +check_row_count nb:Logical_Switch 4097
> +wait_row_count sb:Datapath_Binding 4095
> +
> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted" northd/ovn-northd.log])
> +
> +AT_CLEANUP
> +])
> +
> +
>   OVN_FOR_EACH_NORTHD_NO_HV([
>   AT_SETUP([Logical Flow Datapath Groups])
>   ovn_start
Dumitru Ceara April 4, 2024, 3:26 p.m. UTC | #2
On 4/1/24 16:27, Mark Michelson wrote:
> Thanks Vladislav,
> 
> Acked-by: Mark Michelson <mmichels@redhat.com>
> 

Thanks, Vladislav and Mark!  Applied to main and backported down to
23.06 with a minor test change, please see below.

> On 4/1/24 08:15, Vladislav Odintsov wrote:
>> In case if all tunnel ids are exhausted, ovn_allocate_tnlid() function
>> iterates over tnlids indefinitely when *hint is outside of [min, max].
>> This is because when tnlid reaches max, next tnlid is min and for-loop
>> never reaches exit condition for tnlid != *hint.
>>
>> This patch fixes mentioned issue and adds a testcase.
>>
>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
>> ---
>>   lib/ovn-util.c      | 10 +++++++---
>>   tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
>>   2 files changed, 33 insertions(+), 3 deletions(-)
>>
>> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
>> index ee5cbcdc3..9f97ae2ca 100644
>> --- a/lib/ovn-util.c
>> +++ b/lib/ovn-util.c
>> @@ -693,13 +693,17 @@ uint32_t
>>   ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
>>                      uint32_t max, uint32_t *hint)
>>   {
>> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid != *hint;
>> -         tnlid = next_tnlid(tnlid, min, max)) {
>> +    /* Normalize hint, because it can be outside of [min, max]. */
>> +    *hint = next_tnlid(*hint, min, max);
>> +
>> +    uint32_t tnlid = *hint;
>> +    do {
>>           if (ovn_add_tnlid(set, tnlid)) {
>>               *hint = tnlid;
>>               return tnlid;
>>           }
>> -    }
>> +        tnlid = next_tnlid(tnlid, min, max);
>> +    } while (tnlid != *hint);
>>         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>>       VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
>> index cd53755b2..174dbacda 100644
>> --- a/tests/ovn-northd.at
>> +++ b/tests/ovn-northd.at
>> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 = 123])
>>   AT_CLEANUP
>>   ])
>>   +OVN_FOR_EACH_NORTHD_NO_HV([
>> +AT_SETUP([check tunnel ids exhaustion])
>> +ovn_start
>> +
>> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key
>> to 2^12
>> +ovn-sbctl \
>> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1"
>> type="vxlan" \
>> +    -- --id=@c create chassis name=hv1 encaps=@e
>> +
>> +cmd="ovn-nbctl --wait=sb"
>> +
>> +for i in {1..4097..1}; do

This can be changed to:

for i in {1..4097}; do

>> +    cmd="${cmd} -- ls-add lsw-${i}"
>> +done
>> +
>> +eval $cmd
>> +
>> +check_row_count nb:Logical_Switch 4097
>> +wait_row_count sb:Datapath_Binding 4095
>> +
>> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted"
>> northd/ovn-northd.log])
>> +
>> +AT_CLEANUP
>> +])
>> +
>> +
>>   OVN_FOR_EACH_NORTHD_NO_HV([
>>   AT_SETUP([Logical Flow Datapath Groups])
>>   ovn_start

Regards,
Dumitru
Vladislav Odintsov April 4, 2024, 3:52 p.m. UTC | #3
Thanks Dumitru!
I’m totally fine with your change.
Should I send backport patches with resolved conflicts for remaining branches at least till 22.03, which is an LTS?

> On 4 Apr 2024, at 18:26, Dumitru Ceara <dceara@redhat.com> wrote:
> 
> On 4/1/24 16:27, Mark Michelson wrote:
>> Thanks Vladislav,
>> 
>> Acked-by: Mark Michelson <mmichels@redhat.com <mailto:mmichels@redhat.com>>
>> 
> 
> Thanks, Vladislav and Mark!  Applied to main and backported down to
> 23.06 with a minor test change, please see below.
> 
>> On 4/1/24 08:15, Vladislav Odintsov wrote:
>>> In case if all tunnel ids are exhausted, ovn_allocate_tnlid() function
>>> iterates over tnlids indefinitely when *hint is outside of [min, max].
>>> This is because when tnlid reaches max, next tnlid is min and for-loop
>>> never reaches exit condition for tnlid != *hint.
>>> 
>>> This patch fixes mentioned issue and adds a testcase.
>>> 
>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
>>> ---
>>>   lib/ovn-util.c      | 10 +++++++---
>>>   tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
>>>   2 files changed, 33 insertions(+), 3 deletions(-)
>>> 
>>> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
>>> index ee5cbcdc3..9f97ae2ca 100644
>>> --- a/lib/ovn-util.c
>>> +++ b/lib/ovn-util.c
>>> @@ -693,13 +693,17 @@ uint32_t
>>>   ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
>>>                      uint32_t max, uint32_t *hint)
>>>   {
>>> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid != *hint;
>>> -         tnlid = next_tnlid(tnlid, min, max)) {
>>> +    /* Normalize hint, because it can be outside of [min, max]. */
>>> +    *hint = next_tnlid(*hint, min, max);
>>> +
>>> +    uint32_t tnlid = *hint;
>>> +    do {
>>>           if (ovn_add_tnlid(set, tnlid)) {
>>>               *hint = tnlid;
>>>               return tnlid;
>>>           }
>>> -    }
>>> +        tnlid = next_tnlid(tnlid, min, max);
>>> +    } while (tnlid != *hint);
>>>         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>>>       VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
>>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
>>> index cd53755b2..174dbacda 100644
>>> --- a/tests/ovn-northd.at
>>> +++ b/tests/ovn-northd.at
>>> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 = 123])
>>>   AT_CLEANUP
>>>   ])
>>>   +OVN_FOR_EACH_NORTHD_NO_HV([
>>> +AT_SETUP([check tunnel ids exhaustion])
>>> +ovn_start
>>> +
>>> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key
>>> to 2^12
>>> +ovn-sbctl \
>>> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1"
>>> type="vxlan" \
>>> +    -- --id=@c create chassis name=hv1 encaps=@e
>>> +
>>> +cmd="ovn-nbctl --wait=sb"
>>> +
>>> +for i in {1..4097..1}; do
> 
> This can be changed to:
> 
> for i in {1..4097}; do
> 
>>> +    cmd="${cmd} -- ls-add lsw-${i}"
>>> +done
>>> +
>>> +eval $cmd
>>> +
>>> +check_row_count nb:Logical_Switch 4097
>>> +wait_row_count sb:Datapath_Binding 4095
>>> +
>>> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted"
>>> northd/ovn-northd.log])
>>> +
>>> +AT_CLEANUP
>>> +])
>>> +
>>> +
>>>   OVN_FOR_EACH_NORTHD_NO_HV([
>>>   AT_SETUP([Logical Flow Datapath Groups])
>>>   ovn_start
> 
> Regards,
> Dumitru
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org <mailto:dev@openvswitch.org>
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Regards,
Vladislav Odintsov
Dumitru Ceara April 4, 2024, 4:46 p.m. UTC | #4
On 4/4/24 17:52, Vladislav Odintsov wrote:
> Thanks Dumitru!
> I’m totally fine with your change.
> Should I send backport patches with resolved conflicts for remaining branches at least till 22.03, which is an LTS?
> 

Well, 24.03 is the most recent LTS.  We don't really backport patches to
22.03 unless they fix critical issues.  I'm not completely convinced
that this is such a critical issue though.  You need 4K logical
datapaths with vxlan enabled before this gets hit.  In any case, Mark,
what do you think?

Regards,
Dumitru

>> On 4 Apr 2024, at 18:26, Dumitru Ceara <dceara@redhat.com> wrote:
>>
>> On 4/1/24 16:27, Mark Michelson wrote:
>>> Thanks Vladislav,
>>>
>>> Acked-by: Mark Michelson <mmichels@redhat.com <mailto:mmichels@redhat.com>>
>>>
>>
>> Thanks, Vladislav and Mark!  Applied to main and backported down to
>> 23.06 with a minor test change, please see below.
>>
>>> On 4/1/24 08:15, Vladislav Odintsov wrote:
>>>> In case if all tunnel ids are exhausted, ovn_allocate_tnlid() function
>>>> iterates over tnlids indefinitely when *hint is outside of [min, max].
>>>> This is because when tnlid reaches max, next tnlid is min and for-loop
>>>> never reaches exit condition for tnlid != *hint.
>>>>
>>>> This patch fixes mentioned issue and adds a testcase.
>>>>
>>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
>>>> ---
>>>>   lib/ovn-util.c      | 10 +++++++---
>>>>   tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
>>>>   2 files changed, 33 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
>>>> index ee5cbcdc3..9f97ae2ca 100644
>>>> --- a/lib/ovn-util.c
>>>> +++ b/lib/ovn-util.c
>>>> @@ -693,13 +693,17 @@ uint32_t
>>>>   ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
>>>>                      uint32_t max, uint32_t *hint)
>>>>   {
>>>> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid != *hint;
>>>> -         tnlid = next_tnlid(tnlid, min, max)) {
>>>> +    /* Normalize hint, because it can be outside of [min, max]. */
>>>> +    *hint = next_tnlid(*hint, min, max);
>>>> +
>>>> +    uint32_t tnlid = *hint;
>>>> +    do {
>>>>           if (ovn_add_tnlid(set, tnlid)) {
>>>>               *hint = tnlid;
>>>>               return tnlid;
>>>>           }
>>>> -    }
>>>> +        tnlid = next_tnlid(tnlid, min, max);
>>>> +    } while (tnlid != *hint);
>>>>         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>>>>       VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
>>>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
>>>> index cd53755b2..174dbacda 100644
>>>> --- a/tests/ovn-northd.at
>>>> +++ b/tests/ovn-northd.at
>>>> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 = 123])
>>>>   AT_CLEANUP
>>>>   ])
>>>>   +OVN_FOR_EACH_NORTHD_NO_HV([
>>>> +AT_SETUP([check tunnel ids exhaustion])
>>>> +ovn_start
>>>> +
>>>> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key
>>>> to 2^12
>>>> +ovn-sbctl \
>>>> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1"
>>>> type="vxlan" \
>>>> +    -- --id=@c create chassis name=hv1 encaps=@e
>>>> +
>>>> +cmd="ovn-nbctl --wait=sb"
>>>> +
>>>> +for i in {1..4097..1}; do
>>
>> This can be changed to:
>>
>> for i in {1..4097}; do
>>
>>>> +    cmd="${cmd} -- ls-add lsw-${i}"
>>>> +done
>>>> +
>>>> +eval $cmd
>>>> +
>>>> +check_row_count nb:Logical_Switch 4097
>>>> +wait_row_count sb:Datapath_Binding 4095
>>>> +
>>>> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted"
>>>> northd/ovn-northd.log])
>>>> +
>>>> +AT_CLEANUP
>>>> +])
>>>> +
>>>> +
>>>>   OVN_FOR_EACH_NORTHD_NO_HV([
>>>>   AT_SETUP([Logical Flow Datapath Groups])
>>>>   ovn_start
>>
>> Regards,
>> Dumitru
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org <mailto:dev@openvswitch.org>
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> 
> 
> Regards,
> Vladislav Odintsov
> 
>
Ihar Hrachyshka April 4, 2024, 5:17 p.m. UTC | #5
I tried to revert the util change and the test case passed just fine.

I think the scenario that may get the hint out of bounds is 1) start with
no vxlan chassis; 2) create 4097 DPs; 3) add a vxlan chassis - this makes
northd downgrade its max key to 4096. Now when we create a DP, it will spin
in circles. Posted this here:
https://patchwork.ozlabs.org/project/ovn/patch/20240404171358.54678-1-ihrachys@redhat.com/

(We can probably discuss in this context whether it's a good idea for a
cluster to change the max tun id value as chassis come and go. Perhaps it
should be initialized once and for all?)

What I also notice is that with the new patch, *hint is always overridden
at the start of the function, so it's always bumped by 1. I am not sure it
was intended. Comments?

This is all probably relevant to the question of whether any backports
should happen for this patch.

Ihar


On Thu, Apr 4, 2024 at 12:46 PM Dumitru Ceara <dceara@redhat.com> wrote:

> On 4/4/24 17:52, Vladislav Odintsov wrote:
> > Thanks Dumitru!
> > I’m totally fine with your change.
> > Should I send backport patches with resolved conflicts for remaining
> branches at least till 22.03, which is an LTS?
> >
>
> Well, 24.03 is the most recent LTS.  We don't really backport patches to
> 22.03 unless they fix critical issues.  I'm not completely convinced
> that this is such a critical issue though.  You need 4K logical
> datapaths with vxlan enabled before this gets hit.  In any case, Mark,
> what do you think?
>
> Regards,
> Dumitru
>
> >> On 4 Apr 2024, at 18:26, Dumitru Ceara <dceara@redhat.com> wrote:
> >>
> >> On 4/1/24 16:27, Mark Michelson wrote:
> >>> Thanks Vladislav,
> >>>
> >>> Acked-by: Mark Michelson <mmichels@redhat.com <mailto:
> mmichels@redhat.com>>
> >>>
> >>
> >> Thanks, Vladislav and Mark!  Applied to main and backported down to
> >> 23.06 with a minor test change, please see below.
> >>
> >>> On 4/1/24 08:15, Vladislav Odintsov wrote:
> >>>> In case if all tunnel ids are exhausted, ovn_allocate_tnlid() function
> >>>> iterates over tnlids indefinitely when *hint is outside of [min, max].
> >>>> This is because when tnlid reaches max, next tnlid is min and for-loop
> >>>> never reaches exit condition for tnlid != *hint.
> >>>>
> >>>> This patch fixes mentioned issue and adds a testcase.
> >>>>
> >>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
> >>>> ---
> >>>>   lib/ovn-util.c      | 10 +++++++---
> >>>>   tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
> >>>>   2 files changed, 33 insertions(+), 3 deletions(-)
> >>>>
> >>>> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
> >>>> index ee5cbcdc3..9f97ae2ca 100644
> >>>> --- a/lib/ovn-util.c
> >>>> +++ b/lib/ovn-util.c
> >>>> @@ -693,13 +693,17 @@ uint32_t
> >>>>   ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
> >>>>                      uint32_t max, uint32_t *hint)
> >>>>   {
> >>>> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid !=
> *hint;
> >>>> -         tnlid = next_tnlid(tnlid, min, max)) {
> >>>> +    /* Normalize hint, because it can be outside of [min, max]. */
> >>>> +    *hint = next_tnlid(*hint, min, max);
> >>>> +
> >>>> +    uint32_t tnlid = *hint;
> >>>> +    do {
> >>>>           if (ovn_add_tnlid(set, tnlid)) {
> >>>>               *hint = tnlid;
> >>>>               return tnlid;
> >>>>           }
> >>>> -    }
> >>>> +        tnlid = next_tnlid(tnlid, min, max);
> >>>> +    } while (tnlid != *hint);
> >>>>         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
> >>>>       VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
> >>>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> >>>> index cd53755b2..174dbacda 100644
> >>>> --- a/tests/ovn-northd.at
> >>>> +++ b/tests/ovn-northd.at
> >>>> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 = 123])
> >>>>   AT_CLEANUP
> >>>>   ])
> >>>>   +OVN_FOR_EACH_NORTHD_NO_HV([
> >>>> +AT_SETUP([check tunnel ids exhaustion])
> >>>> +ovn_start
> >>>> +
> >>>> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key
> >>>> to 2^12
> >>>> +ovn-sbctl \
> >>>> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1"
> >>>> type="vxlan" \
> >>>> +    -- --id=@c create chassis name=hv1 encaps=@e
> >>>> +
> >>>> +cmd="ovn-nbctl --wait=sb"
> >>>> +
> >>>> +for i in {1..4097..1}; do
> >>
> >> This can be changed to:
> >>
> >> for i in {1..4097}; do
> >>
> >>>> +    cmd="${cmd} -- ls-add lsw-${i}"
> >>>> +done
> >>>> +
> >>>> +eval $cmd
> >>>> +
> >>>> +check_row_count nb:Logical_Switch 4097
> >>>> +wait_row_count sb:Datapath_Binding 4095
> >>>> +
> >>>> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted"
> >>>> northd/ovn-northd.log])
> >>>> +
> >>>> +AT_CLEANUP
> >>>> +])
> >>>> +
> >>>> +
> >>>>   OVN_FOR_EACH_NORTHD_NO_HV([
> >>>>   AT_SETUP([Logical Flow Datapath Groups])
> >>>>   ovn_start
> >>
> >> Regards,
> >> Dumitru
> >>
> >> _______________________________________________
> >> dev mailing list
> >> dev@openvswitch.org <mailto:dev@openvswitch.org>
> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >
> >
> > Regards,
> > Vladislav Odintsov
> >
> >
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Dumitru Ceara April 4, 2024, 5:45 p.m. UTC | #6
On 4/4/24 19:17, Ihar Hrachyshka wrote:
> I tried to revert the util change and the test case passed just fine.
> 

I had done that before pushing the patch but.. I got tricked by the fact
that northd was spinning and using cpu 100% while the switches were
added.  My bad.

> I think the scenario that may get the hint out of bounds is 1) start with
> no vxlan chassis; 2) create 4097 DPs; 3) add a vxlan chassis - this makes
> northd downgrade its max key to 4096. Now when we create a DP, it will spin
> in circles. Posted this here:
> https://patchwork.ozlabs.org/project/ovn/patch/20240404171358.54678-1-ihrachys@redhat.com/
> 
> (We can probably discuss in this context whether it's a good idea for a
> cluster to change the max tun id value as chassis come and go. Perhaps it
> should be initialized once and for all?)
> 
> What I also notice is that with the new patch, *hint is always overridden
> at the start of the function, so it's always bumped by 1. I am not sure it
> was intended. Comments?
> 

But the actual change in behavior for '*hint' is only for the case in
which we run out of IDs, or am I missing something?  It didn't seem that
big of a deal to me.

> This is all probably relevant to the question of whether any backports
> should happen for this patch.
> 
> Ihar
> 

Regards,
Dumitru

> 
> On Thu, Apr 4, 2024 at 12:46 PM Dumitru Ceara <dceara@redhat.com> wrote:
> 
>> On 4/4/24 17:52, Vladislav Odintsov wrote:
>>> Thanks Dumitru!
>>> I’m totally fine with your change.
>>> Should I send backport patches with resolved conflicts for remaining
>> branches at least till 22.03, which is an LTS?
>>>
>>
>> Well, 24.03 is the most recent LTS.  We don't really backport patches to
>> 22.03 unless they fix critical issues.  I'm not completely convinced
>> that this is such a critical issue though.  You need 4K logical
>> datapaths with vxlan enabled before this gets hit.  In any case, Mark,
>> what do you think?
>>
>> Regards,
>> Dumitru
>>
>>>> On 4 Apr 2024, at 18:26, Dumitru Ceara <dceara@redhat.com> wrote:
>>>>
>>>> On 4/1/24 16:27, Mark Michelson wrote:
>>>>> Thanks Vladislav,
>>>>>
>>>>> Acked-by: Mark Michelson <mmichels@redhat.com <mailto:
>> mmichels@redhat.com>>
>>>>>
>>>>
>>>> Thanks, Vladislav and Mark!  Applied to main and backported down to
>>>> 23.06 with a minor test change, please see below.
>>>>
>>>>> On 4/1/24 08:15, Vladislav Odintsov wrote:
>>>>>> In case if all tunnel ids are exhausted, ovn_allocate_tnlid() function
>>>>>> iterates over tnlids indefinitely when *hint is outside of [min, max].
>>>>>> This is because when tnlid reaches max, next tnlid is min and for-loop
>>>>>> never reaches exit condition for tnlid != *hint.
>>>>>>
>>>>>> This patch fixes mentioned issue and adds a testcase.
>>>>>>
>>>>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
>>>>>> ---
>>>>>>   lib/ovn-util.c      | 10 +++++++---
>>>>>>   tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
>>>>>>   2 files changed, 33 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
>>>>>> index ee5cbcdc3..9f97ae2ca 100644
>>>>>> --- a/lib/ovn-util.c
>>>>>> +++ b/lib/ovn-util.c
>>>>>> @@ -693,13 +693,17 @@ uint32_t
>>>>>>   ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
>>>>>>                      uint32_t max, uint32_t *hint)
>>>>>>   {
>>>>>> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid !=
>> *hint;
>>>>>> -         tnlid = next_tnlid(tnlid, min, max)) {
>>>>>> +    /* Normalize hint, because it can be outside of [min, max]. */
>>>>>> +    *hint = next_tnlid(*hint, min, max);
>>>>>> +
>>>>>> +    uint32_t tnlid = *hint;
>>>>>> +    do {
>>>>>>           if (ovn_add_tnlid(set, tnlid)) {
>>>>>>               *hint = tnlid;
>>>>>>               return tnlid;
>>>>>>           }
>>>>>> -    }
>>>>>> +        tnlid = next_tnlid(tnlid, min, max);
>>>>>> +    } while (tnlid != *hint);
>>>>>>         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>>>>>>       VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
>>>>>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
>>>>>> index cd53755b2..174dbacda 100644
>>>>>> --- a/tests/ovn-northd.at
>>>>>> +++ b/tests/ovn-northd.at
>>>>>> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 = 123])
>>>>>>   AT_CLEANUP
>>>>>>   ])
>>>>>>   +OVN_FOR_EACH_NORTHD_NO_HV([
>>>>>> +AT_SETUP([check tunnel ids exhaustion])
>>>>>> +ovn_start
>>>>>> +
>>>>>> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key
>>>>>> to 2^12
>>>>>> +ovn-sbctl \
>>>>>> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1"
>>>>>> type="vxlan" \
>>>>>> +    -- --id=@c create chassis name=hv1 encaps=@e
>>>>>> +
>>>>>> +cmd="ovn-nbctl --wait=sb"
>>>>>> +
>>>>>> +for i in {1..4097..1}; do
>>>>
>>>> This can be changed to:
>>>>
>>>> for i in {1..4097}; do
>>>>
>>>>>> +    cmd="${cmd} -- ls-add lsw-${i}"
>>>>>> +done
>>>>>> +
>>>>>> +eval $cmd
>>>>>> +
>>>>>> +check_row_count nb:Logical_Switch 4097
>>>>>> +wait_row_count sb:Datapath_Binding 4095
>>>>>> +
>>>>>> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted"
>>>>>> northd/ovn-northd.log])
>>>>>> +
>>>>>> +AT_CLEANUP
>>>>>> +])
>>>>>> +
>>>>>> +
>>>>>>   OVN_FOR_EACH_NORTHD_NO_HV([
>>>>>>   AT_SETUP([Logical Flow Datapath Groups])
>>>>>>   ovn_start
>>>>
>>>> Regards,
>>>> Dumitru
>>>>
>>>> _______________________________________________
>>>> dev mailing list
>>>> dev@openvswitch.org <mailto:dev@openvswitch.org>
>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>
>>>
>>> Regards,
>>> Vladislav Odintsov
>>>
>>>
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>
Ihar Hrachyshka April 4, 2024, 6:07 p.m. UTC | #7
On Thu, Apr 4, 2024 at 1:46 PM Dumitru Ceara <dceara@redhat.com> wrote:

> On 4/4/24 19:17, Ihar Hrachyshka wrote:
> > I tried to revert the util change and the test case passed just fine.
> >
>
> I had done that before pushing the patch but.. I got tricked by the fact
> that northd was spinning and using cpu 100% while the switches were
> added.  My bad.
>
> > I think the scenario that may get the hint out of bounds is 1) start with
> > no vxlan chassis; 2) create 4097 DPs; 3) add a vxlan chassis - this makes
> > northd downgrade its max key to 4096. Now when we create a DP, it will
> spin
> > in circles. Posted this here:
> >
> https://patchwork.ozlabs.org/project/ovn/patch/20240404171358.54678-1-ihrachys@redhat.com/
> >
> > (We can probably discuss in this context whether it's a good idea for a
> > cluster to change the max tun id value as chassis come and go. Perhaps it
> > should be initialized once and for all?)
> >
> > What I also notice is that with the new patch, *hint is always overridden
> > at the start of the function, so it's always bumped by 1. I am not sure
> it
> > was intended. Comments?
> >
>
> But the actual change in behavior for '*hint' is only for the case in
> which we run out of IDs, or am I missing something?  It didn't seem that
> big of a deal to me.
>

Yes, I also don't see a problem, but want the author to confirm if there's
a reason for that.


>
> > This is all probably relevant to the question of whether any backports
> > should happen for this patch.
> >
> > Ihar
> >
>
> Regards,
> Dumitru
>
> >
> > On Thu, Apr 4, 2024 at 12:46 PM Dumitru Ceara <dceara@redhat.com> wrote:
> >
> >> On 4/4/24 17:52, Vladislav Odintsov wrote:
> >>> Thanks Dumitru!
> >>> I’m totally fine with your change.
> >>> Should I send backport patches with resolved conflicts for remaining
> >> branches at least till 22.03, which is an LTS?
> >>>
> >>
> >> Well, 24.03 is the most recent LTS.  We don't really backport patches to
> >> 22.03 unless they fix critical issues.  I'm not completely convinced
> >> that this is such a critical issue though.  You need 4K logical
> >> datapaths with vxlan enabled before this gets hit.  In any case, Mark,
> >> what do you think?
> >>
> >> Regards,
> >> Dumitru
> >>
> >>>> On 4 Apr 2024, at 18:26, Dumitru Ceara <dceara@redhat.com> wrote:
> >>>>
> >>>> On 4/1/24 16:27, Mark Michelson wrote:
> >>>>> Thanks Vladislav,
> >>>>>
> >>>>> Acked-by: Mark Michelson <mmichels@redhat.com <mailto:
> >> mmichels@redhat.com>>
> >>>>>
> >>>>
> >>>> Thanks, Vladislav and Mark!  Applied to main and backported down to
> >>>> 23.06 with a minor test change, please see below.
> >>>>
> >>>>> On 4/1/24 08:15, Vladislav Odintsov wrote:
> >>>>>> In case if all tunnel ids are exhausted, ovn_allocate_tnlid()
> function
> >>>>>> iterates over tnlids indefinitely when *hint is outside of [min,
> max].
> >>>>>> This is because when tnlid reaches max, next tnlid is min and
> for-loop
> >>>>>> never reaches exit condition for tnlid != *hint.
> >>>>>>
> >>>>>> This patch fixes mentioned issue and adds a testcase.
> >>>>>>
> >>>>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
> >>>>>> ---
> >>>>>>   lib/ovn-util.c      | 10 +++++++---
> >>>>>>   tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
> >>>>>>   2 files changed, 33 insertions(+), 3 deletions(-)
> >>>>>>
> >>>>>> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
> >>>>>> index ee5cbcdc3..9f97ae2ca 100644
> >>>>>> --- a/lib/ovn-util.c
> >>>>>> +++ b/lib/ovn-util.c
> >>>>>> @@ -693,13 +693,17 @@ uint32_t
> >>>>>>   ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t
> min,
> >>>>>>                      uint32_t max, uint32_t *hint)
> >>>>>>   {
> >>>>>> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid !=
> >> *hint;
> >>>>>> -         tnlid = next_tnlid(tnlid, min, max)) {
> >>>>>> +    /* Normalize hint, because it can be outside of [min, max]. */
> >>>>>> +    *hint = next_tnlid(*hint, min, max);
> >>>>>> +
> >>>>>> +    uint32_t tnlid = *hint;
> >>>>>> +    do {
> >>>>>>           if (ovn_add_tnlid(set, tnlid)) {
> >>>>>>               *hint = tnlid;
> >>>>>>               return tnlid;
> >>>>>>           }
> >>>>>> -    }
> >>>>>> +        tnlid = next_tnlid(tnlid, min, max);
> >>>>>> +    } while (tnlid != *hint);
> >>>>>>         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1,
> 1);
> >>>>>>       VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
> >>>>>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> >>>>>> index cd53755b2..174dbacda 100644
> >>>>>> --- a/tests/ovn-northd.at
> >>>>>> +++ b/tests/ovn-northd.at
> >>>>>> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 =
> 123])
> >>>>>>   AT_CLEANUP
> >>>>>>   ])
> >>>>>>   +OVN_FOR_EACH_NORTHD_NO_HV([
> >>>>>> +AT_SETUP([check tunnel ids exhaustion])
> >>>>>> +ovn_start
> >>>>>> +
> >>>>>> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key
> >>>>>> to 2^12
> >>>>>> +ovn-sbctl \
> >>>>>> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1"
> >>>>>> type="vxlan" \
> >>>>>> +    -- --id=@c create chassis name=hv1 encaps=@e
> >>>>>> +
> >>>>>> +cmd="ovn-nbctl --wait=sb"
> >>>>>> +
> >>>>>> +for i in {1..4097..1}; do
> >>>>
> >>>> This can be changed to:
> >>>>
> >>>> for i in {1..4097}; do
> >>>>
> >>>>>> +    cmd="${cmd} -- ls-add lsw-${i}"
> >>>>>> +done
> >>>>>> +
> >>>>>> +eval $cmd
> >>>>>> +
> >>>>>> +check_row_count nb:Logical_Switch 4097
> >>>>>> +wait_row_count sb:Datapath_Binding 4095
> >>>>>> +
> >>>>>> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted"
> >>>>>> northd/ovn-northd.log])
> >>>>>> +
> >>>>>> +AT_CLEANUP
> >>>>>> +])
> >>>>>> +
> >>>>>> +
> >>>>>>   OVN_FOR_EACH_NORTHD_NO_HV([
> >>>>>>   AT_SETUP([Logical Flow Datapath Groups])
> >>>>>>   ovn_start
> >>>>
> >>>> Regards,
> >>>> Dumitru
> >>>>
> >>>> _______________________________________________
> >>>> dev mailing list
> >>>> dev@openvswitch.org <mailto:dev@openvswitch.org>
> >>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>>
> >>>
> >>> Regards,
> >>> Vladislav Odintsov
> >>>
> >>>
> >>
> >> _______________________________________________
> >> dev mailing list
> >> dev@openvswitch.org
> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>
> >
>
>
Vladislav Odintsov April 4, 2024, 7:33 p.m. UTC | #8
> On 4 Apr 2024, at 21:07, Ihar Hrachyshka <ihrachys@redhat.com> wrote:
> 
> On Thu, Apr 4, 2024 at 1:46 PM Dumitru Ceara <dceara@redhat.com <mailto:dceara@redhat.com>> wrote:
>> On 4/4/24 19:17, Ihar Hrachyshka wrote:
>> > I tried to revert the util change and the test case passed just fine.
>> > 
>> 
>> I had done that before pushing the patch but.. I got tricked by the fact
>> that northd was spinning and using cpu 100% while the switches were
>> added.  My bad.
>> 
>> > I think the scenario that may get the hint out of bounds is 1) start with
>> > no vxlan chassis; 2) create 4097 DPs; 3) add a vxlan chassis - this makes
>> > northd downgrade its max key to 4096. Now when we create a DP, it will spin
>> > in circles. Posted this here:
>> > https://patchwork.ozlabs.org/project/ovn/patch/20240404171358.54678-1-ihrachys@redhat.com/
>> > 

Nice catch! Thanks for the patch!

>> > (We can probably discuss in this context whether it's a good idea for a
>> > cluster to change the max tun id value as chassis come and go. Perhaps it
>> > should be initialized once and for all?)
>> > 
>> > What I also notice is that with the new patch, *hint is always overridden
>> > at the start of the function, so it's always bumped by 1. I am not sure it
>> > was intended. Comments?
>> > 
>> 
>> But the actual change in behavior for '*hint' is only for the case in
>> which we run out of IDs, or am I missing something?  It didn't seem that
>> big of a deal to me.
> 
> Yes, I also don't see a problem, but want the author to confirm if there's a reason for that.

I’ve just revised the code again and see that for the case, where *hint = 0 and min=10, max=20 this still will not work.
However I’m not sure if this must be fixed, while there are no such cases for now.
What do you think?

*hint bump every time in normal situation (where we have enough available IDs) should be safe because it has similar behaviour to previous implementation.
First, tnlid was set to *<initial hint value> + 1 and then *hint was set by current tnlid.
It seems the same to me. Am I missing something?

>  
>> 
>> > This is all probably relevant to the question of whether any backports
>> > should happen for this patch.
>> > 
>> > Ihar
>> > 
>> 
>> Regards,
>> Dumitru
>> 
>> > 
>> > On Thu, Apr 4, 2024 at 12:46 PM Dumitru Ceara <dceara@redhat.com <mailto:dceara@redhat.com>> wrote:
>> > 
>> >> On 4/4/24 17:52, Vladislav Odintsov wrote:
>> >>> Thanks Dumitru!
>> >>> I’m totally fine with your change.
>> >>> Should I send backport patches with resolved conflicts for remaining
>> >> branches at least till 22.03, which is an LTS?
>> >>>
>> >>
>> >> Well, 24.03 is the most recent LTS.  We don't really backport patches to
>> >> 22.03 unless they fix critical issues.  I'm not completely convinced
>> >> that this is such a critical issue though.  You need 4K logical
>> >> datapaths with vxlan enabled before this gets hit.  In any case, Mark,
>> >> what do you think?
>> >>
>> >> Regards,
>> >> Dumitru
>> >>
>> >>>> On 4 Apr 2024, at 18:26, Dumitru Ceara <dceara@redhat.com <mailto:dceara@redhat.com>> wrote:
>> >>>>
>> >>>> On 4/1/24 16:27, Mark Michelson wrote:
>> >>>>> Thanks Vladislav,
>> >>>>>
>> >>>>> Acked-by: Mark Michelson <mmichels@redhat.com <mailto:mmichels@redhat.com> <mailto:
>> >> mmichels@redhat.com <mailto:mmichels@redhat.com>>>
>> >>>>>
>> >>>>
>> >>>> Thanks, Vladislav and Mark!  Applied to main and backported down to
>> >>>> 23.06 with a minor test change, please see below.
>> >>>>
>> >>>>> On 4/1/24 08:15, Vladislav Odintsov wrote:
>> >>>>>> In case if all tunnel ids are exhausted, ovn_allocate_tnlid() function
>> >>>>>> iterates over tnlids indefinitely when *hint is outside of [min, max].
>> >>>>>> This is because when tnlid reaches max, next tnlid is min and for-loop
>> >>>>>> never reaches exit condition for tnlid != *hint.
>> >>>>>>
>> >>>>>> This patch fixes mentioned issue and adds a testcase.
>> >>>>>>
>> >>>>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com <mailto:odivlad@gmail.com>>
>> >>>>>> ---
>> >>>>>>   lib/ovn-util.c      | 10 +++++++---
>> >>>>>>   tests/ovn-northd.at <http://ovn-northd.at/> | 26 ++++++++++++++++++++++++++
>> >>>>>>   2 files changed, 33 insertions(+), 3 deletions(-)
>> >>>>>>
>> >>>>>> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
>> >>>>>> index ee5cbcdc3..9f97ae2ca 100644
>> >>>>>> --- a/lib/ovn-util.c
>> >>>>>> +++ b/lib/ovn-util.c
>> >>>>>> @@ -693,13 +693,17 @@ uint32_t
>> >>>>>>   ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
>> >>>>>>                      uint32_t max, uint32_t *hint)
>> >>>>>>   {
>> >>>>>> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid !=
>> >> *hint;
>> >>>>>> -         tnlid = next_tnlid(tnlid, min, max)) {
>> >>>>>> +    /* Normalize hint, because it can be outside of [min, max]. */
>> >>>>>> +    *hint = next_tnlid(*hint, min, max);
>> >>>>>> +
>> >>>>>> +    uint32_t tnlid = *hint;
>> >>>>>> +    do {
>> >>>>>>           if (ovn_add_tnlid(set, tnlid)) {
>> >>>>>>               *hint = tnlid;
>> >>>>>>               return tnlid;
>> >>>>>>           }
>> >>>>>> -    }
>> >>>>>> +        tnlid = next_tnlid(tnlid, min, max);
>> >>>>>> +    } while (tnlid != *hint);
>> >>>>>>         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>> >>>>>>       VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
>> >>>>>> diff --git a/tests/ovn-northd.at <http://ovn-northd.at/> b/tests/ovn-northd.at <http://ovn-northd.at/>
>> >>>>>> index cd53755b2..174dbacda 100644
>> >>>>>> --- a/tests/ovn-northd.at <http://ovn-northd.at/>
>> >>>>>> +++ b/tests/ovn-northd.at <http://ovn-northd.at/>
>> >>>>>> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 = 123])
>> >>>>>>   AT_CLEANUP
>> >>>>>>   ])
>> >>>>>>   +OVN_FOR_EACH_NORTHD_NO_HV([
>> >>>>>> +AT_SETUP([check tunnel ids exhaustion])
>> >>>>>> +ovn_start
>> >>>>>> +
>> >>>>>> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key
>> >>>>>> to 2^12
>> >>>>>> +ovn-sbctl \
>> >>>>>> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1"
>> >>>>>> type="vxlan" \
>> >>>>>> +    -- --id=@c create chassis name=hv1 encaps=@e
>> >>>>>> +
>> >>>>>> +cmd="ovn-nbctl --wait=sb"
>> >>>>>> +
>> >>>>>> +for i in {1..4097..1}; do
>> >>>>
>> >>>> This can be changed to:
>> >>>>
>> >>>> for i in {1..4097}; do
>> >>>>
>> >>>>>> +    cmd="${cmd} -- ls-add lsw-${i}"
>> >>>>>> +done
>> >>>>>> +
>> >>>>>> +eval $cmd
>> >>>>>> +
>> >>>>>> +check_row_count nb:Logical_Switch 4097
>> >>>>>> +wait_row_count sb:Datapath_Binding 4095
>> >>>>>> +
>> >>>>>> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted"
>> >>>>>> northd/ovn-northd.log])
>> >>>>>> +
>> >>>>>> +AT_CLEANUP
>> >>>>>> +])
>> >>>>>> +
>> >>>>>> +
>> >>>>>>   OVN_FOR_EACH_NORTHD_NO_HV([
>> >>>>>>   AT_SETUP([Logical Flow Datapath Groups])
>> >>>>>>   ovn_start
>> >>>>
>> >>>> Regards,
>> >>>> Dumitru
>> >>>>
>> >>>> _______________________________________________
>> >>>> dev mailing list
>> >>>> dev@openvswitch.org <mailto:dev@openvswitch.org> <mailto:dev@openvswitch.org <mailto:dev@openvswitch.org>>
>> >>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>> >>>
>> >>>
>> >>> Regards,
>> >>> Vladislav Odintsov
>> >>>
>> >>>
>> >>
>> >> _______________________________________________
>> >> dev mailing list
>> >> dev@openvswitch.org <mailto:dev@openvswitch.org>
>> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>> >>
>> > 


Regards,
Vladislav Odintsov
Mark Michelson April 4, 2024, 7:51 p.m. UTC | #9
On 4/4/24 12:46, Dumitru Ceara wrote:
> On 4/4/24 17:52, Vladislav Odintsov wrote:
>> Thanks Dumitru!
>> I’m totally fine with your change.
>> Should I send backport patches with resolved conflicts for remaining branches at least till 22.03, which is an LTS?
>>
> 
> Well, 24.03 is the most recent LTS.  We don't really backport patches to
> 22.03 unless they fix critical issues.  I'm not completely convinced
> that this is such a critical issue though.  You need 4K logical
> datapaths with vxlan enabled before this gets hit.  In any case, Mark,
> what do you think?

I don't think this needs backporting down to 22.03.

> 
> Regards,
> Dumitru
> 
>>> On 4 Apr 2024, at 18:26, Dumitru Ceara <dceara@redhat.com> wrote:
>>>
>>> On 4/1/24 16:27, Mark Michelson wrote:
>>>> Thanks Vladislav,
>>>>
>>>> Acked-by: Mark Michelson <mmichels@redhat.com <mailto:mmichels@redhat.com>>
>>>>
>>>
>>> Thanks, Vladislav and Mark!  Applied to main and backported down to
>>> 23.06 with a minor test change, please see below.
>>>
>>>> On 4/1/24 08:15, Vladislav Odintsov wrote:
>>>>> In case if all tunnel ids are exhausted, ovn_allocate_tnlid() function
>>>>> iterates over tnlids indefinitely when *hint is outside of [min, max].
>>>>> This is because when tnlid reaches max, next tnlid is min and for-loop
>>>>> never reaches exit condition for tnlid != *hint.
>>>>>
>>>>> This patch fixes mentioned issue and adds a testcase.
>>>>>
>>>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
>>>>> ---
>>>>>    lib/ovn-util.c      | 10 +++++++---
>>>>>    tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
>>>>>    2 files changed, 33 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
>>>>> index ee5cbcdc3..9f97ae2ca 100644
>>>>> --- a/lib/ovn-util.c
>>>>> +++ b/lib/ovn-util.c
>>>>> @@ -693,13 +693,17 @@ uint32_t
>>>>>    ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
>>>>>                       uint32_t max, uint32_t *hint)
>>>>>    {
>>>>> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid != *hint;
>>>>> -         tnlid = next_tnlid(tnlid, min, max)) {
>>>>> +    /* Normalize hint, because it can be outside of [min, max]. */
>>>>> +    *hint = next_tnlid(*hint, min, max);
>>>>> +
>>>>> +    uint32_t tnlid = *hint;
>>>>> +    do {
>>>>>            if (ovn_add_tnlid(set, tnlid)) {
>>>>>                *hint = tnlid;
>>>>>                return tnlid;
>>>>>            }
>>>>> -    }
>>>>> +        tnlid = next_tnlid(tnlid, min, max);
>>>>> +    } while (tnlid != *hint);
>>>>>          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>>>>>        VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
>>>>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
>>>>> index cd53755b2..174dbacda 100644
>>>>> --- a/tests/ovn-northd.at
>>>>> +++ b/tests/ovn-northd.at
>>>>> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 = 123])
>>>>>    AT_CLEANUP
>>>>>    ])
>>>>>    +OVN_FOR_EACH_NORTHD_NO_HV([
>>>>> +AT_SETUP([check tunnel ids exhaustion])
>>>>> +ovn_start
>>>>> +
>>>>> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key
>>>>> to 2^12
>>>>> +ovn-sbctl \
>>>>> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1"
>>>>> type="vxlan" \
>>>>> +    -- --id=@c create chassis name=hv1 encaps=@e
>>>>> +
>>>>> +cmd="ovn-nbctl --wait=sb"
>>>>> +
>>>>> +for i in {1..4097..1}; do
>>>
>>> This can be changed to:
>>>
>>> for i in {1..4097}; do
>>>
>>>>> +    cmd="${cmd} -- ls-add lsw-${i}"
>>>>> +done
>>>>> +
>>>>> +eval $cmd
>>>>> +
>>>>> +check_row_count nb:Logical_Switch 4097
>>>>> +wait_row_count sb:Datapath_Binding 4095
>>>>> +
>>>>> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted"
>>>>> northd/ovn-northd.log])
>>>>> +
>>>>> +AT_CLEANUP
>>>>> +])
>>>>> +
>>>>> +
>>>>>    OVN_FOR_EACH_NORTHD_NO_HV([
>>>>>    AT_SETUP([Logical Flow Datapath Groups])
>>>>>    ovn_start
>>>
>>> Regards,
>>> Dumitru
>>>
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org <mailto:dev@openvswitch.org>
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>>
>> Regards,
>> Vladislav Odintsov
>>
>>
>
Vladislav Odintsov April 4, 2024, 8:02 p.m. UTC | #10
> On 4 Apr 2024, at 22:51, Mark Michelson <mmichels@redhat.com> wrote:
> 
> On 4/4/24 12:46, Dumitru Ceara wrote:
>> On 4/4/24 17:52, Vladislav Odintsov wrote:
>>> Thanks Dumitru!
>>> I’m totally fine with your change.
>>> Should I send backport patches with resolved conflicts for remaining branches at least till 22.03, which is an LTS?
>>> 
>> Well, 24.03 is the most recent LTS.  We don't really backport patches to
>> 22.03 unless they fix critical issues.  I'm not completely convinced
>> that this is such a critical issue though.  You need 4K logical
>> datapaths with vxlan enabled before this gets hit.  In any case, Mark,
>> what do you think?
> 
> I don't think this needs backporting down to 22.03.

I just wanted to mention that to reproduce this bug it is only enough to have at least one chassis with vxlan encap and create 4096 LSs/LRs.
If the problem is triggered, ovn-northd starts consuming 100% CPU and hangs (doesn’t process any change) until excess LS/LR is removed and northd is restarted.
I can submit backport patches for old branches if needed (already rebased).

> 
>> Regards,
>> Dumitru
>>>> On 4 Apr 2024, at 18:26, Dumitru Ceara <dceara@redhat.com> wrote:
>>>> 
>>>> On 4/1/24 16:27, Mark Michelson wrote:
>>>>> Thanks Vladislav,
>>>>> 
>>>>> Acked-by: Mark Michelson <mmichels@redhat.com <mailto:mmichels@redhat.com>>
>>>>> 
>>>> 
>>>> Thanks, Vladislav and Mark!  Applied to main and backported down to
>>>> 23.06 with a minor test change, please see below.
>>>> 
>>>>> On 4/1/24 08:15, Vladislav Odintsov wrote:
>>>>>> In case if all tunnel ids are exhausted, ovn_allocate_tnlid() function
>>>>>> iterates over tnlids indefinitely when *hint is outside of [min, max].
>>>>>> This is because when tnlid reaches max, next tnlid is min and for-loop
>>>>>> never reaches exit condition for tnlid != *hint.
>>>>>> 
>>>>>> This patch fixes mentioned issue and adds a testcase.
>>>>>> 
>>>>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
>>>>>> ---
>>>>>>   lib/ovn-util.c      | 10 +++++++---
>>>>>>   tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
>>>>>>   2 files changed, 33 insertions(+), 3 deletions(-)
>>>>>> 
>>>>>> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
>>>>>> index ee5cbcdc3..9f97ae2ca 100644
>>>>>> --- a/lib/ovn-util.c
>>>>>> +++ b/lib/ovn-util.c
>>>>>> @@ -693,13 +693,17 @@ uint32_t
>>>>>>   ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
>>>>>>                      uint32_t max, uint32_t *hint)
>>>>>>   {
>>>>>> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid != *hint;
>>>>>> -         tnlid = next_tnlid(tnlid, min, max)) {
>>>>>> +    /* Normalize hint, because it can be outside of [min, max]. */
>>>>>> +    *hint = next_tnlid(*hint, min, max);
>>>>>> +
>>>>>> +    uint32_t tnlid = *hint;
>>>>>> +    do {
>>>>>>           if (ovn_add_tnlid(set, tnlid)) {
>>>>>>               *hint = tnlid;
>>>>>>               return tnlid;
>>>>>>           }
>>>>>> -    }
>>>>>> +        tnlid = next_tnlid(tnlid, min, max);
>>>>>> +    } while (tnlid != *hint);
>>>>>>         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>>>>>>       VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
>>>>>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
>>>>>> index cd53755b2..174dbacda 100644
>>>>>> --- a/tests/ovn-northd.at
>>>>>> +++ b/tests/ovn-northd.at
>>>>>> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 = 123])
>>>>>>   AT_CLEANUP
>>>>>>   ])
>>>>>>   +OVN_FOR_EACH_NORTHD_NO_HV([
>>>>>> +AT_SETUP([check tunnel ids exhaustion])
>>>>>> +ovn_start
>>>>>> +
>>>>>> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key
>>>>>> to 2^12
>>>>>> +ovn-sbctl \
>>>>>> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1"
>>>>>> type="vxlan" \
>>>>>> +    -- --id=@c create chassis name=hv1 encaps=@e
>>>>>> +
>>>>>> +cmd="ovn-nbctl --wait=sb"
>>>>>> +
>>>>>> +for i in {1..4097..1}; do
>>>> 
>>>> This can be changed to:
>>>> 
>>>> for i in {1..4097}; do
>>>> 
>>>>>> +    cmd="${cmd} -- ls-add lsw-${i}"
>>>>>> +done
>>>>>> +
>>>>>> +eval $cmd
>>>>>> +
>>>>>> +check_row_count nb:Logical_Switch 4097
>>>>>> +wait_row_count sb:Datapath_Binding 4095
>>>>>> +
>>>>>> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted"
>>>>>> northd/ovn-northd.log])
>>>>>> +
>>>>>> +AT_CLEANUP
>>>>>> +])
>>>>>> +
>>>>>> +
>>>>>>   OVN_FOR_EACH_NORTHD_NO_HV([
>>>>>>   AT_SETUP([Logical Flow Datapath Groups])
>>>>>>   ovn_start
>>>> 
>>>> Regards,
>>>> Dumitru
>>>> 
>>>> _______________________________________________
>>>> dev mailing list
>>>> dev@openvswitch.org <mailto:dev@openvswitch.org>
>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>> 
>>> 
>>> Regards,
>>> Vladislav Odintsov
>>> 
>>> 
> 


Regards,
Vladislav Odintsov
Ihar Hrachyshka April 5, 2024, 4:05 p.m. UTC | #11
On Thu, Apr 4, 2024 at 4:02 PM Vladislav Odintsov <odivlad@gmail.com> wrote:

>
>
> > On 4 Apr 2024, at 22:51, Mark Michelson <mmichels@redhat.com> wrote:
> >
> > On 4/4/24 12:46, Dumitru Ceara wrote:
> >> On 4/4/24 17:52, Vladislav Odintsov wrote:
> >>> Thanks Dumitru!
> >>> I’m totally fine with your change.
> >>> Should I send backport patches with resolved conflicts for remaining
> branches at least till 22.03, which is an LTS?
> >>>
> >> Well, 24.03 is the most recent LTS.  We don't really backport patches to
> >> 22.03 unless they fix critical issues.  I'm not completely convinced
> >> that this is such a critical issue though.  You need 4K logical
> >> datapaths with vxlan enabled before this gets hit.  In any case, Mark,
> >> what do you think?
> >
> > I don't think this needs backporting down to 22.03.
>
> I just wanted to mention that to reproduce this bug it is only enough to
> have at least one chassis with vxlan encap and create 4096 LSs/LRs.
>

Or: 4096... no, 2048! LSPs in a single network. (Because only 11 bits are
allowed for unicast tunnel keys.)


> If the problem is triggered, ovn-northd starts consuming 100% CPU and
> hangs (doesn’t process any change) until excess LS/LR is removed and northd
> is restarted.
> I can submit backport patches for old branches if needed (already rebased).
>

I also think this should be fixed in all supported branches.


>
> >
> >> Regards,
> >> Dumitru
> >>>> On 4 Apr 2024, at 18:26, Dumitru Ceara <dceara@redhat.com> wrote:
> >>>>
> >>>> On 4/1/24 16:27, Mark Michelson wrote:
> >>>>> Thanks Vladislav,
> >>>>>
> >>>>> Acked-by: Mark Michelson <mmichels@redhat.com <mailto:
> mmichels@redhat.com>>
> >>>>>
> >>>>
> >>>> Thanks, Vladislav and Mark!  Applied to main and backported down to
> >>>> 23.06 with a minor test change, please see below.
> >>>>
> >>>>> On 4/1/24 08:15, Vladislav Odintsov wrote:
> >>>>>> In case if all tunnel ids are exhausted, ovn_allocate_tnlid()
> function
> >>>>>> iterates over tnlids indefinitely when *hint is outside of [min,
> max].
> >>>>>> This is because when tnlid reaches max, next tnlid is min and
> for-loop
> >>>>>> never reaches exit condition for tnlid != *hint.
> >>>>>>
> >>>>>> This patch fixes mentioned issue and adds a testcase.
> >>>>>>
> >>>>>> Signed-off-by: Vladislav Odintsov <odivlad@gmail.com>
> >>>>>> ---
> >>>>>>   lib/ovn-util.c      | 10 +++++++---
> >>>>>>   tests/ovn-northd.at | 26 ++++++++++++++++++++++++++
> >>>>>>   2 files changed, 33 insertions(+), 3 deletions(-)
> >>>>>>
> >>>>>> diff --git a/lib/ovn-util.c b/lib/ovn-util.c
> >>>>>> index ee5cbcdc3..9f97ae2ca 100644
> >>>>>> --- a/lib/ovn-util.c
> >>>>>> +++ b/lib/ovn-util.c
> >>>>>> @@ -693,13 +693,17 @@ uint32_t
> >>>>>>   ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t
> min,
> >>>>>>                      uint32_t max, uint32_t *hint)
> >>>>>>   {
> >>>>>> -    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid !=
> *hint;
> >>>>>> -         tnlid = next_tnlid(tnlid, min, max)) {
> >>>>>> +    /* Normalize hint, because it can be outside of [min, max]. */
> >>>>>> +    *hint = next_tnlid(*hint, min, max);
> >>>>>> +
> >>>>>> +    uint32_t tnlid = *hint;
> >>>>>> +    do {
> >>>>>>           if (ovn_add_tnlid(set, tnlid)) {
> >>>>>>               *hint = tnlid;
> >>>>>>               return tnlid;
> >>>>>>           }
> >>>>>> -    }
> >>>>>> +        tnlid = next_tnlid(tnlid, min, max);
> >>>>>> +    } while (tnlid != *hint);
> >>>>>>         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1,
> 1);
> >>>>>>       VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
> >>>>>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> >>>>>> index cd53755b2..174dbacda 100644
> >>>>>> --- a/tests/ovn-northd.at
> >>>>>> +++ b/tests/ovn-northd.at
> >>>>>> @@ -2822,6 +2822,32 @@ AT_CHECK([test $lsp02 = 3 && test $ls1 =
> 123])
> >>>>>>   AT_CLEANUP
> >>>>>>   ])
> >>>>>>   +OVN_FOR_EACH_NORTHD_NO_HV([
> >>>>>> +AT_SETUP([check tunnel ids exhaustion])
> >>>>>> +ovn_start
> >>>>>> +
> >>>>>> +# Create a fake chassis with vxlan encap to lower MAX DP tunnel key
> >>>>>> to 2^12
> >>>>>> +ovn-sbctl \
> >>>>>> +    --id=@e create encap chassis_name=hv1 ip="192.168.0.1"
> >>>>>> type="vxlan" \
> >>>>>> +    -- --id=@c create chassis name=hv1 encaps=@e
> >>>>>> +
> >>>>>> +cmd="ovn-nbctl --wait=sb"
> >>>>>> +
> >>>>>> +for i in {1..4097..1}; do
> >>>>
> >>>> This can be changed to:
> >>>>
> >>>> for i in {1..4097}; do
> >>>>
> >>>>>> +    cmd="${cmd} -- ls-add lsw-${i}"
> >>>>>> +done
> >>>>>> +
> >>>>>> +eval $cmd
> >>>>>> +
> >>>>>> +check_row_count nb:Logical_Switch 4097
> >>>>>> +wait_row_count sb:Datapath_Binding 4095
> >>>>>> +
> >>>>>> +OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted"
> >>>>>> northd/ovn-northd.log])
> >>>>>> +
> >>>>>> +AT_CLEANUP
> >>>>>> +])
> >>>>>> +
> >>>>>> +
> >>>>>>   OVN_FOR_EACH_NORTHD_NO_HV([
> >>>>>>   AT_SETUP([Logical Flow Datapath Groups])
> >>>>>>   ovn_start
> >>>>
> >>>> Regards,
> >>>> Dumitru
> >>>>
> >>>> _______________________________________________
> >>>> dev mailing list
> >>>> dev@openvswitch.org <mailto:dev@openvswitch.org>
> >>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>>
> >>>
> >>> Regards,
> >>> Vladislav Odintsov
> >>>
> >>>
> >
>
>
> Regards,
> Vladislav Odintsov
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
diff mbox series

Patch

diff --git a/lib/ovn-util.c b/lib/ovn-util.c
index ee5cbcdc3..9f97ae2ca 100644
--- a/lib/ovn-util.c
+++ b/lib/ovn-util.c
@@ -693,13 +693,17 @@  uint32_t
 ovn_allocate_tnlid(struct hmap *set, const char *name, uint32_t min,
                    uint32_t max, uint32_t *hint)
 {
-    for (uint32_t tnlid = next_tnlid(*hint, min, max); tnlid != *hint;
-         tnlid = next_tnlid(tnlid, min, max)) {
+    /* Normalize hint, because it can be outside of [min, max]. */
+    *hint = next_tnlid(*hint, min, max);
+
+    uint32_t tnlid = *hint;
+    do {
         if (ovn_add_tnlid(set, tnlid)) {
             *hint = tnlid;
             return tnlid;
         }
-    }
+        tnlid = next_tnlid(tnlid, min, max);
+    } while (tnlid != *hint);
 
     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
     VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index cd53755b2..174dbacda 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -2822,6 +2822,32 @@  AT_CHECK([test $lsp02 = 3 && test $ls1 = 123])
 AT_CLEANUP
 ])
 
+OVN_FOR_EACH_NORTHD_NO_HV([
+AT_SETUP([check tunnel ids exhaustion])
+ovn_start
+
+# Create a fake chassis with vxlan encap to lower MAX DP tunnel key to 2^12
+ovn-sbctl \
+    --id=@e create encap chassis_name=hv1 ip="192.168.0.1" type="vxlan" \
+    -- --id=@c create chassis name=hv1 encaps=@e
+
+cmd="ovn-nbctl --wait=sb"
+
+for i in {1..4097..1}; do
+    cmd="${cmd} -- ls-add lsw-${i}"
+done
+
+eval $cmd
+
+check_row_count nb:Logical_Switch 4097
+wait_row_count sb:Datapath_Binding 4095
+
+OVS_WAIT_UNTIL([grep "all datapath tunnel ids exhausted" northd/ovn-northd.log])
+
+AT_CLEANUP
+])
+
+
 OVN_FOR_EACH_NORTHD_NO_HV([
 AT_SETUP([Logical Flow Datapath Groups])
 ovn_start