diff mbox series

[ovs-dev,v5,3/3] conntrack: limit port clash resolution attempts

Message ID 1631968964-11390-4-git-send-email-wenxu@ucloud.cn
State Superseded
Headers show
Series conntrack port selection optimization | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test fail github build: failed

Commit Message

wenxu Sept. 18, 2021, 12:42 p.m. UTC
From: wenxu <wenxu@ucloud.cn>

In case almost or all available ports are taken, clash resolution can
take a very long time, resulting in pmd hang in conntrack.

This can happen when many to-be-natted hosts connect to same
destination:port (e.g. a proxy) and all connections pass the same SNAT.

Pick a random offset in the acceptable range, then try ever smaller
number of adjacent port numbers, until either the limit is reached or a
useable port was found.  This results in at most 248 attempts
(128 + 64 + 32 + 16 + 8, i.e. 4 restarts with new search offset)
instead of 64000+.

And if thenumber of ip address will limit the max attempts and which
will lead the total attempts under 248.

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 lib/conntrack.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

Comments

Paolo Valerio Oct. 4, 2021, 10:04 p.m. UTC | #1
not a full review, but see some questions/remarks below.

wenxu@ucloud.cn writes:

> From: wenxu <wenxu@ucloud.cn>
>
> In case almost or all available ports are taken, clash resolution can
> take a very long time, resulting in pmd hang in conntrack.
>
> This can happen when many to-be-natted hosts connect to same
> destination:port (e.g. a proxy) and all connections pass the same SNAT.
>
> Pick a random offset in the acceptable range, then try ever smaller
> number of adjacent port numbers, until either the limit is reached or a
> useable port was found.  This results in at most 248 attempts
> (128 + 64 + 32 + 16 + 8, i.e. 4 restarts with new search offset)
> instead of 64000+.
>
> And if thenumber of ip address will limit the max attempts and which
> will lead the total attempts under 248.
>
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> ---
>  lib/conntrack.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 43 insertions(+), 4 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 52b5211..02bc2b9 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -2427,7 +2427,11 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
>      uint16_t min_sport, max_sport, curr_sport, orig_sport;
>      bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
>                       conn->key.nw_proto == IPPROTO_UDP;
> +    unsigned int attempts, max_attempts, min_attempts;
>      uint16_t min_dport, max_dport, curr_dport;
> +    uint16_t range_src, range_dst, range_max;
> +    uint32_t range_addr;
> +    unsigned int i;
>  
>      min_addr = nat_info->min_addr;
>      max_addr = nat_info->max_addr;
> @@ -2444,6 +2448,19 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
>      set_dport_range(nat_info, &conn->key, hash, &curr_dport,
>                      &min_dport, &max_dport);
>  
> +    range_src = max_sport - min_sport + 1;
> +    range_dst = max_dport - min_dport + 1;
> +    range_max = range_src > range_dst ? range_src : range_dst;
> +    range_addr = ntohl(max_addr.ipv4) - ntohl(min_addr.ipv4) + 1;

what about ipv6?

> +    max_attempts = 128 / range_addr;
> +    if (max_attempts < 1) {
> +        max_attempts = 1;
> +    }
> +    min_attempts = 16 / range_addr;
> +    if (min_attempts < 2) {
> +        min_attempts = 2;
> +    }
> +
>  another_round:
>      store_addr_to_key(&curr_addr, &nat_conn->rev_key,
>                        nat_info->nat_action);
> @@ -2459,17 +2476,39 @@ another_round:
>  
>      curr_sport = orig_sport;
>  
> +    attempts = range_max;
> +    if (attempts > max_attempts) {
> +        attempts = max_attempts;
> +    }
> +
> +another_port_round:
> +    i = 0;
>      FOR_EACH_PORT_IN_RANGE(curr_dport, min_dport, max_dport) {
>          nat_conn->rev_key.src.port = htons(curr_dport);
>          FOR_EACH_PORT_IN_RANGE(curr_sport, min_sport, max_sport) {
> -            nat_conn->rev_key.dst.port = htons(curr_sport);
> -            if (!conn_lookup(ct, &nat_conn->rev_key,
> -                             time_msec(), NULL, NULL)) {
> -                return true;
> +            if (i++ < attempts) {
> +                nat_conn->rev_key.dst.port = htons(curr_sport);
> +                if (!conn_lookup(ct, &nat_conn->rev_key,
> +                                 time_msec(), NULL, NULL)) {
> +                    return true;
> +                }
> +            } else {
> +                goto next_attempts;
>              }
>          }
>      }

>  
> +next_attempts:
> +    if (attempts >= range_max || attempts < min_attempts) {
> +        goto next_addr;
> +    }
> +
> +    attempts /= 2;
> +    curr_dport = min_dport + (random_uint32() % range_dst);
> +    curr_sport = min_sport + (random_uint32() % range_src);
> +
> +    goto another_port_round;
> +

here you should be able to spare the "goto next_addr" changing the
condition of the if and enclosing the four lines above

Just to resume some points raised in previous emails:

I think that your remark about splitting the two port range iterations
instead of keeping it nested could be a good idea.
The first visible difference here is that in such case the dst port (of
course, in case of DNAT) range would have precedence over the src
manipulation in the resolution.  The second is that we could double the
number of attempts, but that should not be a big deal if we keep the
maximum number reasonable.
This is what the kernel does, and IMO it could fit our needs too.

The second remark is about IP the range iteration. I wonder if we really
need it at all for pat_protos. I may be wrong here, but it seems to me
that the kernel doesn't do that.

It would be also nice if we have a chance to discuss about those points
before proceeding, so every attempt to chime in/expand/object is
welcome.


>      /* Check if next IP is in range and respin. Otherwise, notify
>       * exhaustion to the caller. */
>  next_addr:
> -- 
> 1.8.3.1
wenxu Oct. 6, 2021, 2:10 p.m. UTC | #2
From: Paolo Valerio <pvalerio@redhat.com>
Date: 2021-10-05 06:04:57
To:  wenxu@ucloud.cn,i.maximets@ovn.org,aconole@redhat.com
Cc:  dev@openvswitch.org
Subject: Re: [PATCH v5 3/3] conntrack: limit port clash resolution attempts>not a full review, but see some questions/remarks below.
>
>wenxu@ucloud.cn writes:
>
>> From: wenxu <wenxu@ucloud.cn>
>>
>> In case almost or all available ports are taken, clash resolution can
>> take a very long time, resulting in pmd hang in conntrack.
>>
>> This can happen when many to-be-natted hosts connect to same
>> destination:port (e.g. a proxy) and all connections pass the same SNAT.
>>
>> Pick a random offset in the acceptable range, then try ever smaller
>> number of adjacent port numbers, until either the limit is reached or a
>> useable port was found.  This results in at most 248 attempts
>> (128 + 64 + 32 + 16 + 8, i.e. 4 restarts with new search offset)
>> instead of 64000+.
>>
>> And if thenumber of ip address will limit the max attempts and which
>> will lead the total attempts under 248.
>>
>> Signed-off-by: wenxu <wenxu@ucloud.cn>
>> ---
>>  lib/conntrack.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 43 insertions(+), 4 deletions(-)
>>
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> index 52b5211..02bc2b9 100644
>> --- a/lib/conntrack.c
>> +++ b/lib/conntrack.c
>> @@ -2427,7 +2427,11 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
>>      uint16_t min_sport, max_sport, curr_sport, orig_sport;
>>      bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
>>                       conn->key.nw_proto == IPPROTO_UDP;
>> +    unsigned int attempts, max_attempts, min_attempts;
>>      uint16_t min_dport, max_dport, curr_dport;
>> +    uint16_t range_src, range_dst, range_max;
>> +    uint32_t range_addr;
>> +    unsigned int i;
>>  
>>      min_addr = nat_info->min_addr;
>>      max_addr = nat_info->max_addr;
>> @@ -2444,6 +2448,19 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
>>      set_dport_range(nat_info, &conn->key, hash, &curr_dport,
>>                      &min_dport, &max_dport);
>>  
>> +    range_src = max_sport - min_sport + 1;
>> +    range_dst = max_dport - min_dport + 1;
>> +    range_max = range_src > range_dst ? range_src : range_dst;
>> +    range_addr = ntohl(max_addr.ipv4) - ntohl(min_addr.ipv4) + 1;
>
>what about ipv6?
>
>> +    max_attempts = 128 / range_addr;
>> +    if (max_attempts < 1) {
>> +        max_attempts = 1;
>> +    }
>> +    min_attempts = 16 / range_addr;
>> +    if (min_attempts < 2) {
>> +        min_attempts = 2;
>> +    }
>> +
>>  another_round:
>>      store_addr_to_key(&curr_addr, &nat_conn->rev_key,
>>                        nat_info->nat_action);
>> @@ -2459,17 +2476,39 @@ another_round:
>>  
>>      curr_sport = orig_sport;
>>  
>> +    attempts = range_max;
>> +    if (attempts > max_attempts) {
>> +        attempts = max_attempts;
>> +    }
>> +
>> +another_port_round:
>> +    i = 0;
>>      FOR_EACH_PORT_IN_RANGE(curr_dport, min_dport, max_dport) {
>>          nat_conn->rev_key.src.port = htons(curr_dport);
>>          FOR_EACH_PORT_IN_RANGE(curr_sport, min_sport, max_sport) {
>> -            nat_conn->rev_key.dst.port = htons(curr_sport);
>> -            if (!conn_lookup(ct, &nat_conn->rev_key,
>> -                             time_msec(), NULL, NULL)) {
>> -                return true;
>> +            if (i++ < attempts) {
>> +                nat_conn->rev_key.dst.port = htons(curr_sport);
>> +                if (!conn_lookup(ct, &nat_conn->rev_key,
>> +                                 time_msec(), NULL, NULL)) {
>> +                    return true;
>> +                }
>> +            } else {
>> +                goto next_attempts;
>>              }
>>          }
>>      }
>
>>  
>> +next_attempts:
>> +    if (attempts >= range_max || attempts < min_attempts) {
>> +        goto next_addr;
>> +    }
>> +
>> +    attempts /= 2;
>> +    curr_dport = min_dport + (random_uint32() % range_dst);
>> +    curr_sport = min_sport + (random_uint32() % range_src);
>> +
>> +    goto another_port_round;
>> +
>
>here you should be able to spare the "goto next_addr" changing the
>condition of the if and enclosing the four lines above
>
>Just to resume some points raised in previous emails:
>
>I think that your remark about splitting the two port range iterations
>instead of keeping it nested could be a good idea.
 
>The first visible difference here is that in such case the dst port (of
>course, in case of DNAT) range would have precedence over the src
>manipulation in the resolution.  The second is that we could double the
>number of attempts, but that should not be a big deal if we keep the
>maximum number reasonable.


Agree with you.  It makes the dnat special case handle individually. So
I will post a new one for this.


>This is what the kernel does, and IMO it could fit our needs too.
>
>The second remark is about IP the range iteration. I wonder if we really
>need it at all for pat_protos. I may be wrong here, but it seems to me
>that the kernel doesn't do that.


The kernel conntrack only choose one ip address(Although there are range ip address)
to do nat and select the ports. But in the dpdk datapath we select all the ip address to
do nat and select the port.  So we should care the ip range.


By the way, any idea about the first patch of this series.
>
>It would be also nice if we have a chance to discuss about those points
>before proceeding, so every attempt to chime in/expand/object is
>welcome.
>
>
>>      /* Check if next IP is in range and respin. Otherwise, notify
>>       * exhaustion to the caller. */
>>  next_addr:
>> -- 
>> 1.8.3.1
>
diff mbox series

Patch

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 52b5211..02bc2b9 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2427,7 +2427,11 @@  nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
     uint16_t min_sport, max_sport, curr_sport, orig_sport;
     bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
                      conn->key.nw_proto == IPPROTO_UDP;
+    unsigned int attempts, max_attempts, min_attempts;
     uint16_t min_dport, max_dport, curr_dport;
+    uint16_t range_src, range_dst, range_max;
+    uint32_t range_addr;
+    unsigned int i;
 
     min_addr = nat_info->min_addr;
     max_addr = nat_info->max_addr;
@@ -2444,6 +2448,19 @@  nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
     set_dport_range(nat_info, &conn->key, hash, &curr_dport,
                     &min_dport, &max_dport);
 
+    range_src = max_sport - min_sport + 1;
+    range_dst = max_dport - min_dport + 1;
+    range_max = range_src > range_dst ? range_src : range_dst;
+    range_addr = ntohl(max_addr.ipv4) - ntohl(min_addr.ipv4) + 1;
+    max_attempts = 128 / range_addr;
+    if (max_attempts < 1) {
+        max_attempts = 1;
+    }
+    min_attempts = 16 / range_addr;
+    if (min_attempts < 2) {
+        min_attempts = 2;
+    }
+
 another_round:
     store_addr_to_key(&curr_addr, &nat_conn->rev_key,
                       nat_info->nat_action);
@@ -2459,17 +2476,39 @@  another_round:
 
     curr_sport = orig_sport;
 
+    attempts = range_max;
+    if (attempts > max_attempts) {
+        attempts = max_attempts;
+    }
+
+another_port_round:
+    i = 0;
     FOR_EACH_PORT_IN_RANGE(curr_dport, min_dport, max_dport) {
         nat_conn->rev_key.src.port = htons(curr_dport);
         FOR_EACH_PORT_IN_RANGE(curr_sport, min_sport, max_sport) {
-            nat_conn->rev_key.dst.port = htons(curr_sport);
-            if (!conn_lookup(ct, &nat_conn->rev_key,
-                             time_msec(), NULL, NULL)) {
-                return true;
+            if (i++ < attempts) {
+                nat_conn->rev_key.dst.port = htons(curr_sport);
+                if (!conn_lookup(ct, &nat_conn->rev_key,
+                                 time_msec(), NULL, NULL)) {
+                    return true;
+                }
+            } else {
+                goto next_attempts;
             }
         }
     }
 
+next_attempts:
+    if (attempts >= range_max || attempts < min_attempts) {
+        goto next_addr;
+    }
+
+    attempts /= 2;
+    curr_dport = min_dport + (random_uint32() % range_dst);
+    curr_sport = min_sport + (random_uint32() % range_src);
+
+    goto another_port_round;
+
     /* Check if next IP is in range and respin. Otherwise, notify
      * exhaustion to the caller. */
 next_addr: