diff mbox series

[ovs-dev,v8,3/3] conntrack: limit port clash resolution attempts

Message ID 1639453166-27494-3-git-send-email-wenxu@ucloud.cn
State Superseded
Headers show
Series [ovs-dev,v8,1/3] conntrack: select correct sport range for well-known origin sport | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed

Commit Message

wenxu Dec. 14, 2021, 3:39 a.m. UTC
From: wenxu <wenxu@ucloud.cn>

In case almost or all available ports are taken, clash resolution can
take a very long time, resulting in pmd hang in conntrack.

This can happen when many to-be-natted hosts connect to same
destination:port (e.g. a proxy) and all connections pass the same SNAT.

Pick a random offset in the acceptable range, then try ever smaller
number of adjacent port numbers, until either the limit is reached or a
useable port was found.  This results in at most 248 attempts
(128 + 64 + 32 + 16 + 8, i.e. 4 restarts with new search offset)
instead of 64000+.

And if thenumber of ip address will limit the max attempts and which
will lead the total attempts under 248.

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 lib/conntrack.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 9 deletions(-)

Comments

Paolo Valerio Jan. 12, 2022, 10:19 a.m. UTC | #1
Hello wenxu,

I tested a bit more the patch, and it seems to effectively limit the
number of attempts. There is a case with a sufficiently large port range
that will always tries the same ports.
E.g. (incresing the IPs you can reduce the port range):

actions=ct(commit,nat(dst=10.1.1.100-10.1.1.101:80-144)

in this case the source port will never get the chance to resolve the
clash and the only IPs/ports tested would be the ones above.

wenxu@ucloud.cn writes:

> From: wenxu <wenxu@ucloud.cn>
>
> In case almost or all available ports are taken, clash resolution can
> take a very long time, resulting in pmd hang in conntrack.
>
> This can happen when many to-be-natted hosts connect to same
> destination:port (e.g. a proxy) and all connections pass the same SNAT.
>
> Pick a random offset in the acceptable range, then try ever smaller
> number of adjacent port numbers, until either the limit is reached or a
> useable port was found.  This results in at most 248 attempts
> (128 + 64 + 32 + 16 + 8, i.e. 4 restarts with new search offset)
> instead of 64000+.
>
> And if thenumber of ip address will limit the max attempts and which
> will lead the total attempts under 248.
>
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> ---
>  lib/conntrack.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 56 insertions(+), 9 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 2a5d72a..dae8dd7 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -2426,10 +2426,14 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
>      union ct_addr min_addr = {0}, max_addr = {0}, curr_addr = {0},
>                    guard_addr = {0};
>      uint32_t hash = nat_range_hash(conn, ct->hash_basis, nat_info);
> +    uint16_t min_sport, max_sport, curr_sport, orig_sport;
>      bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
>                       conn->key.nw_proto == IPPROTO_UDP;
> +    unsigned int attempts, max_attempts, min_attempts;
>      uint16_t min_dport, max_dport, curr_dport;
> -    uint16_t min_sport, max_sport, curr_sport;
> +    uint16_t range_src, range_dst, range_max;
> +    uint32_t range_addr;
> +    unsigned int i;
>  
>      min_addr = nat_info->min_addr;
>      max_addr = nat_info->max_addr;
> @@ -2441,11 +2445,29 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
>       * we can stop once we reach it. */
>      guard_addr = curr_addr;
>  
> -    set_sport_range(nat_info, &conn->key, hash, &curr_sport,
> +    set_sport_range(nat_info, &conn->key, hash, &orig_sport,
>                      &min_sport, &max_sport);
>      set_dport_range(nat_info, &conn->key, hash, &curr_dport,
>                      &min_dport, &max_dport);
>  
> +    range_src = max_sport - min_sport + 1;
> +    range_dst = max_dport - min_dport + 1;
> +    range_max = range_src > range_dst ? range_src : range_dst;
> +    if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> +        range_addr = ntohl(max_addr.ipv4) - ntohl(min_addr.ipv4) + 1;
> +    } else {
> +        range_addr = nat_ipv6_addrs_delta(&nat_info->min_addr.ipv6,
> +                                          &nat_info->max_addr.ipv6) + 1;
> +    }
> +    max_attempts = 128 / range_addr;
> +    if (max_attempts < 1) {
> +        max_attempts = 1;
> +    }
> +    min_attempts = 16 / range_addr;
> +    if (min_attempts < 2) {
> +        min_attempts = 2;
> +    }
> +
>  another_round:
>      store_addr_to_key(&curr_addr, &nat_conn->rev_key,
>                        nat_info->nat_action);
> @@ -2459,22 +2481,47 @@ another_round:
>          goto next_addr;
>      }
>  
> +    curr_sport = orig_sport;

I think that you should restore the dport as well, right?

> +
> +    attempts = range_max;
> +    if (attempts > max_attempts) {
> +        attempts = max_attempts;
> +    }
> +
> +another_port_round:
> +    i = 0;
>      if (nat_info->nat_action & NAT_ACTION_DST_PORT) {
>          FOR_EACH_PORT_IN_RANGE(curr_dport, min_dport, max_dport) {
> -            nat_conn->rev_key.src.port = htons(curr_dport);
> +            if (i++ < attempts) {
> +                nat_conn->rev_key.src.port = htons(curr_dport);
> +                if (!conn_lookup(ct, &nat_conn->rev_key,
> +                                 time_msec(), NULL, NULL)) {
> +                    return true;
> +                }
> +            } else {
> +                break;

I don't know if it's really a problem (and maybe you noticed
already), but breaking before you go through the whole range will change
the dport (that is, it will not use the initial destination port) during
the the next clash resolution (based on the source port).

All in all, the patch does its job, but probably the DNAT case above
should be kept in mind.

> +            }
> +        }
> +    }
> +
> +    FOR_EACH_PORT_IN_RANGE(curr_sport, min_sport, max_sport) {
> +        if (i++ < attempts) {
> +            nat_conn->rev_key.dst.port = htons(curr_sport);
>              if (!conn_lookup(ct, &nat_conn->rev_key,
>                               time_msec(), NULL, NULL)) {
>                  return true;
>              }
> +        } else {
> +            break;
>          }
>      }
>  
> -    FOR_EACH_PORT_IN_RANGE(curr_sport, min_sport, max_sport) {
> -        nat_conn->rev_key.dst.port = htons(curr_sport);
> -        if (!conn_lookup(ct, &nat_conn->rev_key,
> -                         time_msec(), NULL, NULL)) {
> -            return true;
> -        }
> +    if (attempts < range_max && attempts >= min_attempts) {
> +        attempts /= 2;
> +        curr_dport = min_dport + (random_uint32() % range_dst);
> +        curr_sport = min_sport + (random_uint32() % range_src);
> +
> +        goto another_port_round;
>      }
>  
>      /* Check if next IP is in range and respin. Otherwise, notify
> -- 
> 1.8.3.1
wenxu Jan. 13, 2022, 2:10 a.m. UTC | #2
From: Paolo Valerio <pvalerio@redhat.com>
Date: 2022-01-12 18:19:25
To:  wenxu@ucloud.cn,i.maximets@ovn.org
Cc:  dev@openvswitch.org
Subject: Re: [PATCH v8 3/3] conntrack: limit port clash resolution attempts>Hello wenxu,
>
>I tested a bit more the patch, and it seems to effectively limit the
>number of attempts. There is a case with a sufficiently large port range
>that will always tries the same ports.
>E.g. (incresing the IPs you can reduce the port range):
>
>actions=ct(commit,nat(dst=10.1.1.100-10.1.1.101:80-144)
>
>in this case the source port will never get the chance to resolve the
>clash and the only IPs/ports tested would be the ones above.
Yes , for dnat case the source port resolve should be restore attempts and dport 
>
>wenxu@ucloud.cn writes:
>
>> From: wenxu <wenxu@ucloud.cn>
>>
>> In case almost or all available ports are taken, clash resolution can
>> take a very long time, resulting in pmd hang in conntrack.
>>
>> This can happen when many to-be-natted hosts connect to same
>> destination:port (e.g. a proxy) and all connections pass the same SNAT.
>>
>> Pick a random offset in the acceptable range, then try ever smaller
>> number of adjacent port numbers, until either the limit is reached or a
>> useable port was found.  This results in at most 248 attempts
>> (128 + 64 + 32 + 16 + 8, i.e. 4 restarts with new search offset)
>> instead of 64000+.
>>
>> And if thenumber of ip address will limit the max attempts and which
>> will lead the total attempts under 248.
>>
>> Signed-off-by: wenxu <wenxu@ucloud.cn>
>> ---
>>  lib/conntrack.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++--------
>>  1 file changed, 56 insertions(+), 9 deletions(-)
>>
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> index 2a5d72a..dae8dd7 100644
>> --- a/lib/conntrack.c
>> +++ b/lib/conntrack.c
>> @@ -2426,10 +2426,14 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
>>      union ct_addr min_addr = {0}, max_addr = {0}, curr_addr = {0},
>>                    guard_addr = {0};
>>      uint32_t hash = nat_range_hash(conn, ct->hash_basis, nat_info);
>> +    uint16_t min_sport, max_sport, curr_sport, orig_sport;
>>      bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
>>                       conn->key.nw_proto == IPPROTO_UDP;
>> +    unsigned int attempts, max_attempts, min_attempts;
>>      uint16_t min_dport, max_dport, curr_dport;
>> -    uint16_t min_sport, max_sport, curr_sport;
>> +    uint16_t range_src, range_dst, range_max;
>> +    uint32_t range_addr;
>> +    unsigned int i;
>>  
>>      min_addr = nat_info->min_addr;
>>      max_addr = nat_info->max_addr;
>> @@ -2441,11 +2445,29 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
>>       * we can stop once we reach it. */
>>      guard_addr = curr_addr;
>>  
>> -    set_sport_range(nat_info, &conn->key, hash, &curr_sport,
>> +    set_sport_range(nat_info, &conn->key, hash, &orig_sport,
>>                      &min_sport, &max_sport);
>>      set_dport_range(nat_info, &conn->key, hash, &curr_dport,
>>                      &min_dport, &max_dport);
>>  
>> +    range_src = max_sport - min_sport + 1;
>> +    range_dst = max_dport - min_dport + 1;
>> +    range_max = range_src > range_dst ? range_src : range_dst;
>> +    if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
>> +        range_addr = ntohl(max_addr.ipv4) - ntohl(min_addr.ipv4) + 1;
>> +    } else {
>> +        range_addr = nat_ipv6_addrs_delta(&nat_info->min_addr.ipv6,
>> +                                          &nat_info->max_addr.ipv6) + 1;
>> +    }
>> +    max_attempts = 128 / range_addr;
>> +    if (max_attempts < 1) {
>> +        max_attempts = 1;
>> +    }
>> +    min_attempts = 16 / range_addr;
>> +    if (min_attempts < 2) {
>> +        min_attempts = 2;
>> +    }
>> +
>>  another_round:
>>      store_addr_to_key(&curr_addr, &nat_conn->rev_key,
>>                        nat_info->nat_action);
>> @@ -2459,22 +2481,47 @@ another_round:
>>          goto next_addr;
>>      }
>>  
>> +    curr_sport = orig_sport;
>
>I think that you should restore the dport as well, right?
>
>> +
>> +    attempts = range_max;
>> +    if (attempts > max_attempts) {
>> +        attempts = max_attempts;
>> +    }
>> +
>> +another_port_round:
>> +    i = 0;
>>      if (nat_info->nat_action & NAT_ACTION_DST_PORT) {
>>          FOR_EACH_PORT_IN_RANGE(curr_dport, min_dport, max_dport) {
>> -            nat_conn->rev_key.src.port = htons(curr_dport);
>> +            if (i++ < attempts) {
>> +                nat_conn->rev_key.src.port = htons(curr_dport);
>> +                if (!conn_lookup(ct, &nat_conn->rev_key,
>> +                                 time_msec(), NULL, NULL)) {
>> +                    return true;
>> +                }
>> +            } else {
>> +                break;
>
>I don't know if it's really a problem (and maybe you noticed
>already), but breaking before you go through the whole range will change
>the dport (that is, it will not use the initial destination port) during
>the the next clash resolution (based on the source port).
>
>All in all, the patch does its job, but probably the DNAT case above
>should be kept in mind.
>
>> +            }
>> +        }
>> +    }
>> +
>> +    FOR_EACH_PORT_IN_RANGE(curr_sport, min_sport, max_sport) {
>> +        if (i++ < attempts) {
>> +            nat_conn->rev_key.dst.port = htons(curr_sport);
>>              if (!conn_lookup(ct, &nat_conn->rev_key,
>>                               time_msec(), NULL, NULL)) {
>>                  return true;
>>              }
>> +        } else {
>> +            break;
>>          }
>>      }
>>  
>> -    FOR_EACH_PORT_IN_RANGE(curr_sport, min_sport, max_sport) {
>> -        nat_conn->rev_key.dst.port = htons(curr_sport);
>> -        if (!conn_lookup(ct, &nat_conn->rev_key,
>> -                         time_msec(), NULL, NULL)) {
>> -            return true;
>> -        }
>> +    if (attempts < range_max && attempts >= min_attempts) {
>> +        attempts /= 2;
>> +        curr_dport = min_dport + (random_uint32() % range_dst);
>> +        curr_sport = min_sport + (random_uint32() % range_src);
>> +
>> +        goto another_port_round;
>>      }
>>  
>>      /* Check if next IP is in range and respin. Otherwise, notify
>> -- 
>> 1.8.3.1
>
diff mbox series

Patch

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 2a5d72a..dae8dd7 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2426,10 +2426,14 @@  nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
     union ct_addr min_addr = {0}, max_addr = {0}, curr_addr = {0},
                   guard_addr = {0};
     uint32_t hash = nat_range_hash(conn, ct->hash_basis, nat_info);
+    uint16_t min_sport, max_sport, curr_sport, orig_sport;
     bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
                      conn->key.nw_proto == IPPROTO_UDP;
+    unsigned int attempts, max_attempts, min_attempts;
     uint16_t min_dport, max_dport, curr_dport;
-    uint16_t min_sport, max_sport, curr_sport;
+    uint16_t range_src, range_dst, range_max;
+    uint32_t range_addr;
+    unsigned int i;
 
     min_addr = nat_info->min_addr;
     max_addr = nat_info->max_addr;
@@ -2441,11 +2445,29 @@  nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
      * we can stop once we reach it. */
     guard_addr = curr_addr;
 
-    set_sport_range(nat_info, &conn->key, hash, &curr_sport,
+    set_sport_range(nat_info, &conn->key, hash, &orig_sport,
                     &min_sport, &max_sport);
     set_dport_range(nat_info, &conn->key, hash, &curr_dport,
                     &min_dport, &max_dport);
 
+    range_src = max_sport - min_sport + 1;
+    range_dst = max_dport - min_dport + 1;
+    range_max = range_src > range_dst ? range_src : range_dst;
+    if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
+        range_addr = ntohl(max_addr.ipv4) - ntohl(min_addr.ipv4) + 1;
+    } else {
+        range_addr = nat_ipv6_addrs_delta(&nat_info->min_addr.ipv6,
+                                          &nat_info->max_addr.ipv6) + 1;
+    }
+    max_attempts = 128 / range_addr;
+    if (max_attempts < 1) {
+        max_attempts = 1;
+    }
+    min_attempts = 16 / range_addr;
+    if (min_attempts < 2) {
+        min_attempts = 2;
+    }
+
 another_round:
     store_addr_to_key(&curr_addr, &nat_conn->rev_key,
                       nat_info->nat_action);
@@ -2459,22 +2481,47 @@  another_round:
         goto next_addr;
     }
 
+    curr_sport = orig_sport;
+
+    attempts = range_max;
+    if (attempts > max_attempts) {
+        attempts = max_attempts;
+    }
+
+another_port_round:
+    i = 0;
     if (nat_info->nat_action & NAT_ACTION_DST_PORT) {
         FOR_EACH_PORT_IN_RANGE(curr_dport, min_dport, max_dport) {
-            nat_conn->rev_key.src.port = htons(curr_dport);
+            if (i++ < attempts) {
+                nat_conn->rev_key.src.port = htons(curr_dport);
+                if (!conn_lookup(ct, &nat_conn->rev_key,
+                                 time_msec(), NULL, NULL)) {
+                    return true;
+                }
+            } else {
+                break;
+            }
+        }
+    }
+
+    FOR_EACH_PORT_IN_RANGE(curr_sport, min_sport, max_sport) {
+        if (i++ < attempts) {
+            nat_conn->rev_key.dst.port = htons(curr_sport);
             if (!conn_lookup(ct, &nat_conn->rev_key,
                              time_msec(), NULL, NULL)) {
                 return true;
             }
+        } else {
+            break;
         }
     }
 
-    FOR_EACH_PORT_IN_RANGE(curr_sport, min_sport, max_sport) {
-        nat_conn->rev_key.dst.port = htons(curr_sport);
-        if (!conn_lookup(ct, &nat_conn->rev_key,
-                         time_msec(), NULL, NULL)) {
-            return true;
-        }
+    if (attempts < range_max && attempts >= min_attempts) {
+        attempts /= 2;
+        curr_dport = min_dport + (random_uint32() % range_dst);
+        curr_sport = min_sport + (random_uint32() % range_src);
+
+        goto another_port_round;
     }
 
     /* Check if next IP is in range and respin. Otherwise, notify