diff mbox series

[ovs-dev,v10,3/3] conntrack: limit port clash resolution attempts

Message ID 1642477806-18538-3-git-send-email-wenxu@ucloud.cn
State Changes Requested
Headers show
Series [ovs-dev,v10,1/3] conntrack: select correct sport range for well-known origin sport | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test fail github build: failed

Commit Message

wenxu Jan. 18, 2022, 3:50 a.m. UTC
From: wenxu <wenxu@ucloud.cn>

In case almost or all available ports are taken, clash resolution can
take a very long time, resulting in pmd hang in conntrack.

This can happen when many to-be-natted hosts connect to same
destination:port (e.g. a proxy) and all connections pass the same SNAT.

Pick a random offset in the acceptable range, then try ever smaller
number of adjacent port numbers, until either the limit is reached or a
useable port was found.  This results in at most 248 attempts
(128 + 64 + 32 + 16 + 8, i.e. 4 restarts with new search offset)
instead of 64000+.

And if thenumber of ip address will limit the max attempts and which
will lead the total attempts under 248.

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 lib/conntrack.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 8 deletions(-)

Comments

Paolo Valerio Jan. 30, 2022, 9:28 p.m. UTC | #1
wenxu@ucloud.cn writes:

> From: wenxu <wenxu@ucloud.cn>
>
> In case almost or all available ports are taken, clash resolution can
> take a very long time, resulting in pmd hang in conntrack.
>
> This can happen when many to-be-natted hosts connect to same
> destination:port (e.g. a proxy) and all connections pass the same SNAT.
>
> Pick a random offset in the acceptable range, then try ever smaller
> number of adjacent port numbers, until either the limit is reached or a
> useable port was found.  This results in at most 248 attempts
> (128 + 64 + 32 + 16 + 8, i.e. 4 restarts with new search offset)
> instead of 64000+.
>
> And if thenumber of ip address will limit the max attempts and which
> will lead the total attempts under 248.
>
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> ---
>  lib/conntrack.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 63 insertions(+), 8 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index dc29c9b..248cd45 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -2400,19 +2400,50 @@ next_addr_in_range_guarded(union ct_addr *curr, union ct_addr *min,
>  static bool
>  nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn,
>                    ovs_be16 *port, uint16_t curr, uint16_t min,
> -                  uint16_t max)
> +                  uint16_t max, unsigned int attempts)
>  {
> +    unsigned int i = 0;
> +
>      FOR_EACH_PORT_IN_RANGE(curr, min, max) {
> -        *port = htons(curr);
> -        if (!conn_lookup(ct, &nat_conn->rev_key,
> -                         time_msec(), NULL, NULL)) {
> -            return true;
> +        if (i++ < attempts) {
> +            *port = htons(curr);
> +            if (!conn_lookup(ct, &nat_conn->rev_key,
> +                             time_msec(), NULL, NULL)) {
> +                return true;
> +            }
> +        } else {
> +            break;
>          }
>      }
>  
>      return false;
>  }
>  
> +static void
> +nat_get_attempts_range(union ct_addr *min_addr, union ct_addr *max_addr,
> +                       uint16_t dl_type, unsigned int *max_attempts,
> +                       unsigned int *min_attempts)
> +{
> +    uint32_t range_addr;
> +
> +    if (dl_type == htons(ETH_TYPE_IP)) {
> +        range_addr = ntohl(max_addr->ipv4) - ntohl(min_addr->ipv4) + 1;
> +    } else {
> +        range_addr = nat_ipv6_addrs_delta(&min_addr->ipv6,
> +                                          &max_addr->ipv6) + 1;
> +    }
> +
> +    *max_attempts = 128 / range_addr;
> +    if (*max_attempts < 1) {
> +        *max_attempts = 1;
> +    }
> +
> +    *min_attempts = 16 / range_addr;
> +    if (*min_attempts < 2) {
> +        *min_attempts = 2;
> +    }
> +}
> +
>  /* This function tries to get a unique tuple.
>   * Every iteration checks that the reverse tuple doesn't
>   * collide with any existing one.
> @@ -2446,6 +2477,9 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
>      uint16_t min_dport, max_dport, curr_dport, orig_dport;
>      bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
>                       conn->key.nw_proto == IPPROTO_UDP;
> +    unsigned int attempts, max_attempts, min_attempts;
> +    uint16_t range_src, range_dst, range_max;
> +    bool found;
>  
>      min_addr = nat_info->min_addr;
>      max_addr = nat_info->max_addr;
> @@ -2462,6 +2496,13 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
>      set_dport_range(nat_info, &conn->key, hash, &orig_dport,
>                      &min_dport, &max_dport);
>  
> +    range_src = max_sport - min_sport + 1;
> +    range_dst = max_dport - min_dport + 1;
> +    range_max = range_src > range_dst ? range_src : range_dst;
> +

nit: these could be added in nat_get_attempts_range(), right?

> +    nat_get_attempts_range(&min_addr, &max_addr, conn->key.dl_type,
> +                           &max_attempts, &min_attempts);
> +
>  another_round:
>      store_addr_to_key(&curr_addr, &nat_conn->rev_key,
>                        nat_info->nat_action);
> @@ -2481,10 +2522,16 @@ another_round:
>      nat_conn->rev_key.src.port = htons(curr_dport);
>      nat_conn->rev_key.dst.port = htons(curr_sport);
>  
> -    bool found = false;
> +    attempts = range_max;
> +    if (attempts > max_attempts) {
> +        attempts = max_attempts;
> +    }
> +
> +another_port_round:
> +    found = false;
>      if (nat_info->nat_action & NAT_ACTION_DST_PORT) {
>          found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.src.port,
> -                                  curr_dport, min_dport, max_dport);
> +                                  curr_dport, min_dport, max_dport, attempts);
>          if (!found) {
>              nat_conn->rev_key.src.port = htons(orig_dport);
>          }

I still see some problems, though.

- dnat ranges smaller than the ephemeral set (almost always the case)
for snat will be checked (sometimes entirely) every time a random source
port subrange is checked.

Let's say you have a single IP address, a dport range like [80,85], and
so [1024, 65535] as source port range.

what happens is that 80-85 is tested entirely 5 times (assuming the
patch aims to restore the ports for each port round).

- large ip ranges will never iterate over ports

In this case, as far as I can see, using a large range like
actions=ct(commit,nat(src=10.1.1.100-10.1.1.164)

- a range slightly larger than 128 (single IP), will basically always
  test the same ports

here are some potential ideas that needs to be verified:

Probably, treating the ranges independently may be helpful for the first
problem, e.g. moving the another_port_round logic into
nat_get_unique_l4() and avoiding:

range_max = range_src > range_dst ? range_src : range_dst;

The second problem could be solved by removing (or by reducing to a very
small fixed maximum) the IP iterations, and just picking the IP address
with the hash (or according to some logic like the least used IP).

The third problem may depend on the previous solution. The number of
IPs in the range affect the number of subranges, thus the solution may
vary.

My suggestion though is to post the first two paches separately (as they
seem to be basically ready, and IMO, they make sense even separately from
this patch).
It may also be the case to at least evaluate if a slightly different
approach with this change may give better results (maybe also based on
some potential suggestions/discussions coming from the list).

Paolo

> @@ -2492,13 +2539,21 @@ another_round:
>  
>      if (!found) {
>          found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.dst.port,
> -                                  curr_sport, min_sport, max_sport);
> +                                  curr_sport, min_sport, max_sport, attempts);
>      }
>  
>      if (found) {
>          return true;
>      }
>  
> +    if (attempts < range_max && attempts >= min_attempts) {
> +        attempts /= 2;
> +        curr_dport = min_dport + (random_uint32() % range_dst);
> +        curr_sport = min_sport + (random_uint32() % range_src);
> +
> +        goto another_port_round;
> +    }
> +
>      /* Check if next IP is in range and respin. Otherwise, notify
>       * exhaustion to the caller. */
>  next_addr:
> -- 
> 1.8.3.1
diff mbox series

Patch

diff --git a/lib/conntrack.c b/lib/conntrack.c
index dc29c9b..248cd45 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2400,19 +2400,50 @@  next_addr_in_range_guarded(union ct_addr *curr, union ct_addr *min,
 static bool
 nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn,
                   ovs_be16 *port, uint16_t curr, uint16_t min,
-                  uint16_t max)
+                  uint16_t max, unsigned int attempts)
 {
+    unsigned int i = 0;
+
     FOR_EACH_PORT_IN_RANGE(curr, min, max) {
-        *port = htons(curr);
-        if (!conn_lookup(ct, &nat_conn->rev_key,
-                         time_msec(), NULL, NULL)) {
-            return true;
+        if (i++ < attempts) {
+            *port = htons(curr);
+            if (!conn_lookup(ct, &nat_conn->rev_key,
+                             time_msec(), NULL, NULL)) {
+                return true;
+            }
+        } else {
+            break;
         }
     }
 
     return false;
 }
 
+static void
+nat_get_attempts_range(union ct_addr *min_addr, union ct_addr *max_addr,
+                       uint16_t dl_type, unsigned int *max_attempts,
+                       unsigned int *min_attempts)
+{
+    uint32_t range_addr;
+
+    if (dl_type == htons(ETH_TYPE_IP)) {
+        range_addr = ntohl(max_addr->ipv4) - ntohl(min_addr->ipv4) + 1;
+    } else {
+        range_addr = nat_ipv6_addrs_delta(&min_addr->ipv6,
+                                          &max_addr->ipv6) + 1;
+    }
+
+    *max_attempts = 128 / range_addr;
+    if (*max_attempts < 1) {
+        *max_attempts = 1;
+    }
+
+    *min_attempts = 16 / range_addr;
+    if (*min_attempts < 2) {
+        *min_attempts = 2;
+    }
+}
+
 /* This function tries to get a unique tuple.
  * Every iteration checks that the reverse tuple doesn't
  * collide with any existing one.
@@ -2446,6 +2477,9 @@  nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
     uint16_t min_dport, max_dport, curr_dport, orig_dport;
     bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
                      conn->key.nw_proto == IPPROTO_UDP;
+    unsigned int attempts, max_attempts, min_attempts;
+    uint16_t range_src, range_dst, range_max;
+    bool found;
 
     min_addr = nat_info->min_addr;
     max_addr = nat_info->max_addr;
@@ -2462,6 +2496,13 @@  nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
     set_dport_range(nat_info, &conn->key, hash, &orig_dport,
                     &min_dport, &max_dport);
 
+    range_src = max_sport - min_sport + 1;
+    range_dst = max_dport - min_dport + 1;
+    range_max = range_src > range_dst ? range_src : range_dst;
+
+    nat_get_attempts_range(&min_addr, &max_addr, conn->key.dl_type,
+                           &max_attempts, &min_attempts);
+
 another_round:
     store_addr_to_key(&curr_addr, &nat_conn->rev_key,
                       nat_info->nat_action);
@@ -2481,10 +2522,16 @@  another_round:
     nat_conn->rev_key.src.port = htons(curr_dport);
     nat_conn->rev_key.dst.port = htons(curr_sport);
 
-    bool found = false;
+    attempts = range_max;
+    if (attempts > max_attempts) {
+        attempts = max_attempts;
+    }
+
+another_port_round:
+    found = false;
     if (nat_info->nat_action & NAT_ACTION_DST_PORT) {
         found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.src.port,
-                                  curr_dport, min_dport, max_dport);
+                                  curr_dport, min_dport, max_dport, attempts);
         if (!found) {
             nat_conn->rev_key.src.port = htons(orig_dport);
         }
@@ -2492,13 +2539,21 @@  another_round:
 
     if (!found) {
         found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.dst.port,
-                                  curr_sport, min_sport, max_sport);
+                                  curr_sport, min_sport, max_sport, attempts);
     }
 
     if (found) {
         return true;
     }
 
+    if (attempts < range_max && attempts >= min_attempts) {
+        attempts /= 2;
+        curr_dport = min_dport + (random_uint32() % range_dst);
+        curr_sport = min_sport + (random_uint32() % range_src);
+
+        goto another_port_round;
+    }
+
     /* Check if next IP is in range and respin. Otherwise, notify
      * exhaustion to the caller. */
 next_addr: