Message ID | 1642477806-18538-3-git-send-email-wenxu@ucloud.cn |
---|---|
State | Changes Requested |
Headers | show |
Series | [ovs-dev,v10,1/3] conntrack: select correct sport range for well-known origin sport | expand |
Context | Check | Description |
---|---|---|
ovsrobot/apply-robot | success | apply and check: success |
ovsrobot/github-robot-_Build_and_Test | fail | github build: failed |
wenxu@ucloud.cn writes: > From: wenxu <wenxu@ucloud.cn> > > In case almost or all available ports are taken, clash resolution can > take a very long time, resulting in pmd hang in conntrack. > > This can happen when many to-be-natted hosts connect to same > destination:port (e.g. a proxy) and all connections pass the same SNAT. > > Pick a random offset in the acceptable range, then try ever smaller > number of adjacent port numbers, until either the limit is reached or a > useable port was found. This results in at most 248 attempts > (128 + 64 + 32 + 16 + 8, i.e. 4 restarts with new search offset) > instead of 64000+. > > And if thenumber of ip address will limit the max attempts and which > will lead the total attempts under 248. > > Signed-off-by: wenxu <wenxu@ucloud.cn> > --- > lib/conntrack.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 63 insertions(+), 8 deletions(-) > > diff --git a/lib/conntrack.c b/lib/conntrack.c > index dc29c9b..248cd45 100644 > --- a/lib/conntrack.c > +++ b/lib/conntrack.c > @@ -2400,19 +2400,50 @@ next_addr_in_range_guarded(union ct_addr *curr, union ct_addr *min, > static bool > nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn, > ovs_be16 *port, uint16_t curr, uint16_t min, > - uint16_t max) > + uint16_t max, unsigned int attempts) > { > + unsigned int i = 0; > + > FOR_EACH_PORT_IN_RANGE(curr, min, max) { > - *port = htons(curr); > - if (!conn_lookup(ct, &nat_conn->rev_key, > - time_msec(), NULL, NULL)) { > - return true; > + if (i++ < attempts) { > + *port = htons(curr); > + if (!conn_lookup(ct, &nat_conn->rev_key, > + time_msec(), NULL, NULL)) { > + return true; > + } > + } else { > + break; > } > } > > return false; > } > > +static void > +nat_get_attempts_range(union ct_addr *min_addr, union ct_addr *max_addr, > + uint16_t dl_type, unsigned int *max_attempts, > + unsigned int *min_attempts) > +{ > + uint32_t range_addr; > + > + if (dl_type == htons(ETH_TYPE_IP)) { > + range_addr = ntohl(max_addr->ipv4) - ntohl(min_addr->ipv4) + 1; > + } else { > + range_addr = nat_ipv6_addrs_delta(&min_addr->ipv6, > + &max_addr->ipv6) + 1; > + } > + > + *max_attempts = 128 / range_addr; > + if (*max_attempts < 1) { > + *max_attempts = 1; > + } > + > + *min_attempts = 16 / range_addr; > + if (*min_attempts < 2) { > + *min_attempts = 2; > + } > +} > + > /* This function tries to get a unique tuple. > * Every iteration checks that the reverse tuple doesn't > * collide with any existing one. > @@ -2446,6 +2477,9 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn, > uint16_t min_dport, max_dport, curr_dport, orig_dport; > bool pat_proto = conn->key.nw_proto == IPPROTO_TCP || > conn->key.nw_proto == IPPROTO_UDP; > + unsigned int attempts, max_attempts, min_attempts; > + uint16_t range_src, range_dst, range_max; > + bool found; > > min_addr = nat_info->min_addr; > max_addr = nat_info->max_addr; > @@ -2462,6 +2496,13 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn, > set_dport_range(nat_info, &conn->key, hash, &orig_dport, > &min_dport, &max_dport); > > + range_src = max_sport - min_sport + 1; > + range_dst = max_dport - min_dport + 1; > + range_max = range_src > range_dst ? range_src : range_dst; > + nit: these could be added in nat_get_attempts_range(), right? > + nat_get_attempts_range(&min_addr, &max_addr, conn->key.dl_type, > + &max_attempts, &min_attempts); > + > another_round: > store_addr_to_key(&curr_addr, &nat_conn->rev_key, > nat_info->nat_action); > @@ -2481,10 +2522,16 @@ another_round: > nat_conn->rev_key.src.port = htons(curr_dport); > nat_conn->rev_key.dst.port = htons(curr_sport); > > - bool found = false; > + attempts = range_max; > + if (attempts > max_attempts) { > + attempts = max_attempts; > + } > + > +another_port_round: > + found = false; > if (nat_info->nat_action & NAT_ACTION_DST_PORT) { > found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.src.port, > - curr_dport, min_dport, max_dport); > + curr_dport, min_dport, max_dport, attempts); > if (!found) { > nat_conn->rev_key.src.port = htons(orig_dport); > } I still see some problems, though. - dnat ranges smaller than the ephemeral set (almost always the case) for snat will be checked (sometimes entirely) every time a random source port subrange is checked. Let's say you have a single IP address, a dport range like [80,85], and so [1024, 65535] as source port range. what happens is that 80-85 is tested entirely 5 times (assuming the patch aims to restore the ports for each port round). - large ip ranges will never iterate over ports In this case, as far as I can see, using a large range like actions=ct(commit,nat(src=10.1.1.100-10.1.1.164) - a range slightly larger than 128 (single IP), will basically always test the same ports here are some potential ideas that needs to be verified: Probably, treating the ranges independently may be helpful for the first problem, e.g. moving the another_port_round logic into nat_get_unique_l4() and avoiding: range_max = range_src > range_dst ? range_src : range_dst; The second problem could be solved by removing (or by reducing to a very small fixed maximum) the IP iterations, and just picking the IP address with the hash (or according to some logic like the least used IP). The third problem may depend on the previous solution. The number of IPs in the range affect the number of subranges, thus the solution may vary. My suggestion though is to post the first two paches separately (as they seem to be basically ready, and IMO, they make sense even separately from this patch). It may also be the case to at least evaluate if a slightly different approach with this change may give better results (maybe also based on some potential suggestions/discussions coming from the list). Paolo > @@ -2492,13 +2539,21 @@ another_round: > > if (!found) { > found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.dst.port, > - curr_sport, min_sport, max_sport); > + curr_sport, min_sport, max_sport, attempts); > } > > if (found) { > return true; > } > > + if (attempts < range_max && attempts >= min_attempts) { > + attempts /= 2; > + curr_dport = min_dport + (random_uint32() % range_dst); > + curr_sport = min_sport + (random_uint32() % range_src); > + > + goto another_port_round; > + } > + > /* Check if next IP is in range and respin. Otherwise, notify > * exhaustion to the caller. */ > next_addr: > -- > 1.8.3.1
diff --git a/lib/conntrack.c b/lib/conntrack.c index dc29c9b..248cd45 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2400,19 +2400,50 @@ next_addr_in_range_guarded(union ct_addr *curr, union ct_addr *min, static bool nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn, ovs_be16 *port, uint16_t curr, uint16_t min, - uint16_t max) + uint16_t max, unsigned int attempts) { + unsigned int i = 0; + FOR_EACH_PORT_IN_RANGE(curr, min, max) { - *port = htons(curr); - if (!conn_lookup(ct, &nat_conn->rev_key, - time_msec(), NULL, NULL)) { - return true; + if (i++ < attempts) { + *port = htons(curr); + if (!conn_lookup(ct, &nat_conn->rev_key, + time_msec(), NULL, NULL)) { + return true; + } + } else { + break; } } return false; } +static void +nat_get_attempts_range(union ct_addr *min_addr, union ct_addr *max_addr, + uint16_t dl_type, unsigned int *max_attempts, + unsigned int *min_attempts) +{ + uint32_t range_addr; + + if (dl_type == htons(ETH_TYPE_IP)) { + range_addr = ntohl(max_addr->ipv4) - ntohl(min_addr->ipv4) + 1; + } else { + range_addr = nat_ipv6_addrs_delta(&min_addr->ipv6, + &max_addr->ipv6) + 1; + } + + *max_attempts = 128 / range_addr; + if (*max_attempts < 1) { + *max_attempts = 1; + } + + *min_attempts = 16 / range_addr; + if (*min_attempts < 2) { + *min_attempts = 2; + } +} + /* This function tries to get a unique tuple. * Every iteration checks that the reverse tuple doesn't * collide with any existing one. @@ -2446,6 +2477,9 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn, uint16_t min_dport, max_dport, curr_dport, orig_dport; bool pat_proto = conn->key.nw_proto == IPPROTO_TCP || conn->key.nw_proto == IPPROTO_UDP; + unsigned int attempts, max_attempts, min_attempts; + uint16_t range_src, range_dst, range_max; + bool found; min_addr = nat_info->min_addr; max_addr = nat_info->max_addr; @@ -2462,6 +2496,13 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn, set_dport_range(nat_info, &conn->key, hash, &orig_dport, &min_dport, &max_dport); + range_src = max_sport - min_sport + 1; + range_dst = max_dport - min_dport + 1; + range_max = range_src > range_dst ? range_src : range_dst; + + nat_get_attempts_range(&min_addr, &max_addr, conn->key.dl_type, + &max_attempts, &min_attempts); + another_round: store_addr_to_key(&curr_addr, &nat_conn->rev_key, nat_info->nat_action); @@ -2481,10 +2522,16 @@ another_round: nat_conn->rev_key.src.port = htons(curr_dport); nat_conn->rev_key.dst.port = htons(curr_sport); - bool found = false; + attempts = range_max; + if (attempts > max_attempts) { + attempts = max_attempts; + } + +another_port_round: + found = false; if (nat_info->nat_action & NAT_ACTION_DST_PORT) { found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.src.port, - curr_dport, min_dport, max_dport); + curr_dport, min_dport, max_dport, attempts); if (!found) { nat_conn->rev_key.src.port = htons(orig_dport); } @@ -2492,13 +2539,21 @@ another_round: if (!found) { found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.dst.port, - curr_sport, min_sport, max_sport); + curr_sport, min_sport, max_sport, attempts); } if (found) { return true; } + if (attempts < range_max && attempts >= min_attempts) { + attempts /= 2; + curr_dport = min_dport + (random_uint32() % range_dst); + curr_sport = min_sport + (random_uint32() % range_src); + + goto another_port_round; + } + /* Check if next IP is in range and respin. Otherwise, notify * exhaustion to the caller. */ next_addr: