@@ -29,6 +29,7 @@
#include "openvswitch/list.h"
#include "openvswitch/types.h"
#include "packets.h"
+#include "rculist.h"
#include "unaligned.h"
#include "dp-packet.h"
@@ -86,6 +87,31 @@ struct alg_exp_node {
bool nat_rpl_dst;
};
+/* Timeouts: all the possible timeout states passed to
+ * conn_update_expiration() and conn_init_expiration() are listed here.
+ * Each name is prefixed with CT_TM_ to form the matching 'enum ct_timeout'
+ * constant.
+ *
+ * NOTE(review): the list carries names only, so the earlier wording "the
+ * value is in milliseconds" has nothing to refer to here -- confirm where
+ * the units should be documented. */
+#define CT_TIMEOUTS \
+ CT_TIMEOUT(TCP_FIRST_PACKET) \
+ CT_TIMEOUT(TCP_OPENING) \
+ CT_TIMEOUT(TCP_ESTABLISHED) \
+ CT_TIMEOUT(TCP_CLOSING) \
+ CT_TIMEOUT(TCP_FIN_WAIT) \
+ CT_TIMEOUT(TCP_CLOSED) \
+ CT_TIMEOUT(OTHER_FIRST) \
+ CT_TIMEOUT(OTHER_MULTIPLE) \
+ CT_TIMEOUT(OTHER_BIDIR) \
+ CT_TIMEOUT(ICMP_FIRST) \
+ CT_TIMEOUT(ICMP_REPLY)
+
+/* One CT_TM_<NAME> constant per CT_TIMEOUTS entry, in list order.  N_CT_TM
+ * comes last, so it equals the number of timeout states. */
+enum ct_timeout {
+#define CT_TIMEOUT(NAME) CT_TM_##NAME,
+ CT_TIMEOUTS
+#undef CT_TIMEOUT
+ N_CT_TM
+};
+
+/* Number of entries in the 'exp_lists' array of 'struct conntrack'. */
+#define EXP_LISTS 100
+
enum OVS_PACKED_ENUM ct_conn_type {
CT_CONN_TYPE_DEFAULT,
CT_CONN_TYPE_UN_NAT,
@@ -96,11 +122,16 @@ struct conn {
struct conn_key key;
struct conn_key rev_key;
struct conn_key parent_key; /* Only used for orig_tuple support. */
- struct ovs_list exp_node;
struct cmap_node cm_node;
uint16_t nat_action;
char *alg;
struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
+ atomic_flag reclaimed; /* False during the lifetime of the connection,
+ * True as soon as a thread has started freeing
+ * its memory. */
+
+ /* Inserted once by a PMD, then managed by the 'ct_clean' thread. */
+ struct rculist node;
/* Mutable data. */
struct ovs_mutex lock; /* Guards all mutable fields. */
@@ -116,7 +147,6 @@ struct conn {
/* Mutable data. */
bool seq_skew_dir; /* TCP sequence skew direction due to NATTing of FTP
* control messages; true if reply direction. */
- bool cleaned; /* True if cleaned from expiry lists. */
/* Immutable data. */
bool alg_related; /* True if alg data connection. */
@@ -132,22 +162,6 @@ enum ct_update_res {
CT_UPDATE_VALID_NEW,
};
-/* Timeouts: all the possible timeout states passed to update_expiration()
- * are listed here. The name will be prefix by CT_TM_ and the value is in
- * milliseconds */
-#define CT_TIMEOUTS \
- CT_TIMEOUT(TCP_FIRST_PACKET) \
- CT_TIMEOUT(TCP_OPENING) \
- CT_TIMEOUT(TCP_ESTABLISHED) \
- CT_TIMEOUT(TCP_CLOSING) \
- CT_TIMEOUT(TCP_FIN_WAIT) \
- CT_TIMEOUT(TCP_CLOSED) \
- CT_TIMEOUT(OTHER_FIRST) \
- CT_TIMEOUT(OTHER_MULTIPLE) \
- CT_TIMEOUT(OTHER_BIDIR) \
- CT_TIMEOUT(ICMP_FIRST) \
- CT_TIMEOUT(ICMP_REPLY)
-
#define NAT_ACTION_SNAT_ALL (NAT_ACTION_SRC | NAT_ACTION_SRC_PORT)
#define NAT_ACTION_DNAT_ALL (NAT_ACTION_DST | NAT_ACTION_DST_PORT)
@@ -181,22 +195,17 @@ enum ct_ephemeral_range {
#define FOR_EACH_PORT_IN_RANGE(curr, min, max) \
FOR_EACH_PORT_IN_RANGE__(curr, min, max, OVS_JOIN(idx, __COUNTER__))
-enum ct_timeout {
-#define CT_TIMEOUT(NAME) CT_TM_##NAME,
- CT_TIMEOUTS
-#undef CT_TIMEOUT
- N_CT_TM
-};
-
struct conntrack {
struct ovs_mutex ct_lock; /* Protects 2 following fields. */
struct cmap conns OVS_GUARDED;
- struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
+ struct rculist exp_lists[EXP_LISTS];
struct cmap zone_limits OVS_GUARDED;
struct cmap timeout_policies OVS_GUARDED;
uint32_t hash_basis; /* Salt for hashing a connection key. */
pthread_t clean_thread; /* Periodically cleans up connection tracker. */
struct latch clean_thread_exit; /* To destroy the 'clean_thread'. */
+ atomic_uint ct_next_list;
+ unsigned int next_sweep;
/* Counting connections. */
atomic_count n_conn; /* Number of connections currently tracked. */
@@ -216,8 +225,8 @@ struct conntrack {
};
/* Lock acquisition order:
- * 1. 'ct_lock'
- * 2. 'conn->lock'
+ * 1. 'conn->lock'
+ * 2. 'ct_lock'
* 3. 'resources_lock'
*/
@@ -237,4 +246,23 @@ struct ct_l4_proto {
struct ct_dpif_protoinfo *);
};
+/* Returns the index, in the range [0, EXP_LISTS), of the expiration list
+ * that the next inserted connection should be linked on.
+ *
+ * Every call bumps the shared 'ct->ct_next_list' counter by one and derives
+ * the index from the value the counter held before the increment, so
+ * consecutive calls hand out consecutive lists.
+ *
+ * Defined 'inline' because this lives in a header: a plain 'static'
+ * definition is reported as unused in every file that includes the header
+ * without calling it. */
+static inline unsigned int
+ct_next_list(struct conntrack *ct)
+{
+    unsigned int old;
+
+    atomic_add_relaxed(&ct->ct_next_list, 1u, &old);
+
+    return old % EXP_LISTS;
+}
+
+/* Links 'conn' at the front of the expiration list picked by
+ * ct_next_list().  The caller must hold 'ct->ct_lock'. */
+static inline void
+conn_expire_push_front(struct conntrack *ct, struct conn *conn)
+    OVS_REQUIRES(ct->ct_lock)
+{
+    struct rculist *exp_list = &ct->exp_lists[ct_next_list(ct)];
+
+    rculist_push_front(exp_list, &conn->node);
+}
+
#endif /* conntrack-private.h */
@@ -236,71 +236,30 @@ tm_to_ct_dpif_tp(enum ct_timeout tm)
return CT_DPIF_TP_ATTR_MAX;
}
-static void
-conn_update_expiration__(struct conntrack *ct, struct conn *conn,
- enum ct_timeout tm, long long now,
- uint32_t tp_value)
- OVS_REQUIRES(conn->lock)
-{
- ovs_mutex_unlock(&conn->lock);
-
- ovs_mutex_lock(&ct->ct_lock);
- ovs_mutex_lock(&conn->lock);
- if (!conn->cleaned) {
- conn->expiration = now + tp_value * 1000;
- ovs_list_remove(&conn->exp_node);
- ovs_list_push_back(&ct->exp_lists[tm], &conn->exp_node);
- }
- ovs_mutex_unlock(&conn->lock);
- ovs_mutex_unlock(&ct->ct_lock);
-
- ovs_mutex_lock(&conn->lock);
-}
-
+/* Recomputes 'conn->expiration' as 'now' plus the timeout for state 'tm'.
+ * The timeout comes from the timeout policy 'conn->tp_id' when that policy
+ * is found, and from 'ct_dpif_netdev_tp_def' otherwise. */
/* The conn entry lock must be held on entry and exit. */
void
conn_update_expiration(struct conntrack *ct, struct conn *conn,
enum ct_timeout tm, long long now)
- OVS_REQUIRES(conn->lock)
{
struct timeout_policy *tp;
uint32_t val;
- ovs_mutex_unlock(&conn->lock);
-
- ovs_mutex_lock(&ct->ct_lock);
- ovs_mutex_lock(&conn->lock);
tp = timeout_policy_lookup(ct, conn->tp_id);
if (tp) {
val = tp->policy.attrs[tm_to_ct_dpif_tp(tm)];
} else {
val = ct_dpif_netdev_tp_def[tm_to_ct_dpif_tp(tm)];
}
- ovs_mutex_unlock(&conn->lock);
- ovs_mutex_unlock(&ct->ct_lock);
-
- ovs_mutex_lock(&conn->lock);
VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d "
"val=%u sec.",
ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
- conn_update_expiration__(ct, conn, tm, now, val);
+ /* 'val' is in seconds.  Scale it in 64-bit arithmetic: a 32-bit
+ * 'val * 1000' wraps around for timeouts longer than ~49.7 days. */
+ conn->expiration = now + val * 1000LL;
}
-static void
-conn_init_expiration__(struct conntrack *ct, struct conn *conn,
- enum ct_timeout tm, long long now,
- uint32_t tp_value)
-{
- conn->expiration = now + tp_value * 1000;
- ovs_list_push_back(&ct->exp_lists[tm], &conn->exp_node);
-}
-
-/* ct_lock must be held. */
void
conn_init_expiration(struct conntrack *ct, struct conn *conn,
enum ct_timeout tm, long long now)
- OVS_REQUIRES(ct->ct_lock)
{
struct timeout_policy *tp;
uint32_t val;
@@ -315,5 +274,5 @@ conn_init_expiration(struct conntrack *ct, struct conn *conn,
VLOG_DBG_RL(&rl, "Init timeout %s zone=%u with policy id=%d val=%u sec.",
ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
- conn_init_expiration__(ct, conn, tm, now, val);
+ /* 'val' is in seconds.  Scale it in 64-bit arithmetic: a 32-bit
+ * 'val * 1000' wraps around for timeouts longer than ~49.7 days. */
+ conn->expiration = now + val * 1000LL;
}
@@ -39,12 +39,12 @@
#include "ovs-thread.h"
#include "openvswitch/poll-loop.h"
#include "random.h"
+#include "rculist.h"
#include "timeval.h"
VLOG_DEFINE_THIS_MODULE(conntrack);
COVERAGE_DEFINE(conntrack_full);
-COVERAGE_DEFINE(conntrack_long_cleanup);
COVERAGE_DEFINE(conntrack_l3csum_err);
COVERAGE_DEFINE(conntrack_l4csum_err);
COVERAGE_DEFINE(conntrack_lookup_natted_miss);
@@ -96,7 +96,6 @@ static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt,
uint32_t tp_id);
static void delete_conn_cmn(struct conn *);
static void delete_conn(struct conn *);
-static void delete_conn_one(struct conn *conn);
static enum ct_update_res conn_update(struct conntrack *ct, struct conn *conn,
struct dp_packet *pkt,
struct conn_lookup_ctx *ctx,
@@ -309,7 +308,7 @@ conntrack_init(void)
ovs_mutex_lock(&ct->ct_lock);
cmap_init(&ct->conns);
for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) {
- ovs_list_init(&ct->exp_lists[i]);
+ rculist_init(&ct->exp_lists[i]);
}
cmap_init(&ct->zone_limits);
ct->zone_limit_seq = 0;
@@ -319,6 +318,7 @@ conntrack_init(void)
atomic_count_init(&ct->n_conn, 0);
atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
atomic_init(&ct->tcp_seq_chk, true);
+ atomic_init(&ct->ct_next_list, 0);
latch_init(&ct->clean_thread_exit);
ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
ct->ipf = ipf_init();
@@ -468,7 +468,7 @@ zone_limit_delete(struct conntrack *ct, uint16_t zone)
static void
conn_clean_cmn(struct conntrack *ct, struct conn *conn)
- OVS_REQUIRES(ct->ct_lock)
+ OVS_REQUIRES(conn->lock, ct->ct_lock)
{
if (conn->alg) {
expectation_clean(ct, &conn->key);
@@ -487,32 +487,38 @@ conn_clean_cmn(struct conntrack *ct, struct conn *conn)
* removes the associated nat 'conn' from the lookup datastructures. */
static void
conn_clean(struct conntrack *ct, struct conn *conn)
- OVS_REQUIRES(ct->ct_lock)
+ OVS_EXCLUDED(conn->lock, ct->ct_lock)
{
ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
+ /* Only the first caller to set 'reclaimed' goes on to tear 'conn' down;
+ * any later caller returns here without touching it. */
+ if (atomic_flag_test_and_set(&conn->reclaimed)) {
+ return;
+ }
+
+ /* Both locks are taken by this function itself, 'conn->lock' before
+ * 'ct_lock'. */
+ ovs_mutex_lock(&conn->lock);
+
+ ovs_mutex_lock(&ct->ct_lock);
conn_clean_cmn(ct, conn);
if (conn->nat_conn) {
uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
}
- ovs_list_remove(&conn->exp_node);
- conn->cleaned = true;
+
+ /* Unlink from the expiration list while 'ct_lock' is still held. */
+ rculist_remove(&conn->node);
+ ovs_mutex_unlock(&ct->ct_lock);
+
ovsrcu_postpone(delete_conn, conn);
atomic_count_dec(&ct->n_conn);
+
+ ovs_mutex_unlock(&conn->lock);
}
+/* Sets the expiration time of 'conn' to 0 while holding 'conn->lock'.
+ * Nothing is unlinked or freed here.
+ *
+ * NOTE(review): presumably this makes conn_expired() report the connection
+ * as expired so that a later ct_sweep() pass cleans it up -- confirm
+ * against conn_expired(). */
static void
-conn_clean_one(struct conntrack *ct, struct conn *conn)
- OVS_REQUIRES(ct->ct_lock)
+conn_force_expire(struct conn *conn)
{
- conn_clean_cmn(ct, conn);
- if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
- ovs_list_remove(&conn->exp_node);
- conn->cleaned = true;
- atomic_count_dec(&ct->n_conn);
- }
- ovsrcu_postpone(delete_conn_one, conn);
+ ovs_mutex_lock(&conn->lock);
+ conn->expiration = 0;
+ ovs_mutex_unlock(&conn->lock);
}
/* Destroys the connection tracker 'ct' and frees all the allocated memory.
@@ -522,15 +528,16 @@ void
conntrack_destroy(struct conntrack *ct)
{
struct conn *conn;
+
latch_set(&ct->clean_thread_exit);
pthread_join(ct->clean_thread, NULL);
latch_destroy(&ct->clean_thread_exit);
- ovs_mutex_lock(&ct->ct_lock);
- CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
- conn_clean_one(ct, conn);
+ for (unsigned i = 0; i < EXP_LISTS; i++) {
+ RCULIST_FOR_EACH (conn, node, &ct->exp_lists[i]) {
+ conn_clean(ct, conn);
+ }
}
- cmap_destroy(&ct->conns);
struct zone_limit *zl;
CMAP_FOR_EACH (zl, node, &ct->zone_limits) {
@@ -539,7 +546,6 @@ conntrack_destroy(struct conntrack *ct)
cmap_remove(&ct->zone_limits, &zl->node, hash);
ovsrcu_postpone(free, zl);
}
- cmap_destroy(&ct->zone_limits);
struct timeout_policy *tp;
CMAP_FOR_EACH (tp, node, &ct->timeout_policies) {
@@ -548,6 +554,11 @@ conntrack_destroy(struct conntrack *ct)
cmap_remove(&ct->timeout_policies, &tp->node, hash);
ovsrcu_postpone(free, tp);
}
+
+ ovs_mutex_lock(&ct->ct_lock);
+
+ cmap_destroy(&ct->conns);
+ cmap_destroy(&ct->zone_limits);
cmap_destroy(&ct->timeout_policies);
ovs_mutex_unlock(&ct->ct_lock);
@@ -1087,7 +1098,9 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
nc->nat_conn = nat_conn;
ovs_mutex_init_adaptive(&nc->lock);
nc->conn_type = CT_CONN_TYPE_DEFAULT;
+ atomic_flag_clear(&nc->reclaimed);
cmap_insert(&ct->conns, &nc->cm_node, ctx->hash);
+ conn_expire_push_front(ct, nc);
atomic_count_inc(&ct->n_conn);
ctx->conn = nc; /* For completeness. */
if (zl) {
@@ -1108,7 +1121,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
* can limit DoS impact. */
nat_res_exhaustion:
free(nat_conn);
- ovs_list_remove(&nc->exp_node);
delete_conn_cmn(nc);
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
@@ -1148,11 +1160,9 @@ conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
pkt->md.ct_state = CS_INVALID;
break;
case CT_UPDATE_NEW:
- ovs_mutex_lock(&ct->ct_lock);
if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
- conn_clean(ct, conn);
+ conn_force_expire(conn);
}
- ovs_mutex_unlock(&ct->ct_lock);
create_new_conn = true;
break;
case CT_UPDATE_VALID_NEW:
@@ -1363,11 +1373,9 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
/* Delete found entry if in wrong direction. 'force' implies commit. */
if (OVS_UNLIKELY(force && ctx->reply && conn)) {
- ovs_mutex_lock(&ct->ct_lock);
if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
- conn_clean(ct, conn);
+ conn_force_expire(conn);
}
- ovs_mutex_unlock(&ct->ct_lock);
conn = NULL;
}
@@ -1553,39 +1561,21 @@ set_label(struct dp_packet *pkt, struct conn *conn,
* LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now',
* if 'limit' is reached */
static long long
-ct_sweep(struct conntrack *ct, long long now, size_t limit)
+ct_sweep(struct conntrack *ct, struct rculist *list, long long now)
+ OVS_NO_THREAD_SAFETY_ANALYSIS
{
struct conn *conn;
- long long min_expiration = LLONG_MAX;
size_t count = 0;
- ovs_mutex_lock(&ct->ct_lock);
-
- for (unsigned i = 0; i < N_CT_TM; i++) {
- LIST_FOR_EACH_SAFE (conn, exp_node, &ct->exp_lists[i]) {
- ovs_mutex_lock(&conn->lock);
- if (now < conn->expiration || count >= limit) {
- min_expiration = MIN(min_expiration, conn->expiration);
- ovs_mutex_unlock(&conn->lock);
- if (count >= limit) {
- /* Do not check other lists. */
- COVERAGE_INC(conntrack_long_cleanup);
- goto out;
- }
- break;
- } else {
- ovs_mutex_unlock(&conn->lock);
- conn_clean(ct, conn);
- }
- count++;
+ /* Walks the whole of 'list'.  Each connection that conn_expired() reports
+ * for 'now' is handed to conn_clean(); every connection visited is
+ * counted, whether it was cleaned or not.
+ *
+ * NOTE(review): the comment preceding this function still describes the
+ * previous 'limit' / earliest-expiration contract. */
+ RCULIST_FOR_EACH (conn, node, list) {
+ if (conn_expired(conn, now)) {
+ conn_clean(ct, conn);
+ }
+
+ count++;
}
-out:
- VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
- time_msec() - now);
- ovs_mutex_unlock(&ct->ct_lock);
- return min_expiration;
+ /* Number of connections visited on 'list', not the number cleaned. */
+ return count;
}
/* Cleans up old connection entries from 'ct'. Returns the time when the
@@ -1595,11 +1585,26 @@ out:
static long long
conntrack_clean(struct conntrack *ct, long long now)
{
- unsigned int n_conn_limit;
+ /* Value returned when the pass is not cut short: 'now' plus 30 * 1000. */
+ long long next_wakeup = now + 30 * 1000;
+ unsigned int n_conn_limit, i;
+ /* 'count' accumulates what ct_sweep() returns, so keep it as wide as the
+ * counter used there and print it with PRIuSIZE. */
+ size_t clean_end, count = 0;
+
atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
- size_t clean_max = n_conn_limit > 10 ? n_conn_limit / 10 : 1;
- long long min_exp = ct_sweep(ct, now, clean_max);
- long long next_wakeup = MIN(min_exp, now + CT_DPIF_NETDEV_TP_MIN);
+ /* Budget of visited connections for one pass. */
+ clean_end = n_conn_limit / 64;
+
+ /* Sweep the lists starting from where the previous pass stopped.  Once
+ * the budget is exceeded, stop and return 0 instead of the regular
+ * wakeup time. */
+ for (i = ct->next_sweep; i < EXP_LISTS; i++) {
+ count += ct_sweep(ct, &ct->exp_lists[i], now);
+
+ if (count > clean_end) {
+ next_wakeup = 0;
+ break;
+ }
+ }
+
+ /* Remember where to resume; start over after the last list.
+ * NOTE(review): after an early break, list 'i' has already been swept in
+ * full, yet the next pass starts from it again -- confirm this is
+ * intended. */
+ ct->next_sweep = (i < EXP_LISTS) ? i : 0;
+
+ VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
+ time_msec() - now);
return next_wakeup;
}
@@ -1628,6 +1633,7 @@ conntrack_clean(struct conntrack *ct, long long now)
static void *
clean_thread_main(void *f_)
+ OVS_NO_THREAD_SAFETY_ANALYSIS
{
struct conntrack *ct = f_;
@@ -2554,15 +2560,6 @@ delete_conn(struct conn *conn)
delete_conn_cmn(conn);
}
-/* Only used by conn_clean_one(). */
-static void
-delete_conn_one(struct conn *conn)
-{
- if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
- ovs_mutex_destroy(&conn->lock);
- }
- delete_conn_cmn(conn);
-}
/* Convert a conntrack address 'a' into an IP address 'b' based on 'dl_type'.
*
@@ -2714,6 +2711,11 @@ conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
}
struct conn *conn;
INIT_CONTAINER(conn, cm_node, cm_node);
+
+ if (conn_expired(conn, now)) {
+ continue;
+ }
+
if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
(conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
conn_to_ct_dpif_entry(conn, entry, now);
@@ -2735,13 +2737,15 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
{
struct conn *conn;
- ovs_mutex_lock(&ct->ct_lock);
CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
+ if (conn->conn_type != CT_CONN_TYPE_DEFAULT) {
+ continue;
+ }
+
if (!zone || *zone == conn->key.zone) {
- conn_clean_one(ct, conn);
+ conn_clean(ct, conn);
}
}
- ovs_mutex_unlock(&ct->ct_lock);
return 0;
}
@@ -2756,7 +2760,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
memset(&key, 0, sizeof(key));
tuple_to_conn_key(tuple, zone, &key);
- ovs_mutex_lock(&ct->ct_lock);
conn_lookup(ct, &key, time_msec(), &conn, NULL);
if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) {
@@ -2766,7 +2769,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
error = ENOENT;
}
- ovs_mutex_unlock(&ct->ct_lock);
return error;
}