Message ID | dc44dddc89b2b698f2de8201600f06b4a6dcfb09.1613557616.git.grive@u256.net |
---|---|
State | Changes Requested |
Headers | show |
Series | conntrack: improve multithread scalability | expand |
Hi Gaetan, Thanks for the patch, looks very useful. I haven't tested it yet. Minor question/comments inline. On Wed, Feb 17, 2021 at 8:34 AM Gaetan Rivet <grive@u256.net> wrote: > > Change the connection expiration lists from ovs_list to rculist. > This is a pre-step towards reducing the granularity of 'ct_lock'. > > Signed-off-by: Gaetan Rivet <grive@u256.net> > Reviewed-by: Eli Britstein <elibr@nvidia.com> > --- > lib/conntrack-private.h | 76 +++++++++++++++++++++++++++-------------- > lib/conntrack-tp.c | 60 +++++++++++++++++++++++++------- > lib/conntrack.c | 14 ++++---- > 3 files changed, 105 insertions(+), 45 deletions(-) > > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h > index e8332bdba..4b6f9eae3 100644 > --- a/lib/conntrack-private.h > +++ b/lib/conntrack-private.h > @@ -29,6 +29,7 @@ > #include "openvswitch/list.h" > #include "openvswitch/types.h" > #include "packets.h" > +#include "rculist.h" > #include "unaligned.h" > #include "dp-packet.h" > > @@ -86,17 +87,55 @@ struct alg_exp_node { > bool nat_rpl_dst; > }; > > +/* Timeouts: all the possible timeout states passed to update_expiration() > + * are listed here. The name will be prefix by CT_TM_ and the value is in > + * milliseconds */ > +#define CT_TIMEOUTS \ > + CT_TIMEOUT(TCP_FIRST_PACKET) \ > + CT_TIMEOUT(TCP_OPENING) \ > + CT_TIMEOUT(TCP_ESTABLISHED) \ > + CT_TIMEOUT(TCP_CLOSING) \ > + CT_TIMEOUT(TCP_FIN_WAIT) \ > + CT_TIMEOUT(TCP_CLOSED) \ > + CT_TIMEOUT(OTHER_FIRST) \ > + CT_TIMEOUT(OTHER_MULTIPLE) \ > + CT_TIMEOUT(OTHER_BIDIR) \ > + CT_TIMEOUT(ICMP_FIRST) \ > + CT_TIMEOUT(ICMP_REPLY) > + > +enum ct_timeout { > +#define CT_TIMEOUT(NAME) CT_TM_##NAME, > + CT_TIMEOUTS > +#undef CT_TIMEOUT > + N_CT_TM > +}; > + any reason for moving this macro to here? > enum OVS_PACKED_ENUM ct_conn_type { > CT_CONN_TYPE_DEFAULT, > CT_CONN_TYPE_UN_NAT, > }; > > +struct conn_expire { > + /* Set once when initializing the expiration node. */ > + struct conntrack *ct; > + /* Timeout state of the connection. > + * It follows the connection state updates. > + */ > + enum ct_timeout tm; > + /* Insert and remove the expiration node only once per RCU syncs. > + * If multiple threads update the connection, its expiration should > + * be removed only once and added only once to timeout lists. > + */ > + atomic_flag insert_once; > + atomic_flag remove_once; > + struct rculist node; > +}; > + > struct conn { > /* Immutable data. */ > struct conn_key key; > struct conn_key rev_key; > struct conn_key parent_key; /* Only used for orig_tuple support. */ > - struct ovs_list exp_node; > struct cmap_node cm_node; > struct nat_action_info_t *nat_info; > char *alg; > @@ -104,6 +143,7 @@ struct conn { > > /* Mutable data. */ > struct ovs_mutex lock; /* Guards all mutable fields. */ > + struct conn_expire exp; > ovs_u128 label; > long long expiration; > uint32_t mark; > @@ -132,33 +172,10 @@ enum ct_update_res { > CT_UPDATE_VALID_NEW, > }; > > -/* Timeouts: all the possible timeout states passed to update_expiration() > - * are listed here. The name will be prefix by CT_TM_ and the value is in > - * milliseconds */ > -#define CT_TIMEOUTS \ > - CT_TIMEOUT(TCP_FIRST_PACKET) \ > - CT_TIMEOUT(TCP_OPENING) \ > - CT_TIMEOUT(TCP_ESTABLISHED) \ > - CT_TIMEOUT(TCP_CLOSING) \ > - CT_TIMEOUT(TCP_FIN_WAIT) \ > - CT_TIMEOUT(TCP_CLOSED) \ > - CT_TIMEOUT(OTHER_FIRST) \ > - CT_TIMEOUT(OTHER_MULTIPLE) \ > - CT_TIMEOUT(OTHER_BIDIR) \ > - CT_TIMEOUT(ICMP_FIRST) \ > - CT_TIMEOUT(ICMP_REPLY) > - > -enum ct_timeout { > -#define CT_TIMEOUT(NAME) CT_TM_##NAME, > - CT_TIMEOUTS > -#undef CT_TIMEOUT > - N_CT_TM > -}; > - > struct conntrack { > struct ovs_mutex ct_lock; /* Protects 2 following fields. */ > struct cmap conns OVS_GUARDED; > - struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED; > + struct rculist exp_lists[N_CT_TM] OVS_GUARDED; > struct hmap zone_limits OVS_GUARDED; > struct hmap timeout_policies OVS_GUARDED; > uint32_t hash_basis; /* Salt for hashing a connection key. */ > @@ -204,4 +221,13 @@ struct ct_l4_proto { > struct ct_dpif_protoinfo *); > }; > > +static inline void > +conn_expire_remove(struct conn_expire *exp) > +{ > + if (!atomic_flag_test_and_set(&exp->remove_once) > + && rculist_next(&exp->node)) { > + rculist_remove(&exp->node); > + } > +} > + > #endif /* conntrack-private.h */ > diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c > index a586d3a8d..30ba4bda8 100644 > --- a/lib/conntrack-tp.c > +++ b/lib/conntrack-tp.c > @@ -230,6 +230,50 @@ tm_to_ct_dpif_tp(enum ct_timeout tm) > return CT_DPIF_TP_ATTR_MAX; > } > > +static void > +conn_expire_init(struct conn *conn, struct conntrack *ct) > +{ > + struct conn_expire *exp = &conn->exp; > + > + if (exp->ct != NULL) { > + return; > + } > + > + exp->ct = ct; > + atomic_flag_clear(&exp->insert_once); > + atomic_flag_clear(&exp->remove_once); > + /* The expiration is initially unscheduled, flag it as 'removed'. */ > + atomic_flag_test_and_set(&exp->remove_once); > +} > + > +static void > +conn_expire_insert(struct conn *conn) > +{ > + struct conn_expire *exp = &conn->exp; > + > + ovs_mutex_lock(&exp->ct->ct_lock); > + ovs_mutex_lock(&conn->lock); > + > + rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node); > + atomic_flag_clear(&exp->insert_once); > + atomic_flag_clear(&exp->remove_once); > + > + ovs_mutex_unlock(&conn->lock); > + ovs_mutex_unlock(&exp->ct->ct_lock); > +} > + > +static void > +conn_schedule_expiration(struct conntrack *ct, struct conn *conn, > + enum ct_timeout tm, long long now, uint32_t tp_value) > +{ > + conn_expire_init(conn, ct); > + conn->expiration = now + tp_value * 1000; > + conn->exp.tm = tm; > + if (!atomic_flag_test_and_set(&conn->exp.insert_once)) { Does this mean we only insert the oldest one, not the latest one? So pkt1 arrives at t1, pkt2 arrives later at t2, then we update using t1 > + ovsrcu_postpone(conn_expire_insert, conn); > + } > +} > + rest looks good to me William
On Tue, Feb 23, 2021, at 22:55, William Tu wrote: > Hi Gaetan, > > Thanks for the patch, looks very useful. > I haven't tested it yet. > Minor question/comments inline. Hi William, thanks for taking a look! [...] > > > > +/* Timeouts: all the possible timeout states passed to update_expiration() > > + * are listed here. The name will be prefix by CT_TM_ and the value is in > > + * milliseconds */ > > +#define CT_TIMEOUTS \ > > + CT_TIMEOUT(TCP_FIRST_PACKET) \ > > + CT_TIMEOUT(TCP_OPENING) \ > > + CT_TIMEOUT(TCP_ESTABLISHED) \ > > + CT_TIMEOUT(TCP_CLOSING) \ > > + CT_TIMEOUT(TCP_FIN_WAIT) \ > > + CT_TIMEOUT(TCP_CLOSED) \ > > + CT_TIMEOUT(OTHER_FIRST) \ > > + CT_TIMEOUT(OTHER_MULTIPLE) \ > > + CT_TIMEOUT(OTHER_BIDIR) \ > > + CT_TIMEOUT(ICMP_FIRST) \ > > + CT_TIMEOUT(ICMP_REPLY) > > + > > +enum ct_timeout { > > +#define CT_TIMEOUT(NAME) CT_TM_##NAME, > > + CT_TIMEOUTS > > +#undef CT_TIMEOUT > > + N_CT_TM > > +}; > > + > any reason for moving this macro to here? > I embed the enum ct_timeout within the conn expiration node, so I must have it defined first. I did not want to put too much code between 'ct_conn_type' below and the 'conn' struct itself, which is why I put 'ct_timeout' just above. Otherwise, this could be avoided by linking the rculist pointer directly instead of storing its index. But I thought it would be unwise as it would more tightly couple the list type to the expiration node, and it means taking a full pointer vs. storing an enum. > > enum OVS_PACKED_ENUM ct_conn_type { > > CT_CONN_TYPE_DEFAULT, > > CT_CONN_TYPE_UN_NAT, > > }; > > > > +struct conn_expire { > > + /* Set once when initializing the expiration node. */ > > + struct conntrack *ct; > > + /* Timeout state of the connection. > > + * It follows the connection state updates. > > + */ > > + enum ct_timeout tm; > > + /* Insert and remove the expiration node only once per RCU syncs. > > + * If multiple threads update the connection, its expiration should > > + * be removed only once and added only once to timeout lists. > > + */ > > + atomic_flag insert_once; > > + atomic_flag remove_once; > > + struct rculist node; > > +}; > > + > > struct conn { > > /* Immutable data. */ > > struct conn_key key; > > struct conn_key rev_key; > > struct conn_key parent_key; /* Only used for orig_tuple support. */ > > - struct ovs_list exp_node; > > struct cmap_node cm_node; > > struct nat_action_info_t *nat_info; > > char *alg; > > @@ -104,6 +143,7 @@ struct conn { > > > > /* Mutable data. */ > > struct ovs_mutex lock; /* Guards all mutable fields. */ > > + struct conn_expire exp; > > ovs_u128 label; > > long long expiration; > > uint32_t mark; > > @@ -132,33 +172,10 @@ enum ct_update_res { > > CT_UPDATE_VALID_NEW, > > }; > > > > -/* Timeouts: all the possible timeout states passed to update_expiration() > > - * are listed here. The name will be prefix by CT_TM_ and the value is in > > - * milliseconds */ > > -#define CT_TIMEOUTS \ > > - CT_TIMEOUT(TCP_FIRST_PACKET) \ > > - CT_TIMEOUT(TCP_OPENING) \ > > - CT_TIMEOUT(TCP_ESTABLISHED) \ > > - CT_TIMEOUT(TCP_CLOSING) \ > > - CT_TIMEOUT(TCP_FIN_WAIT) \ > > - CT_TIMEOUT(TCP_CLOSED) \ > > - CT_TIMEOUT(OTHER_FIRST) \ > > - CT_TIMEOUT(OTHER_MULTIPLE) \ > > - CT_TIMEOUT(OTHER_BIDIR) \ > > - CT_TIMEOUT(ICMP_FIRST) \ > > - CT_TIMEOUT(ICMP_REPLY) > > - > > -enum ct_timeout { > > -#define CT_TIMEOUT(NAME) CT_TM_##NAME, > > - CT_TIMEOUTS > > -#undef CT_TIMEOUT > > - N_CT_TM > > -}; > > - > > struct conntrack { > > struct ovs_mutex ct_lock; /* Protects 2 following fields. */ > > struct cmap conns OVS_GUARDED; > > - struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED; > > + struct rculist exp_lists[N_CT_TM] OVS_GUARDED; > > struct hmap zone_limits OVS_GUARDED; > > struct hmap timeout_policies OVS_GUARDED; > > uint32_t hash_basis; /* Salt for hashing a connection key. */ > > @@ -204,4 +221,13 @@ struct ct_l4_proto { > > struct ct_dpif_protoinfo *); > > }; > > > > +static inline void > > +conn_expire_remove(struct conn_expire *exp) > > +{ > > + if (!atomic_flag_test_and_set(&exp->remove_once) > > + && rculist_next(&exp->node)) { > > + rculist_remove(&exp->node); > > + } > > +} > > + > > #endif /* conntrack-private.h */ > > diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c > > index a586d3a8d..30ba4bda8 100644 > > --- a/lib/conntrack-tp.c > > +++ b/lib/conntrack-tp.c > > @@ -230,6 +230,50 @@ tm_to_ct_dpif_tp(enum ct_timeout tm) > > return CT_DPIF_TP_ATTR_MAX; > > } > > > > +static void > > +conn_expire_init(struct conn *conn, struct conntrack *ct) > > +{ > > + struct conn_expire *exp = &conn->exp; > > + > > + if (exp->ct != NULL) { > > + return; > > + } > > + > > + exp->ct = ct; > > + atomic_flag_clear(&exp->insert_once); > > + atomic_flag_clear(&exp->remove_once); > > + /* The expiration is initially unscheduled, flag it as 'removed'. */ > > + atomic_flag_test_and_set(&exp->remove_once); > > +} > > + > > +static void > > +conn_expire_insert(struct conn *conn) > > +{ > > + struct conn_expire *exp = &conn->exp; > > + > > + ovs_mutex_lock(&exp->ct->ct_lock); > > + ovs_mutex_lock(&conn->lock); > > + > > + rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node); > > + atomic_flag_clear(&exp->insert_once); > > + atomic_flag_clear(&exp->remove_once); > > + > > + ovs_mutex_unlock(&conn->lock); > > + ovs_mutex_unlock(&exp->ct->ct_lock); > > +} > > + > > +static void > > +conn_schedule_expiration(struct conntrack *ct, struct conn *conn, > > + enum ct_timeout tm, long long now, uint32_t tp_value) > > +{ > > + conn_expire_init(conn, ct); > > + conn->expiration = now + tp_value * 1000; > > + conn->exp.tm = tm; > > + if (!atomic_flag_test_and_set(&conn->exp.insert_once)) { > > Does this mean we only insert the oldest one, not the latest one? > So pkt1 arrives at t1, pkt2 arrives later at t2, then we update using t1 > The expiration node is flagged for insertion at pkt1, which means registering the insertion callback using RCU. Then pkt2 arrives and will update 'conn->exp.tm'. If the RCU did not yet execute its callbacks, next time it does the ct_timeout list will correspond to the latest packet received. If it did execute already, then the 'insert_once' flag was cleared and the insertion callback is re-scheduled. This avoids repeatedly scheduling insertions through the RCU. Connections could potentially be updated several times between RCU sync, for potentially many connections: it would mean making the cbset in RCU reallocate many times and grow unnecessarily. > > + ovsrcu_postpone(conn_expire_insert, conn); > > + } > > +} > > + > > rest looks good to me > > > William >
On Tue, Feb 23, 2021 at 4:34 PM Gaƫtan Rivet <grive@u256.net> wrote: > > > > On Tue, Feb 23, 2021, at 22:55, William Tu wrote: > > Hi Gaetan, > > > > Thanks for the patch, looks very useful. > > I haven't tested it yet. > > Minor question/comments inline. > > Hi William, thanks for taking a look! > > [...] > > > > > > +/* Timeouts: all the possible timeout states passed to update_expiration() > > > + * are listed here. The name will be prefix by CT_TM_ and the value is in > > > + * milliseconds */ > > > +#define CT_TIMEOUTS \ > > > + CT_TIMEOUT(TCP_FIRST_PACKET) \ > > > + CT_TIMEOUT(TCP_OPENING) \ > > > + CT_TIMEOUT(TCP_ESTABLISHED) \ > > > + CT_TIMEOUT(TCP_CLOSING) \ > > > + CT_TIMEOUT(TCP_FIN_WAIT) \ > > > + CT_TIMEOUT(TCP_CLOSED) \ > > > + CT_TIMEOUT(OTHER_FIRST) \ > > > + CT_TIMEOUT(OTHER_MULTIPLE) \ > > > + CT_TIMEOUT(OTHER_BIDIR) \ > > > + CT_TIMEOUT(ICMP_FIRST) \ > > > + CT_TIMEOUT(ICMP_REPLY) > > > + > > > +enum ct_timeout { > > > +#define CT_TIMEOUT(NAME) CT_TM_##NAME, > > > + CT_TIMEOUTS > > > +#undef CT_TIMEOUT > > > + N_CT_TM > > > +}; > > > + > > any reason for moving this macro to here? > > > > I embed the enum ct_timeout within the conn expiration node, so I must have it defined first. > I did not want to put too much code between 'ct_conn_type' below and the 'conn' struct itself, > which is why I put 'ct_timeout' just above. > > Otherwise, this could be avoided by linking the rculist pointer directly instead of storing its index. > But I thought it would be unwise as it would more tightly couple the list type to the expiration node, and it means taking a full pointer vs. storing an enum. > I see, make sense, thanks! > > > enum OVS_PACKED_ENUM ct_conn_type { > > > CT_CONN_TYPE_DEFAULT, > > > CT_CONN_TYPE_UN_NAT, > > > }; > > > > > > +struct conn_expire { > > > + /* Set once when initializing the expiration node. */ > > > + struct conntrack *ct; > > > + /* Timeout state of the connection. > > > + * It follows the connection state updates. > > > + */ > > > + enum ct_timeout tm; > > > + /* Insert and remove the expiration node only once per RCU syncs. > > > + * If multiple threads update the connection, its expiration should > > > + * be removed only once and added only once to timeout lists. > > > + */ > > > + atomic_flag insert_once; > > > + atomic_flag remove_once; > > > + struct rculist node; > > > +}; > > > + > > > struct conn { > > > /* Immutable data. */ > > > struct conn_key key; > > > struct conn_key rev_key; > > > struct conn_key parent_key; /* Only used for orig_tuple support. */ > > > - struct ovs_list exp_node; > > > struct cmap_node cm_node; > > > struct nat_action_info_t *nat_info; > > > char *alg; > > > @@ -104,6 +143,7 @@ struct conn { > > > > > > /* Mutable data. */ > > > struct ovs_mutex lock; /* Guards all mutable fields. */ > > > + struct conn_expire exp; > > > ovs_u128 label; > > > long long expiration; > > > uint32_t mark; > > > @@ -132,33 +172,10 @@ enum ct_update_res { > > > CT_UPDATE_VALID_NEW, > > > }; > > > > > > -/* Timeouts: all the possible timeout states passed to update_expiration() > > > - * are listed here. The name will be prefix by CT_TM_ and the value is in > > > - * milliseconds */ > > > -#define CT_TIMEOUTS \ > > > - CT_TIMEOUT(TCP_FIRST_PACKET) \ > > > - CT_TIMEOUT(TCP_OPENING) \ > > > - CT_TIMEOUT(TCP_ESTABLISHED) \ > > > - CT_TIMEOUT(TCP_CLOSING) \ > > > - CT_TIMEOUT(TCP_FIN_WAIT) \ > > > - CT_TIMEOUT(TCP_CLOSED) \ > > > - CT_TIMEOUT(OTHER_FIRST) \ > > > - CT_TIMEOUT(OTHER_MULTIPLE) \ > > > - CT_TIMEOUT(OTHER_BIDIR) \ > > > - CT_TIMEOUT(ICMP_FIRST) \ > > > - CT_TIMEOUT(ICMP_REPLY) > > > - > > > -enum ct_timeout { > > > -#define CT_TIMEOUT(NAME) CT_TM_##NAME, > > > - CT_TIMEOUTS > > > -#undef CT_TIMEOUT > > > - N_CT_TM > > > -}; > > > - > > > struct conntrack { > > > struct ovs_mutex ct_lock; /* Protects 2 following fields. */ > > > struct cmap conns OVS_GUARDED; > > > - struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED; > > > + struct rculist exp_lists[N_CT_TM] OVS_GUARDED; > > > struct hmap zone_limits OVS_GUARDED; > > > struct hmap timeout_policies OVS_GUARDED; > > > uint32_t hash_basis; /* Salt for hashing a connection key. */ > > > @@ -204,4 +221,13 @@ struct ct_l4_proto { > > > struct ct_dpif_protoinfo *); > > > }; > > > > > > +static inline void > > > +conn_expire_remove(struct conn_expire *exp) > > > +{ > > > + if (!atomic_flag_test_and_set(&exp->remove_once) > > > + && rculist_next(&exp->node)) { > > > + rculist_remove(&exp->node); > > > + } > > > +} > > > + > > > #endif /* conntrack-private.h */ > > > diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c > > > index a586d3a8d..30ba4bda8 100644 > > > --- a/lib/conntrack-tp.c > > > +++ b/lib/conntrack-tp.c > > > @@ -230,6 +230,50 @@ tm_to_ct_dpif_tp(enum ct_timeout tm) > > > return CT_DPIF_TP_ATTR_MAX; > > > } > > > > > > +static void > > > +conn_expire_init(struct conn *conn, struct conntrack *ct) > > > +{ > > > + struct conn_expire *exp = &conn->exp; > > > + > > > + if (exp->ct != NULL) { > > > + return; > > > + } > > > + > > > + exp->ct = ct; > > > + atomic_flag_clear(&exp->insert_once); > > > + atomic_flag_clear(&exp->remove_once); > > > + /* The expiration is initially unscheduled, flag it as 'removed'. */ > > > + atomic_flag_test_and_set(&exp->remove_once); > > > +} > > > + > > > +static void > > > +conn_expire_insert(struct conn *conn) > > > +{ > > > + struct conn_expire *exp = &conn->exp; > > > + > > > + ovs_mutex_lock(&exp->ct->ct_lock); > > > + ovs_mutex_lock(&conn->lock); > > > + > > > + rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node); > > > + atomic_flag_clear(&exp->insert_once); > > > + atomic_flag_clear(&exp->remove_once); > > > + > > > + ovs_mutex_unlock(&conn->lock); > > > + ovs_mutex_unlock(&exp->ct->ct_lock); > > > +} > > > + > > > +static void > > > +conn_schedule_expiration(struct conntrack *ct, struct conn *conn, > > > + enum ct_timeout tm, long long now, uint32_t tp_value) > > > +{ > > > + conn_expire_init(conn, ct); > > > + conn->expiration = now + tp_value * 1000; > > > + conn->exp.tm = tm; > > > + if (!atomic_flag_test_and_set(&conn->exp.insert_once)) { > > > > Does this mean we only insert the oldest one, not the latest one? > > So pkt1 arrives at t1, pkt2 arrives later at t2, then we update using t1 > > > > The expiration node is flagged for insertion at pkt1, which means registering the insertion callback using RCU. > Then pkt2 arrives and will update 'conn->exp.tm'. If the RCU did not yet execute its callbacks, next time it does the ct_timeout list will correspond to the latest packet received. If it did execute already, then the 'insert_once' flag was cleared and the insertion callback is re-scheduled. > > This avoids repeatedly scheduling insertions through the RCU. Connections could potentially be updated several times between RCU sync, for potentially many connections: it would mean making the cbset in RCU reallocate many times and grow unnecessarily. I see, looks good to me, thanks. William
diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index e8332bdba..4b6f9eae3 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -29,6 +29,7 @@ #include "openvswitch/list.h" #include "openvswitch/types.h" #include "packets.h" +#include "rculist.h" #include "unaligned.h" #include "dp-packet.h" @@ -86,17 +87,55 @@ struct alg_exp_node { bool nat_rpl_dst; }; +/* Timeouts: all the possible timeout states passed to update_expiration() + * are listed here. The name will be prefix by CT_TM_ and the value is in + * milliseconds */ +#define CT_TIMEOUTS \ + CT_TIMEOUT(TCP_FIRST_PACKET) \ + CT_TIMEOUT(TCP_OPENING) \ + CT_TIMEOUT(TCP_ESTABLISHED) \ + CT_TIMEOUT(TCP_CLOSING) \ + CT_TIMEOUT(TCP_FIN_WAIT) \ + CT_TIMEOUT(TCP_CLOSED) \ + CT_TIMEOUT(OTHER_FIRST) \ + CT_TIMEOUT(OTHER_MULTIPLE) \ + CT_TIMEOUT(OTHER_BIDIR) \ + CT_TIMEOUT(ICMP_FIRST) \ + CT_TIMEOUT(ICMP_REPLY) + +enum ct_timeout { +#define CT_TIMEOUT(NAME) CT_TM_##NAME, + CT_TIMEOUTS +#undef CT_TIMEOUT + N_CT_TM +}; + enum OVS_PACKED_ENUM ct_conn_type { CT_CONN_TYPE_DEFAULT, CT_CONN_TYPE_UN_NAT, }; +struct conn_expire { + /* Set once when initializing the expiration node. */ + struct conntrack *ct; + /* Timeout state of the connection. + * It follows the connection state updates. + */ + enum ct_timeout tm; + /* Insert and remove the expiration node only once per RCU syncs. + * If multiple threads update the connection, its expiration should + * be removed only once and added only once to timeout lists. + */ + atomic_flag insert_once; + atomic_flag remove_once; + struct rculist node; +}; + struct conn { /* Immutable data. */ struct conn_key key; struct conn_key rev_key; struct conn_key parent_key; /* Only used for orig_tuple support. */ - struct ovs_list exp_node; struct cmap_node cm_node; struct nat_action_info_t *nat_info; char *alg; @@ -104,6 +143,7 @@ struct conn { /* Mutable data. */ struct ovs_mutex lock; /* Guards all mutable fields. */ + struct conn_expire exp; ovs_u128 label; long long expiration; uint32_t mark; @@ -132,33 +172,10 @@ enum ct_update_res { CT_UPDATE_VALID_NEW, }; -/* Timeouts: all the possible timeout states passed to update_expiration() - * are listed here. The name will be prefix by CT_TM_ and the value is in - * milliseconds */ -#define CT_TIMEOUTS \ - CT_TIMEOUT(TCP_FIRST_PACKET) \ - CT_TIMEOUT(TCP_OPENING) \ - CT_TIMEOUT(TCP_ESTABLISHED) \ - CT_TIMEOUT(TCP_CLOSING) \ - CT_TIMEOUT(TCP_FIN_WAIT) \ - CT_TIMEOUT(TCP_CLOSED) \ - CT_TIMEOUT(OTHER_FIRST) \ - CT_TIMEOUT(OTHER_MULTIPLE) \ - CT_TIMEOUT(OTHER_BIDIR) \ - CT_TIMEOUT(ICMP_FIRST) \ - CT_TIMEOUT(ICMP_REPLY) - -enum ct_timeout { -#define CT_TIMEOUT(NAME) CT_TM_##NAME, - CT_TIMEOUTS -#undef CT_TIMEOUT - N_CT_TM -}; - struct conntrack { struct ovs_mutex ct_lock; /* Protects 2 following fields. */ struct cmap conns OVS_GUARDED; - struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED; + struct rculist exp_lists[N_CT_TM] OVS_GUARDED; struct hmap zone_limits OVS_GUARDED; struct hmap timeout_policies OVS_GUARDED; uint32_t hash_basis; /* Salt for hashing a connection key. */ @@ -204,4 +221,13 @@ struct ct_l4_proto { struct ct_dpif_protoinfo *); }; +static inline void +conn_expire_remove(struct conn_expire *exp) +{ + if (!atomic_flag_test_and_set(&exp->remove_once) + && rculist_next(&exp->node)) { + rculist_remove(&exp->node); + } +} + #endif /* conntrack-private.h */ diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c index a586d3a8d..30ba4bda8 100644 --- a/lib/conntrack-tp.c +++ b/lib/conntrack-tp.c @@ -230,6 +230,50 @@ tm_to_ct_dpif_tp(enum ct_timeout tm) return CT_DPIF_TP_ATTR_MAX; } +static void +conn_expire_init(struct conn *conn, struct conntrack *ct) +{ + struct conn_expire *exp = &conn->exp; + + if (exp->ct != NULL) { + return; + } + + exp->ct = ct; + atomic_flag_clear(&exp->insert_once); + atomic_flag_clear(&exp->remove_once); + /* The expiration is initially unscheduled, flag it as 'removed'. */ + atomic_flag_test_and_set(&exp->remove_once); +} + +static void +conn_expire_insert(struct conn *conn) +{ + struct conn_expire *exp = &conn->exp; + + ovs_mutex_lock(&exp->ct->ct_lock); + ovs_mutex_lock(&conn->lock); + + rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node); + atomic_flag_clear(&exp->insert_once); + atomic_flag_clear(&exp->remove_once); + + ovs_mutex_unlock(&conn->lock); + ovs_mutex_unlock(&exp->ct->ct_lock); +} + +static void +conn_schedule_expiration(struct conntrack *ct, struct conn *conn, + enum ct_timeout tm, long long now, uint32_t tp_value) +{ + conn_expire_init(conn, ct); + conn->expiration = now + tp_value * 1000; + conn->exp.tm = tm; + if (!atomic_flag_test_and_set(&conn->exp.insert_once)) { + ovsrcu_postpone(conn_expire_insert, conn); + } +} + static void conn_update_expiration__(struct conntrack *ct, struct conn *conn, enum ct_timeout tm, long long now, @@ -241,9 +285,8 @@ conn_update_expiration__(struct conntrack *ct, struct conn *conn, ovs_mutex_lock(&ct->ct_lock); ovs_mutex_lock(&conn->lock); if (!conn->cleaned) { - conn->expiration = now + tp_value * 1000; - ovs_list_remove(&conn->exp_node); - ovs_list_push_back(&ct->exp_lists[tm], &conn->exp_node); + conn_expire_remove(&conn->exp); + conn_schedule_expiration(ct, conn, tm, now, tp_value); } ovs_mutex_unlock(&conn->lock); ovs_mutex_unlock(&ct->ct_lock); @@ -281,15 +324,6 @@ conn_update_expiration(struct conntrack *ct, struct conn *conn, conn_update_expiration__(ct, conn, tm, now, val); } -static void -conn_init_expiration__(struct conntrack *ct, struct conn *conn, - enum ct_timeout tm, long long now, - uint32_t tp_value) -{ - conn->expiration = now + tp_value * 1000; - ovs_list_push_back(&ct->exp_lists[tm], &conn->exp_node); -} - /* ct_lock must be held. */ void conn_init_expiration(struct conntrack *ct, struct conn *conn, @@ -309,5 +343,5 @@ conn_init_expiration(struct conntrack *ct, struct conn *conn, VLOG_DBG_RL(&rl, "Init timeout %s zone=%u with policy id=%d val=%u sec.", ct_timeout_str[tm], conn->key.zone, conn->tp_id, val); - conn_init_expiration__(ct, conn, tm, now, val); + conn_schedule_expiration(ct, conn, tm, now, val); } diff --git a/lib/conntrack.c b/lib/conntrack.c index 99198a601..ac12f9196 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -301,7 +301,7 @@ conntrack_init(void) ovs_mutex_lock(&ct->ct_lock); cmap_init(&ct->conns); for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) { - ovs_list_init(&ct->exp_lists[i]); + rculist_init(&ct->exp_lists[i]); } hmap_init(&ct->zone_limits); ct->zone_limit_seq = 0; @@ -466,7 +466,7 @@ conn_clean(struct conntrack *ct, struct conn *conn) uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis); cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash); } - ovs_list_remove(&conn->exp_node); + conn_expire_remove(&conn->exp); conn->cleaned = true; ovsrcu_postpone(delete_conn, conn); atomic_count_dec(&ct->n_conn); @@ -478,7 +478,7 @@ conn_clean_one(struct conntrack *ct, struct conn *conn) { conn_clean_cmn(ct, conn); if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { - ovs_list_remove(&conn->exp_node); + conn_expire_remove(&conn->exp); conn->cleaned = true; atomic_count_dec(&ct->n_conn); } @@ -1075,8 +1075,8 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, * can limit DoS impact. */ nat_res_exhaustion: free(nat_conn); - ovs_list_remove(&nc->exp_node); - delete_conn_cmn(nc); + conn_expire_remove(&nc->exp); + ovsrcu_postpone(delete_conn_cmn, nc); static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - " "if DoS attack, use firewalling and/or zone partitioning."); @@ -1493,14 +1493,14 @@ set_label(struct dp_packet *pkt, struct conn *conn, static long long ct_sweep(struct conntrack *ct, long long now, size_t limit) { - struct conn *conn, *next; + struct conn *conn; long long min_expiration = LLONG_MAX; size_t count = 0; ovs_mutex_lock(&ct->ct_lock); for (unsigned i = 0; i < N_CT_TM; i++) { - LIST_FOR_EACH_SAFE (conn, next, exp_node, &ct->exp_lists[i]) { + RCULIST_FOR_EACH (conn, exp.node, &ct->exp_lists[i]) { ovs_mutex_lock(&conn->lock); if (now < conn->expiration || count >= limit) { min_expiration = MIN(min_expiration, conn->expiration);