diff mbox series

[ovs-dev,v1,1/9] conntrack: Use rcu-lists to store conn expirations

Message ID dc44dddc89b2b698f2de8201600f06b4a6dcfb09.1613557616.git.grive@u256.net
State Changes Requested
Headers show
Series conntrack: improve multithread scalability | expand

Commit Message

Gaetan Rivet Feb. 17, 2021, 4:34 p.m. UTC
Change the connection expiration lists from ovs_list to rculist.
This is a pre-step towards reducing the granularity of 'ct_lock'.

Signed-off-by: Gaetan Rivet <grive@u256.net>
Reviewed-by: Eli Britstein <elibr@nvidia.com>
---
 lib/conntrack-private.h | 76 +++++++++++++++++++++++++++--------------
 lib/conntrack-tp.c      | 60 +++++++++++++++++++++++++-------
 lib/conntrack.c         | 14 ++++----
 3 files changed, 105 insertions(+), 45 deletions(-)

Comments

William Tu Feb. 23, 2021, 10:55 p.m. UTC | #1
Hi Gaetan,

Thanks for the patch, looks very useful.
I haven't tested it yet.
Minor question/comments inline.

On Wed, Feb 17, 2021 at 8:34 AM Gaetan Rivet <grive@u256.net> wrote:
>
> Change the connection expiration lists from ovs_list to rculist.
> This is a pre-step towards reducing the granularity of 'ct_lock'.
>
> Signed-off-by: Gaetan Rivet <grive@u256.net>
> Reviewed-by: Eli Britstein <elibr@nvidia.com>
> ---
>  lib/conntrack-private.h | 76 +++++++++++++++++++++++++++--------------
>  lib/conntrack-tp.c      | 60 +++++++++++++++++++++++++-------
>  lib/conntrack.c         | 14 ++++----
>  3 files changed, 105 insertions(+), 45 deletions(-)
>
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> index e8332bdba..4b6f9eae3 100644
> --- a/lib/conntrack-private.h
> +++ b/lib/conntrack-private.h
> @@ -29,6 +29,7 @@
>  #include "openvswitch/list.h"
>  #include "openvswitch/types.h"
>  #include "packets.h"
> +#include "rculist.h"
>  #include "unaligned.h"
>  #include "dp-packet.h"
>
> @@ -86,17 +87,55 @@ struct alg_exp_node {
>      bool nat_rpl_dst;
>  };
>
> +/* Timeouts: all the possible timeout states passed to update_expiration()
> + * are listed here. The name will be prefix by CT_TM_ and the value is in
> + * milliseconds */
> +#define CT_TIMEOUTS \
> +    CT_TIMEOUT(TCP_FIRST_PACKET) \
> +    CT_TIMEOUT(TCP_OPENING) \
> +    CT_TIMEOUT(TCP_ESTABLISHED) \
> +    CT_TIMEOUT(TCP_CLOSING) \
> +    CT_TIMEOUT(TCP_FIN_WAIT) \
> +    CT_TIMEOUT(TCP_CLOSED) \
> +    CT_TIMEOUT(OTHER_FIRST) \
> +    CT_TIMEOUT(OTHER_MULTIPLE) \
> +    CT_TIMEOUT(OTHER_BIDIR) \
> +    CT_TIMEOUT(ICMP_FIRST) \
> +    CT_TIMEOUT(ICMP_REPLY)
> +
> +enum ct_timeout {
> +#define CT_TIMEOUT(NAME) CT_TM_##NAME,
> +    CT_TIMEOUTS
> +#undef CT_TIMEOUT
> +    N_CT_TM
> +};
> +
any reason for moving this macro to here?

>  enum OVS_PACKED_ENUM ct_conn_type {
>      CT_CONN_TYPE_DEFAULT,
>      CT_CONN_TYPE_UN_NAT,
>  };
>
> +struct conn_expire {
> +    /* Set once when initializing the expiration node. */
> +    struct conntrack *ct;
> +    /* Timeout state of the connection.
> +     * It follows the connection state updates.
> +     */
> +    enum ct_timeout tm;
> +    /* Insert and remove the expiration node only once per RCU syncs.
> +     * If multiple threads update the connection, its expiration should
> +     * be removed only once and added only once to timeout lists.
> +     */
> +    atomic_flag insert_once;
> +    atomic_flag remove_once;
> +    struct rculist node;
> +};
> +
>  struct conn {
>      /* Immutable data. */
>      struct conn_key key;
>      struct conn_key rev_key;
>      struct conn_key parent_key; /* Only used for orig_tuple support. */
> -    struct ovs_list exp_node;
>      struct cmap_node cm_node;
>      struct nat_action_info_t *nat_info;
>      char *alg;
> @@ -104,6 +143,7 @@ struct conn {
>
>      /* Mutable data. */
>      struct ovs_mutex lock; /* Guards all mutable fields. */
> +    struct conn_expire exp;
>      ovs_u128 label;
>      long long expiration;
>      uint32_t mark;
> @@ -132,33 +172,10 @@ enum ct_update_res {
>      CT_UPDATE_VALID_NEW,
>  };
>
> -/* Timeouts: all the possible timeout states passed to update_expiration()
> - * are listed here. The name will be prefix by CT_TM_ and the value is in
> - * milliseconds */
> -#define CT_TIMEOUTS \
> -    CT_TIMEOUT(TCP_FIRST_PACKET) \
> -    CT_TIMEOUT(TCP_OPENING) \
> -    CT_TIMEOUT(TCP_ESTABLISHED) \
> -    CT_TIMEOUT(TCP_CLOSING) \
> -    CT_TIMEOUT(TCP_FIN_WAIT) \
> -    CT_TIMEOUT(TCP_CLOSED) \
> -    CT_TIMEOUT(OTHER_FIRST) \
> -    CT_TIMEOUT(OTHER_MULTIPLE) \
> -    CT_TIMEOUT(OTHER_BIDIR) \
> -    CT_TIMEOUT(ICMP_FIRST) \
> -    CT_TIMEOUT(ICMP_REPLY)
> -
> -enum ct_timeout {
> -#define CT_TIMEOUT(NAME) CT_TM_##NAME,
> -    CT_TIMEOUTS
> -#undef CT_TIMEOUT
> -    N_CT_TM
> -};
> -
>  struct conntrack {
>      struct ovs_mutex ct_lock; /* Protects 2 following fields. */
>      struct cmap conns OVS_GUARDED;
> -    struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
> +    struct rculist exp_lists[N_CT_TM] OVS_GUARDED;
>      struct hmap zone_limits OVS_GUARDED;
>      struct hmap timeout_policies OVS_GUARDED;
>      uint32_t hash_basis; /* Salt for hashing a connection key. */
> @@ -204,4 +221,13 @@ struct ct_l4_proto {
>                                 struct ct_dpif_protoinfo *);
>  };
>
> +static inline void
> +conn_expire_remove(struct conn_expire *exp)
> +{
> +    if (!atomic_flag_test_and_set(&exp->remove_once)
> +        && rculist_next(&exp->node)) {
> +        rculist_remove(&exp->node);
> +    }
> +}
> +
>  #endif /* conntrack-private.h */
> diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
> index a586d3a8d..30ba4bda8 100644
> --- a/lib/conntrack-tp.c
> +++ b/lib/conntrack-tp.c
> @@ -230,6 +230,50 @@ tm_to_ct_dpif_tp(enum ct_timeout tm)
>      return CT_DPIF_TP_ATTR_MAX;
>  }
>
> +static void
> +conn_expire_init(struct conn *conn, struct conntrack *ct)
> +{
> +    struct conn_expire *exp = &conn->exp;
> +
> +    if (exp->ct != NULL) {
> +        return;
> +    }
> +
> +    exp->ct = ct;
> +    atomic_flag_clear(&exp->insert_once);
> +    atomic_flag_clear(&exp->remove_once);
> +    /* The expiration is initially unscheduled, flag it as 'removed'. */
> +    atomic_flag_test_and_set(&exp->remove_once);
> +}
> +
> +static void
> +conn_expire_insert(struct conn *conn)
> +{
> +    struct conn_expire *exp = &conn->exp;
> +
> +    ovs_mutex_lock(&exp->ct->ct_lock);
> +    ovs_mutex_lock(&conn->lock);
> +
> +    rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node);
> +    atomic_flag_clear(&exp->insert_once);
> +    atomic_flag_clear(&exp->remove_once);
> +
> +    ovs_mutex_unlock(&conn->lock);
> +    ovs_mutex_unlock(&exp->ct->ct_lock);
> +}
> +
> +static void
> +conn_schedule_expiration(struct conntrack *ct, struct conn *conn,
> +                         enum ct_timeout tm, long long now, uint32_t tp_value)
> +{
> +    conn_expire_init(conn, ct);
> +    conn->expiration = now + tp_value * 1000;
> +    conn->exp.tm = tm;
> +    if (!atomic_flag_test_and_set(&conn->exp.insert_once)) {

Does this mean we only insert the oldest one, not the latest one?
So pkt1 arrives at t1, pkt2 arrives later at t2, then we update using t1

> +        ovsrcu_postpone(conn_expire_insert, conn);
> +    }
> +}
> +

rest looks good to me


William
Gaetan Rivet Feb. 24, 2021, 12:33 a.m. UTC | #2
On Tue, Feb 23, 2021, at 22:55, William Tu wrote:
> Hi Gaetan,
> 
> Thanks for the patch, looks very useful.
> I haven't tested it yet.
> Minor question/comments inline.

Hi William, thanks for taking a look!

[...]
> >
> > +/* Timeouts: all the possible timeout states passed to update_expiration()
> > + * are listed here. The name will be prefix by CT_TM_ and the value is in
> > + * milliseconds */
> > +#define CT_TIMEOUTS \
> > +    CT_TIMEOUT(TCP_FIRST_PACKET) \
> > +    CT_TIMEOUT(TCP_OPENING) \
> > +    CT_TIMEOUT(TCP_ESTABLISHED) \
> > +    CT_TIMEOUT(TCP_CLOSING) \
> > +    CT_TIMEOUT(TCP_FIN_WAIT) \
> > +    CT_TIMEOUT(TCP_CLOSED) \
> > +    CT_TIMEOUT(OTHER_FIRST) \
> > +    CT_TIMEOUT(OTHER_MULTIPLE) \
> > +    CT_TIMEOUT(OTHER_BIDIR) \
> > +    CT_TIMEOUT(ICMP_FIRST) \
> > +    CT_TIMEOUT(ICMP_REPLY)
> > +
> > +enum ct_timeout {
> > +#define CT_TIMEOUT(NAME) CT_TM_##NAME,
> > +    CT_TIMEOUTS
> > +#undef CT_TIMEOUT
> > +    N_CT_TM
> > +};
> > +
> any reason for moving this macro to here?
> 

I embed the enum ct_timeout within the conn expiration node, so I must have it defined first.
I did not want to put too much code between 'ct_conn_type' below and the 'conn' struct itself,
which is why I put 'ct_timeout' just above.

Otherwise, this could be avoided by linking the rculist pointer directly instead of storing its index.
But I thought it would be unwise as it would more tightly couple the list type to the expiration node, and it means taking a full pointer vs. storing an enum.

> >  enum OVS_PACKED_ENUM ct_conn_type {
> >      CT_CONN_TYPE_DEFAULT,
> >      CT_CONN_TYPE_UN_NAT,
> >  };
> >
> > +struct conn_expire {
> > +    /* Set once when initializing the expiration node. */
> > +    struct conntrack *ct;
> > +    /* Timeout state of the connection.
> > +     * It follows the connection state updates.
> > +     */
> > +    enum ct_timeout tm;
> > +    /* Insert and remove the expiration node only once per RCU syncs.
> > +     * If multiple threads update the connection, its expiration should
> > +     * be removed only once and added only once to timeout lists.
> > +     */
> > +    atomic_flag insert_once;
> > +    atomic_flag remove_once;
> > +    struct rculist node;
> > +};
> > +
> >  struct conn {
> >      /* Immutable data. */
> >      struct conn_key key;
> >      struct conn_key rev_key;
> >      struct conn_key parent_key; /* Only used for orig_tuple support. */
> > -    struct ovs_list exp_node;
> >      struct cmap_node cm_node;
> >      struct nat_action_info_t *nat_info;
> >      char *alg;
> > @@ -104,6 +143,7 @@ struct conn {
> >
> >      /* Mutable data. */
> >      struct ovs_mutex lock; /* Guards all mutable fields. */
> > +    struct conn_expire exp;
> >      ovs_u128 label;
> >      long long expiration;
> >      uint32_t mark;
> > @@ -132,33 +172,10 @@ enum ct_update_res {
> >      CT_UPDATE_VALID_NEW,
> >  };
> >
> > -/* Timeouts: all the possible timeout states passed to update_expiration()
> > - * are listed here. The name will be prefix by CT_TM_ and the value is in
> > - * milliseconds */
> > -#define CT_TIMEOUTS \
> > -    CT_TIMEOUT(TCP_FIRST_PACKET) \
> > -    CT_TIMEOUT(TCP_OPENING) \
> > -    CT_TIMEOUT(TCP_ESTABLISHED) \
> > -    CT_TIMEOUT(TCP_CLOSING) \
> > -    CT_TIMEOUT(TCP_FIN_WAIT) \
> > -    CT_TIMEOUT(TCP_CLOSED) \
> > -    CT_TIMEOUT(OTHER_FIRST) \
> > -    CT_TIMEOUT(OTHER_MULTIPLE) \
> > -    CT_TIMEOUT(OTHER_BIDIR) \
> > -    CT_TIMEOUT(ICMP_FIRST) \
> > -    CT_TIMEOUT(ICMP_REPLY)
> > -
> > -enum ct_timeout {
> > -#define CT_TIMEOUT(NAME) CT_TM_##NAME,
> > -    CT_TIMEOUTS
> > -#undef CT_TIMEOUT
> > -    N_CT_TM
> > -};
> > -
> >  struct conntrack {
> >      struct ovs_mutex ct_lock; /* Protects 2 following fields. */
> >      struct cmap conns OVS_GUARDED;
> > -    struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
> > +    struct rculist exp_lists[N_CT_TM] OVS_GUARDED;
> >      struct hmap zone_limits OVS_GUARDED;
> >      struct hmap timeout_policies OVS_GUARDED;
> >      uint32_t hash_basis; /* Salt for hashing a connection key. */
> > @@ -204,4 +221,13 @@ struct ct_l4_proto {
> >                                 struct ct_dpif_protoinfo *);
> >  };
> >
> > +static inline void
> > +conn_expire_remove(struct conn_expire *exp)
> > +{
> > +    if (!atomic_flag_test_and_set(&exp->remove_once)
> > +        && rculist_next(&exp->node)) {
> > +        rculist_remove(&exp->node);
> > +    }
> > +}
> > +
> >  #endif /* conntrack-private.h */
> > diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
> > index a586d3a8d..30ba4bda8 100644
> > --- a/lib/conntrack-tp.c
> > +++ b/lib/conntrack-tp.c
> > @@ -230,6 +230,50 @@ tm_to_ct_dpif_tp(enum ct_timeout tm)
> >      return CT_DPIF_TP_ATTR_MAX;
> >  }
> >
> > +static void
> > +conn_expire_init(struct conn *conn, struct conntrack *ct)
> > +{
> > +    struct conn_expire *exp = &conn->exp;
> > +
> > +    if (exp->ct != NULL) {
> > +        return;
> > +    }
> > +
> > +    exp->ct = ct;
> > +    atomic_flag_clear(&exp->insert_once);
> > +    atomic_flag_clear(&exp->remove_once);
> > +    /* The expiration is initially unscheduled, flag it as 'removed'. */
> > +    atomic_flag_test_and_set(&exp->remove_once);
> > +}
> > +
> > +static void
> > +conn_expire_insert(struct conn *conn)
> > +{
> > +    struct conn_expire *exp = &conn->exp;
> > +
> > +    ovs_mutex_lock(&exp->ct->ct_lock);
> > +    ovs_mutex_lock(&conn->lock);
> > +
> > +    rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node);
> > +    atomic_flag_clear(&exp->insert_once);
> > +    atomic_flag_clear(&exp->remove_once);
> > +
> > +    ovs_mutex_unlock(&conn->lock);
> > +    ovs_mutex_unlock(&exp->ct->ct_lock);
> > +}
> > +
> > +static void
> > +conn_schedule_expiration(struct conntrack *ct, struct conn *conn,
> > +                         enum ct_timeout tm, long long now, uint32_t tp_value)
> > +{
> > +    conn_expire_init(conn, ct);
> > +    conn->expiration = now + tp_value * 1000;
> > +    conn->exp.tm = tm;
> > +    if (!atomic_flag_test_and_set(&conn->exp.insert_once)) {
> 
> Does this mean we only insert the oldest one, not the latest one?
> So pkt1 arrives at t1, pkt2 arrives later at t2, then we update using t1
> 

The expiration node is flagged for insertion at pkt1, which means registering the insertion callback using RCU.
Then pkt2 arrives and will update 'conn->exp.tm'. If the RCU did not yet execute its callbacks, next time it does the ct_timeout list will correspond to the latest packet received. If it did execute already, then the 'insert_once' flag was cleared and the insertion callback is re-scheduled.

This avoids repeatedly scheduling insertions through the RCU. Connections could potentially be updated several times between RCU sync, for potentially many connections: it would mean making the cbset in RCU reallocate many times and grow unnecessarily.

> > +        ovsrcu_postpone(conn_expire_insert, conn);
> > +    }
> > +}
> > +
> 
> rest looks good to me
> 
> 
> William
>
William Tu Feb. 24, 2021, 4:55 p.m. UTC | #3
On Tue, Feb 23, 2021 at 4:34 PM Gaƫtan Rivet <grive@u256.net> wrote:
>
>
>
> On Tue, Feb 23, 2021, at 22:55, William Tu wrote:
> > Hi Gaetan,
> >
> > Thanks for the patch, looks very useful.
> > I haven't tested it yet.
> > Minor question/comments inline.
>
> Hi William, thanks for taking a look!
>
> [...]
> > >
> > > +/* Timeouts: all the possible timeout states passed to update_expiration()
> > > + * are listed here. The name will be prefix by CT_TM_ and the value is in
> > > + * milliseconds */
> > > +#define CT_TIMEOUTS \
> > > +    CT_TIMEOUT(TCP_FIRST_PACKET) \
> > > +    CT_TIMEOUT(TCP_OPENING) \
> > > +    CT_TIMEOUT(TCP_ESTABLISHED) \
> > > +    CT_TIMEOUT(TCP_CLOSING) \
> > > +    CT_TIMEOUT(TCP_FIN_WAIT) \
> > > +    CT_TIMEOUT(TCP_CLOSED) \
> > > +    CT_TIMEOUT(OTHER_FIRST) \
> > > +    CT_TIMEOUT(OTHER_MULTIPLE) \
> > > +    CT_TIMEOUT(OTHER_BIDIR) \
> > > +    CT_TIMEOUT(ICMP_FIRST) \
> > > +    CT_TIMEOUT(ICMP_REPLY)
> > > +
> > > +enum ct_timeout {
> > > +#define CT_TIMEOUT(NAME) CT_TM_##NAME,
> > > +    CT_TIMEOUTS
> > > +#undef CT_TIMEOUT
> > > +    N_CT_TM
> > > +};
> > > +
> > any reason for moving this macro to here?
> >
>
> I embed the enum ct_timeout within the conn expiration node, so I must have it defined first.
> I did not want to put too much code between 'ct_conn_type' below and the 'conn' struct itself,
> which is why I put 'ct_timeout' just above.
>
> Otherwise, this could be avoided by linking the rculist pointer directly instead of storing its index.
> But I thought it would be unwise as it would more tightly couple the list type to the expiration node, and it means taking a full pointer vs. storing an enum.
>

I see, make sense, thanks!

> > >  enum OVS_PACKED_ENUM ct_conn_type {
> > >      CT_CONN_TYPE_DEFAULT,
> > >      CT_CONN_TYPE_UN_NAT,
> > >  };
> > >
> > > +struct conn_expire {
> > > +    /* Set once when initializing the expiration node. */
> > > +    struct conntrack *ct;
> > > +    /* Timeout state of the connection.
> > > +     * It follows the connection state updates.
> > > +     */
> > > +    enum ct_timeout tm;
> > > +    /* Insert and remove the expiration node only once per RCU syncs.
> > > +     * If multiple threads update the connection, its expiration should
> > > +     * be removed only once and added only once to timeout lists.
> > > +     */
> > > +    atomic_flag insert_once;
> > > +    atomic_flag remove_once;
> > > +    struct rculist node;
> > > +};
> > > +
> > >  struct conn {
> > >      /* Immutable data. */
> > >      struct conn_key key;
> > >      struct conn_key rev_key;
> > >      struct conn_key parent_key; /* Only used for orig_tuple support. */
> > > -    struct ovs_list exp_node;
> > >      struct cmap_node cm_node;
> > >      struct nat_action_info_t *nat_info;
> > >      char *alg;
> > > @@ -104,6 +143,7 @@ struct conn {
> > >
> > >      /* Mutable data. */
> > >      struct ovs_mutex lock; /* Guards all mutable fields. */
> > > +    struct conn_expire exp;
> > >      ovs_u128 label;
> > >      long long expiration;
> > >      uint32_t mark;
> > > @@ -132,33 +172,10 @@ enum ct_update_res {
> > >      CT_UPDATE_VALID_NEW,
> > >  };
> > >
> > > -/* Timeouts: all the possible timeout states passed to update_expiration()
> > > - * are listed here. The name will be prefix by CT_TM_ and the value is in
> > > - * milliseconds */
> > > -#define CT_TIMEOUTS \
> > > -    CT_TIMEOUT(TCP_FIRST_PACKET) \
> > > -    CT_TIMEOUT(TCP_OPENING) \
> > > -    CT_TIMEOUT(TCP_ESTABLISHED) \
> > > -    CT_TIMEOUT(TCP_CLOSING) \
> > > -    CT_TIMEOUT(TCP_FIN_WAIT) \
> > > -    CT_TIMEOUT(TCP_CLOSED) \
> > > -    CT_TIMEOUT(OTHER_FIRST) \
> > > -    CT_TIMEOUT(OTHER_MULTIPLE) \
> > > -    CT_TIMEOUT(OTHER_BIDIR) \
> > > -    CT_TIMEOUT(ICMP_FIRST) \
> > > -    CT_TIMEOUT(ICMP_REPLY)
> > > -
> > > -enum ct_timeout {
> > > -#define CT_TIMEOUT(NAME) CT_TM_##NAME,
> > > -    CT_TIMEOUTS
> > > -#undef CT_TIMEOUT
> > > -    N_CT_TM
> > > -};
> > > -
> > >  struct conntrack {
> > >      struct ovs_mutex ct_lock; /* Protects 2 following fields. */
> > >      struct cmap conns OVS_GUARDED;
> > > -    struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
> > > +    struct rculist exp_lists[N_CT_TM] OVS_GUARDED;
> > >      struct hmap zone_limits OVS_GUARDED;
> > >      struct hmap timeout_policies OVS_GUARDED;
> > >      uint32_t hash_basis; /* Salt for hashing a connection key. */
> > > @@ -204,4 +221,13 @@ struct ct_l4_proto {
> > >                                 struct ct_dpif_protoinfo *);
> > >  };
> > >
> > > +static inline void
> > > +conn_expire_remove(struct conn_expire *exp)
> > > +{
> > > +    if (!atomic_flag_test_and_set(&exp->remove_once)
> > > +        && rculist_next(&exp->node)) {
> > > +        rculist_remove(&exp->node);
> > > +    }
> > > +}
> > > +
> > >  #endif /* conntrack-private.h */
> > > diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
> > > index a586d3a8d..30ba4bda8 100644
> > > --- a/lib/conntrack-tp.c
> > > +++ b/lib/conntrack-tp.c
> > > @@ -230,6 +230,50 @@ tm_to_ct_dpif_tp(enum ct_timeout tm)
> > >      return CT_DPIF_TP_ATTR_MAX;
> > >  }
> > >
> > > +static void
> > > +conn_expire_init(struct conn *conn, struct conntrack *ct)
> > > +{
> > > +    struct conn_expire *exp = &conn->exp;
> > > +
> > > +    if (exp->ct != NULL) {
> > > +        return;
> > > +    }
> > > +
> > > +    exp->ct = ct;
> > > +    atomic_flag_clear(&exp->insert_once);
> > > +    atomic_flag_clear(&exp->remove_once);
> > > +    /* The expiration is initially unscheduled, flag it as 'removed'. */
> > > +    atomic_flag_test_and_set(&exp->remove_once);
> > > +}
> > > +
> > > +static void
> > > +conn_expire_insert(struct conn *conn)
> > > +{
> > > +    struct conn_expire *exp = &conn->exp;
> > > +
> > > +    ovs_mutex_lock(&exp->ct->ct_lock);
> > > +    ovs_mutex_lock(&conn->lock);
> > > +
> > > +    rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node);
> > > +    atomic_flag_clear(&exp->insert_once);
> > > +    atomic_flag_clear(&exp->remove_once);
> > > +
> > > +    ovs_mutex_unlock(&conn->lock);
> > > +    ovs_mutex_unlock(&exp->ct->ct_lock);
> > > +}
> > > +
> > > +static void
> > > +conn_schedule_expiration(struct conntrack *ct, struct conn *conn,
> > > +                         enum ct_timeout tm, long long now, uint32_t tp_value)
> > > +{
> > > +    conn_expire_init(conn, ct);
> > > +    conn->expiration = now + tp_value * 1000;
> > > +    conn->exp.tm = tm;
> > > +    if (!atomic_flag_test_and_set(&conn->exp.insert_once)) {
> >
> > Does this mean we only insert the oldest one, not the latest one?
> > So pkt1 arrives at t1, pkt2 arrives later at t2, then we update using t1
> >
>
> The expiration node is flagged for insertion at pkt1, which means registering the insertion callback using RCU.
> Then pkt2 arrives and will update 'conn->exp.tm'. If the RCU did not yet execute its callbacks, next time it does the ct_timeout list will correspond to the latest packet received. If it did execute already, then the 'insert_once' flag was cleared and the insertion callback is re-scheduled.
>
> This avoids repeatedly scheduling insertions through the RCU. Connections could potentially be updated several times between RCU sync, for potentially many connections: it would mean making the cbset in RCU reallocate many times and grow unnecessarily.

I see, looks good to me, thanks.
William
diff mbox series

Patch

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index e8332bdba..4b6f9eae3 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -29,6 +29,7 @@ 
 #include "openvswitch/list.h"
 #include "openvswitch/types.h"
 #include "packets.h"
+#include "rculist.h"
 #include "unaligned.h"
 #include "dp-packet.h"
 
@@ -86,17 +87,55 @@  struct alg_exp_node {
     bool nat_rpl_dst;
 };
 
+/* Timeouts: all the possible timeout states passed to update_expiration()
+ * are listed here. The name will be prefix by CT_TM_ and the value is in
+ * milliseconds */
+#define CT_TIMEOUTS \
+    CT_TIMEOUT(TCP_FIRST_PACKET) \
+    CT_TIMEOUT(TCP_OPENING) \
+    CT_TIMEOUT(TCP_ESTABLISHED) \
+    CT_TIMEOUT(TCP_CLOSING) \
+    CT_TIMEOUT(TCP_FIN_WAIT) \
+    CT_TIMEOUT(TCP_CLOSED) \
+    CT_TIMEOUT(OTHER_FIRST) \
+    CT_TIMEOUT(OTHER_MULTIPLE) \
+    CT_TIMEOUT(OTHER_BIDIR) \
+    CT_TIMEOUT(ICMP_FIRST) \
+    CT_TIMEOUT(ICMP_REPLY)
+
+enum ct_timeout {
+#define CT_TIMEOUT(NAME) CT_TM_##NAME,
+    CT_TIMEOUTS
+#undef CT_TIMEOUT
+    N_CT_TM
+};
+
 enum OVS_PACKED_ENUM ct_conn_type {
     CT_CONN_TYPE_DEFAULT,
     CT_CONN_TYPE_UN_NAT,
 };
 
+struct conn_expire {
+    /* Set once when initializing the expiration node. */
+    struct conntrack *ct;
+    /* Timeout state of the connection.
+     * It follows the connection state updates.
+     */
+    enum ct_timeout tm;
+    /* Insert and remove the expiration node only once per RCU syncs.
+     * If multiple threads update the connection, its expiration should
+     * be removed only once and added only once to timeout lists.
+     */
+    atomic_flag insert_once;
+    atomic_flag remove_once;
+    struct rculist node;
+};
+
 struct conn {
     /* Immutable data. */
     struct conn_key key;
     struct conn_key rev_key;
     struct conn_key parent_key; /* Only used for orig_tuple support. */
-    struct ovs_list exp_node;
     struct cmap_node cm_node;
     struct nat_action_info_t *nat_info;
     char *alg;
@@ -104,6 +143,7 @@  struct conn {
 
     /* Mutable data. */
     struct ovs_mutex lock; /* Guards all mutable fields. */
+    struct conn_expire exp;
     ovs_u128 label;
     long long expiration;
     uint32_t mark;
@@ -132,33 +172,10 @@  enum ct_update_res {
     CT_UPDATE_VALID_NEW,
 };
 
-/* Timeouts: all the possible timeout states passed to update_expiration()
- * are listed here. The name will be prefix by CT_TM_ and the value is in
- * milliseconds */
-#define CT_TIMEOUTS \
-    CT_TIMEOUT(TCP_FIRST_PACKET) \
-    CT_TIMEOUT(TCP_OPENING) \
-    CT_TIMEOUT(TCP_ESTABLISHED) \
-    CT_TIMEOUT(TCP_CLOSING) \
-    CT_TIMEOUT(TCP_FIN_WAIT) \
-    CT_TIMEOUT(TCP_CLOSED) \
-    CT_TIMEOUT(OTHER_FIRST) \
-    CT_TIMEOUT(OTHER_MULTIPLE) \
-    CT_TIMEOUT(OTHER_BIDIR) \
-    CT_TIMEOUT(ICMP_FIRST) \
-    CT_TIMEOUT(ICMP_REPLY)
-
-enum ct_timeout {
-#define CT_TIMEOUT(NAME) CT_TM_##NAME,
-    CT_TIMEOUTS
-#undef CT_TIMEOUT
-    N_CT_TM
-};
-
 struct conntrack {
     struct ovs_mutex ct_lock; /* Protects 2 following fields. */
     struct cmap conns OVS_GUARDED;
-    struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
+    struct rculist exp_lists[N_CT_TM] OVS_GUARDED;
     struct hmap zone_limits OVS_GUARDED;
     struct hmap timeout_policies OVS_GUARDED;
     uint32_t hash_basis; /* Salt for hashing a connection key. */
@@ -204,4 +221,13 @@  struct ct_l4_proto {
                                struct ct_dpif_protoinfo *);
 };
 
+static inline void
+conn_expire_remove(struct conn_expire *exp)
+{
+    if (!atomic_flag_test_and_set(&exp->remove_once)
+        && rculist_next(&exp->node)) {
+        rculist_remove(&exp->node);
+    }
+}
+
 #endif /* conntrack-private.h */
diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
index a586d3a8d..30ba4bda8 100644
--- a/lib/conntrack-tp.c
+++ b/lib/conntrack-tp.c
@@ -230,6 +230,50 @@  tm_to_ct_dpif_tp(enum ct_timeout tm)
     return CT_DPIF_TP_ATTR_MAX;
 }
 
+static void
+conn_expire_init(struct conn *conn, struct conntrack *ct)
+{
+    struct conn_expire *exp = &conn->exp;
+
+    if (exp->ct != NULL) {
+        return;
+    }
+
+    exp->ct = ct;
+    atomic_flag_clear(&exp->insert_once);
+    atomic_flag_clear(&exp->remove_once);
+    /* The expiration is initially unscheduled, flag it as 'removed'. */
+    atomic_flag_test_and_set(&exp->remove_once);
+}
+
+static void
+conn_expire_insert(struct conn *conn)
+{
+    struct conn_expire *exp = &conn->exp;
+
+    ovs_mutex_lock(&exp->ct->ct_lock);
+    ovs_mutex_lock(&conn->lock);
+
+    rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node);
+    atomic_flag_clear(&exp->insert_once);
+    atomic_flag_clear(&exp->remove_once);
+
+    ovs_mutex_unlock(&conn->lock);
+    ovs_mutex_unlock(&exp->ct->ct_lock);
+}
+
+static void
+conn_schedule_expiration(struct conntrack *ct, struct conn *conn,
+                         enum ct_timeout tm, long long now, uint32_t tp_value)
+{
+    conn_expire_init(conn, ct);
+    conn->expiration = now + tp_value * 1000;
+    conn->exp.tm = tm;
+    if (!atomic_flag_test_and_set(&conn->exp.insert_once)) {
+        ovsrcu_postpone(conn_expire_insert, conn);
+    }
+}
+
 static void
 conn_update_expiration__(struct conntrack *ct, struct conn *conn,
                          enum ct_timeout tm, long long now,
@@ -241,9 +285,8 @@  conn_update_expiration__(struct conntrack *ct, struct conn *conn,
     ovs_mutex_lock(&ct->ct_lock);
     ovs_mutex_lock(&conn->lock);
     if (!conn->cleaned) {
-        conn->expiration = now + tp_value * 1000;
-        ovs_list_remove(&conn->exp_node);
-        ovs_list_push_back(&ct->exp_lists[tm], &conn->exp_node);
+        conn_expire_remove(&conn->exp);
+        conn_schedule_expiration(ct, conn, tm, now, tp_value);
     }
     ovs_mutex_unlock(&conn->lock);
     ovs_mutex_unlock(&ct->ct_lock);
@@ -281,15 +324,6 @@  conn_update_expiration(struct conntrack *ct, struct conn *conn,
     conn_update_expiration__(ct, conn, tm, now, val);
 }
 
-static void
-conn_init_expiration__(struct conntrack *ct, struct conn *conn,
-                       enum ct_timeout tm, long long now,
-                       uint32_t tp_value)
-{
-    conn->expiration = now + tp_value * 1000;
-    ovs_list_push_back(&ct->exp_lists[tm], &conn->exp_node);
-}
-
 /* ct_lock must be held. */
 void
 conn_init_expiration(struct conntrack *ct, struct conn *conn,
@@ -309,5 +343,5 @@  conn_init_expiration(struct conntrack *ct, struct conn *conn,
     VLOG_DBG_RL(&rl, "Init timeout %s zone=%u with policy id=%d val=%u sec.",
                 ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
 
-    conn_init_expiration__(ct, conn, tm, now, val);
+    conn_schedule_expiration(ct, conn, tm, now, val);
 }
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 99198a601..ac12f9196 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -301,7 +301,7 @@  conntrack_init(void)
     ovs_mutex_lock(&ct->ct_lock);
     cmap_init(&ct->conns);
     for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) {
-        ovs_list_init(&ct->exp_lists[i]);
+        rculist_init(&ct->exp_lists[i]);
     }
     hmap_init(&ct->zone_limits);
     ct->zone_limit_seq = 0;
@@ -466,7 +466,7 @@  conn_clean(struct conntrack *ct, struct conn *conn)
         uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
         cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
     }
-    ovs_list_remove(&conn->exp_node);
+    conn_expire_remove(&conn->exp);
     conn->cleaned = true;
     ovsrcu_postpone(delete_conn, conn);
     atomic_count_dec(&ct->n_conn);
@@ -478,7 +478,7 @@  conn_clean_one(struct conntrack *ct, struct conn *conn)
 {
     conn_clean_cmn(ct, conn);
     if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
-        ovs_list_remove(&conn->exp_node);
+        conn_expire_remove(&conn->exp);
         conn->cleaned = true;
         atomic_count_dec(&ct->n_conn);
     }
@@ -1075,8 +1075,8 @@  conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
      * can limit DoS impact. */
 nat_res_exhaustion:
     free(nat_conn);
-    ovs_list_remove(&nc->exp_node);
-    delete_conn_cmn(nc);
+    conn_expire_remove(&nc->exp);
+    ovsrcu_postpone(delete_conn_cmn, nc);
     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
     VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
                  "if DoS attack, use firewalling and/or zone partitioning.");
@@ -1493,14 +1493,14 @@  set_label(struct dp_packet *pkt, struct conn *conn,
 static long long
 ct_sweep(struct conntrack *ct, long long now, size_t limit)
 {
-    struct conn *conn, *next;
+    struct conn *conn;
     long long min_expiration = LLONG_MAX;
     size_t count = 0;
 
     ovs_mutex_lock(&ct->ct_lock);
 
     for (unsigned i = 0; i < N_CT_TM; i++) {
-        LIST_FOR_EACH_SAFE (conn, next, exp_node, &ct->exp_lists[i]) {
+        RCULIST_FOR_EACH (conn, exp.node, &ct->exp_lists[i]) {
             ovs_mutex_lock(&conn->lock);
             if (now < conn->expiration || count >= limit) {
                 min_expiration = MIN(min_expiration, conn->expiration);