[ovs-dev,v2,2/3] conntrack: Support conntrack flush by ct 5-tuple

Message ID 1511312455-7733-3-git-send-email-yihung.wei@gmail.com
State Superseded
Delegated to: Justin Pettit
Headers show
Series
  • Support conntrack flush by ct 5-tuple
Related show

Commit Message

Yi-Hung Wei Nov. 22, 2017, 1 a.m.
This patch adds support of flushing a conntrack entry specified by the
conntrack 5-tuple in dpif-netdev.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
---
 lib/conntrack.c   | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/conntrack.h   |  3 +++
 lib/dpif-netdev.c |  2 +-
 3 files changed, 68 insertions(+), 1 deletion(-)

Comments

Justin Pettit Dec. 6, 2017, 10:57 p.m. | #1
> On Nov 21, 2017, at 5:00 PM, Yi-Hung Wei <yihung.wei@gmail.com> wrote:
> 
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index f5a3aa9fab34..c9077c07042c 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -2335,6 +2335,18 @@ ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
> }
> 
> static void
> +ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
> +                                 struct ct_addr *b,
> +                                 const uint16_t l3_type)
> +{
> +    if (l3_type == AF_INET) {
> +        b->ipv4_aligned = a->ip;
> +    } else if (l3_type == AF_INET6) {
> +        b->ipv6_aligned = a->in6;
> +    }
> +}

This is the reverse of the function ct_endpoint_to_ct_dpif_inet_addr(), and they both take a 16-bit argument describing the IP version, but their form is different.  I'd suggest adding comments to both to make it clearer, since it could lead to subtle bugs for people using them later:

-=-=-=-=-=-=-=-=-
/* Convert a conntrack address 'a' into an IP address 'b' of type 'dl_type'.
 *
 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
 * in network-byte order. */
static void
ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
                                 union ct_dpif_inet_addr *b,
                                 ovs_be16 dl_type)

....

/* Convert an IP address 'a' of type 'l3_type' into a conntrack address 'b'.
 *
 * Note that 'l3_type' should be either "AF_INET" or "AF_INET6". */
static void
ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
                                 struct ct_addr *b,
                                 const uint16_t l3_type)
-=-=-=-=-=-=-=-=-

> +int
> +conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
> +                      uint16_t zone)
> +{
> +    struct conn_lookup_ctx ctx;
> +    int error = 0;
> +
> +    memset(&ctx, 0, sizeof(ctx));
> +    tuple_to_conn_key(tuple, zone, &ctx.key);
> +    ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
> +    unsigned bucket = hash_to_bucket(ctx.hash);
> +
> +    ct_lock_lock(&ct->buckets[bucket].lock);
> +    conn_key_lookup(&ct->buckets[bucket], &ctx, time_msec());
> +    if (ctx.conn) {
> +        conn_clean(ct, ctx.conn, &ct->buckets[bucket]);
> +    } else {
> +        error = ENOENT;
> +    }
> +    ct_lock_unlock(&ct->buckets[bucket].lock);
> +    return error;
> +}

I believe the kernel clears out expectations.  As we discussed off-line, I think it would make sense to follow similar behavior, since presumably if the control channel is being flushed, the related flows should be, too.

--Justin
Darrell Ball Dec. 9, 2017, 2:23 a.m. | #2
On 12/6/17, 2:57 PM, "ovs-dev-bounces@openvswitch.org on behalf of Justin Pettit" <ovs-dev-bounces@openvswitch.org on behalf of jpettit@gmail.com> wrote:

    
    
    > On Nov 21, 2017, at 5:00 PM, Yi-Hung Wei <yihung.wei@gmail.com> wrote:
    > 
    > diff --git a/lib/conntrack.c b/lib/conntrack.c
    > index f5a3aa9fab34..c9077c07042c 100644
    > --- a/lib/conntrack.c
    > +++ b/lib/conntrack.c
    > @@ -2335,6 +2335,18 @@ ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
    > }
    > 
    > static void
    > +ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
    > +                                 struct ct_addr *b,
    > +                                 const uint16_t l3_type)
    > +{
    > +    if (l3_type == AF_INET) {
    > +        b->ipv4_aligned = a->ip;
    > +    } else if (l3_type == AF_INET6) {
    > +        b->ipv6_aligned = a->in6;
    > +    }
    > +}
    
    This is the reverse of the function ct_endpoint_to_ct_dpif_inet_addr(), and they both take a 16-bit argument describing the IP version, but their form is different.  I'd suggest adding comments to both to make it clearer, since it could lead to subtle bugs for people using them later:
    
    -=-=-=-=-=-=-=-=-
    /* Convert a conntrack address 'a' into an IP address 'b' of type 'dl_type'.
     *
     * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
     * in network-byte order. */
    static void
    ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
                                     union ct_dpif_inet_addr *b,
                                     ovs_be16 dl_type)
    
    ....
    
    /* Convert an IP address 'a' of type 'l3_type' into a conntrack address 'b'.
     *
     * Note that 'l3_type' should be either "AF_INET" or "AF_INET6". */
    static void
    ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
                                     struct ct_addr *b,
                                     const uint16_t l3_type)
    -=-=-=-=-=-=-=-=-
    
    > +int
    > +conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
    > +                      uint16_t zone)
    > +{
    > +    struct conn_lookup_ctx ctx;
    > +    int error = 0;
    > +
    > +    memset(&ctx, 0, sizeof(ctx));
    > +    tuple_to_conn_key(tuple, zone, &ctx.key);
    > +    ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
    > +    unsigned bucket = hash_to_bucket(ctx.hash);
    > +
    > +    ct_lock_lock(&ct->buckets[bucket].lock);
    > +    conn_key_lookup(&ct->buckets[bucket], &ctx, time_msec());
    > +    if (ctx.conn) {
    > +        conn_clean(ct, ctx.conn, &ct->buckets[bucket]);
    > +    } else {
    > +        error = ENOENT;
    > +    }
    > +    ct_lock_unlock(&ct->buckets[bucket].lock);
    > +    return error;
    > +}
    
    I believe the kernel clears out expectations.  As we discussed off-line, I think it would make sense to follow similar behavior, since presumably if the control channel is being flushed, the related flows should be, too.

[Darrell] As discussed, I sent a patch here https://patchwork.ozlabs.org/patch/846490/
               Yi-hung, please try this out and let me know how it goes.





    
    --Justin
    
    
    _______________________________________________
    dev mailing list
    dev@openvswitch.org
    https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwICAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=eaLpf1UY3hR6Une-p61yqtAuAmJleZlABp8utu71Z9o&s=c0Tlj2UKYZjOQ4MQwu19hrzQFZuSSk4HoMNdrIM3jfA&e=
Justin Pettit Dec. 9, 2017, 2:25 a.m. | #3
> On Dec 8, 2017, at 6:23 PM, Darrell Ball <dball@vmware.com> wrote:
> 
> 
>    I believe the kernel clears out expectations.  As we discussed off-line, I think it would make sense to follow similar behavior, since presumably if the control channel is being flushed, the related flows should be, too.
> 
> [Darrell] As discussed, I sent a patch here https://patchwork.ozlabs.org/patch/846490/
>               Yi-hung, please try this out and let me know how it goes.

Thanks, Darrell!

--Justin
Darrell Ball Dec. 11, 2017, 11:44 p.m. | #4
A few suggestions Yi-hung

Thanks Darrell

On 11/21/17, 5:04 PM, "ovs-dev-bounces@openvswitch.org on behalf of Yi-Hung Wei" <ovs-dev-bounces@openvswitch.org on behalf of yihung.wei@gmail.com> wrote:

    This patch adds support of flushing a conntrack entry specified by the
    conntrack 5-tuple in dpif-netdev.
    
    Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>

    ---
     lib/conntrack.c   | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
     lib/conntrack.h   |  3 +++
     lib/dpif-netdev.c |  2 +-
     3 files changed, 68 insertions(+), 1 deletion(-)
    
    diff --git a/lib/conntrack.c b/lib/conntrack.c
    index f5a3aa9fab34..c9077c07042c 100644
    --- a/lib/conntrack.c
    +++ b/lib/conntrack.c
    @@ -2335,6 +2335,18 @@ ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
     }
     
     static void
    +ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
    +                                 struct ct_addr *b,
    +                                 const uint16_t l3_type)


[Darrell]
1/ typo here: no need for ‘const’ qualifier to l3_type or if dl_type were to be used instead.

2/ In this file we typically select on dl_type, not AF_* for V4 vs V6 differentiation.
    The caller already finds the dl_types, so it might be better just to use dl_type here 
    to be consistent throughout the file


    +{
    +    if (l3_type == AF_INET) {
    +        b->ipv4_aligned = a->ip;
    +    } else if (l3_type == AF_INET6) {
    +        b->ipv6_aligned = a->in6;
    +    }
    +}
    +
    +static void
     conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
     {
         if (key->dl_type == htons(ETH_TYPE_IP)) {
    @@ -2359,6 +2371,35 @@ conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
     }
     
     static void
    +tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone,
    +                  struct conn_key *key)
    +{
    +    if (tuple->l3_type == AF_INET) {
    +        key->dl_type = htons(ETH_TYPE_IP);
    +    } else if (tuple->l3_type == AF_INET6) {
    +        key->dl_type = htons(ETH_TYPE_IPV6);
    +    }
    +    key->nw_proto = tuple->ip_proto;
    +    ct_dpif_inet_addr_to_ct_endpoint(&tuple->src, &key->src.addr,
    +                                     tuple->l3_type);
    +    ct_dpif_inet_addr_to_ct_endpoint(&tuple->dst, &key->dst.addr,
    +                                     tuple->l3_type);
    +
    +    if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) {
    +        key->src.icmp_id = tuple->icmp_id;
    +        key->src.icmp_type = tuple->icmp_type;
    +        key->src.icmp_code = tuple->icmp_code;
    +        key->dst.icmp_id = tuple->icmp_id;
    +        key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
    +        key->dst.icmp_code = tuple->icmp_code;
    +    } else {
    +        key->src.port = tuple->src_port;
    +        key->dst.port = tuple->dst_port;
    +    }
    +    key->zone = zone;
    +}
    +
    +static void
     conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
                           long long now, int bkt)
     {
    @@ -2485,6 +2526,29 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
         return 0;
     }
     
    +int
    +conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
    +                      uint16_t zone)
    +{
    +    struct conn_lookup_ctx ctx;
    +    int error = 0;
    +
    +    memset(&ctx, 0, sizeof(ctx));
    +    tuple_to_conn_key(tuple, zone, &ctx.key);
    +    ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
    +    unsigned bucket = hash_to_bucket(ctx.hash);
    +
    +    ct_lock_lock(&ct->buckets[bucket].lock);
    +    conn_key_lookup(&ct->buckets[bucket], &ctx, time_msec());
    +    if (ctx.conn) {
    +        conn_clean(ct, ctx.conn, &ct->buckets[bucket]);
    +    } else {
    +        error = ENOENT;

[Darrell] Not finding a conntrack entry is pretty normal since all entries timeout normally.
              Sending the error upwards is fine. However, I see dpctl raising an error as a result, in the 
              other patches; is this intentional ?  



    +    }
    +    ct_lock_unlock(&ct->buckets[bucket].lock);
    +    return error;
    +}
    +
     /* This function must be called with the ct->resources read lock taken. */
     static struct alg_exp_node *
     expectation_lookup(struct hmap *alg_expectations,
    diff --git a/lib/conntrack.h b/lib/conntrack.h
    index fbeef1c4754e..b5b26dd96e00 100644
    --- a/lib/conntrack.h
    +++ b/lib/conntrack.h
    @@ -107,6 +107,7 @@ struct conntrack_dump {
     };
     
     struct ct_dpif_entry;
    +struct ct_dpif_tuple;
     
     int conntrack_dump_start(struct conntrack *, struct conntrack_dump *,
                              const uint16_t *pzone, int *);
    @@ -114,6 +115,8 @@ int conntrack_dump_next(struct conntrack_dump *, struct ct_dpif_entry *);
     int conntrack_dump_done(struct conntrack_dump *);
     
     int conntrack_flush(struct conntrack *, const uint16_t *zone);
    +int conntrack_flush_tuple(struct conntrack *, const struct ct_dpif_tuple *,
    +                          uint16_t zone);
     ?
     /* 'struct ct_lock' is a wrapper for an adaptive mutex.  It's useful to try
      * different types of locks (e.g. spinlocks) */
    diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
    index b1ef9a6a5992..527442ebfc2a 100644
    --- a/lib/dpif-netdev.c
    +++ b/lib/dpif-netdev.c
    @@ -5740,7 +5740,7 @@ dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
         struct dp_netdev *dp = get_dp_netdev(dpif);
     
         if (tuple) {
    -        return EOPNOTSUPP;
    +        return conntrack_flush_tuple(&dp->conntrack, tuple, zone ? *zone : 0);
         }
         return conntrack_flush(&dp->conntrack, zone);
     }
    -- 
    2.7.4
    
    _______________________________________________
    dev mailing list
    dev@openvswitch.org
    https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwICAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=xEG32YTS7ucH0vCOZtma2TPY5p8n_8GKoQpPLcqMLzw&s=-LSEfbFzQvfGzWh-L84gcYHOMMUb7PrcIpluRoG81tw&e=

Patch

diff --git a/lib/conntrack.c b/lib/conntrack.c
index f5a3aa9fab34..c9077c07042c 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2335,6 +2335,18 @@  ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
 }
 
 static void
+ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
+                                 struct ct_addr *b,
+                                 const uint16_t l3_type)
+{
+    if (l3_type == AF_INET) {
+        b->ipv4_aligned = a->ip;
+    } else if (l3_type == AF_INET6) {
+        b->ipv6_aligned = a->in6;
+    }
+}
+
+static void
 conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
 {
     if (key->dl_type == htons(ETH_TYPE_IP)) {
@@ -2359,6 +2371,35 @@  conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
 }
 
 static void
+tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone,
+                  struct conn_key *key)
+{
+    if (tuple->l3_type == AF_INET) {
+        key->dl_type = htons(ETH_TYPE_IP);
+    } else if (tuple->l3_type == AF_INET6) {
+        key->dl_type = htons(ETH_TYPE_IPV6);
+    }
+    key->nw_proto = tuple->ip_proto;
+    ct_dpif_inet_addr_to_ct_endpoint(&tuple->src, &key->src.addr,
+                                     tuple->l3_type);
+    ct_dpif_inet_addr_to_ct_endpoint(&tuple->dst, &key->dst.addr,
+                                     tuple->l3_type);
+
+    if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) {
+        key->src.icmp_id = tuple->icmp_id;
+        key->src.icmp_type = tuple->icmp_type;
+        key->src.icmp_code = tuple->icmp_code;
+        key->dst.icmp_id = tuple->icmp_id;
+        key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
+        key->dst.icmp_code = tuple->icmp_code;
+    } else {
+        key->src.port = tuple->src_port;
+        key->dst.port = tuple->dst_port;
+    }
+    key->zone = zone;
+}
+
+static void
 conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
                       long long now, int bkt)
 {
@@ -2485,6 +2526,29 @@  conntrack_flush(struct conntrack *ct, const uint16_t *zone)
     return 0;
 }
 
+int
+conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
+                      uint16_t zone)
+{
+    struct conn_lookup_ctx ctx;
+    int error = 0;
+
+    memset(&ctx, 0, sizeof(ctx));
+    tuple_to_conn_key(tuple, zone, &ctx.key);
+    ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
+    unsigned bucket = hash_to_bucket(ctx.hash);
+
+    ct_lock_lock(&ct->buckets[bucket].lock);
+    conn_key_lookup(&ct->buckets[bucket], &ctx, time_msec());
+    if (ctx.conn) {
+        conn_clean(ct, ctx.conn, &ct->buckets[bucket]);
+    } else {
+        error = ENOENT;
+    }
+    ct_lock_unlock(&ct->buckets[bucket].lock);
+    return error;
+}
+
 /* This function must be called with the ct->resources read lock taken. */
 static struct alg_exp_node *
 expectation_lookup(struct hmap *alg_expectations,
diff --git a/lib/conntrack.h b/lib/conntrack.h
index fbeef1c4754e..b5b26dd96e00 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -107,6 +107,7 @@  struct conntrack_dump {
 };
 
 struct ct_dpif_entry;
+struct ct_dpif_tuple;
 
 int conntrack_dump_start(struct conntrack *, struct conntrack_dump *,
                          const uint16_t *pzone, int *);
@@ -114,6 +115,8 @@  int conntrack_dump_next(struct conntrack_dump *, struct ct_dpif_entry *);
 int conntrack_dump_done(struct conntrack_dump *);
 
 int conntrack_flush(struct conntrack *, const uint16_t *zone);
+int conntrack_flush_tuple(struct conntrack *, const struct ct_dpif_tuple *,
+                          uint16_t zone);
 
 /* 'struct ct_lock' is a wrapper for an adaptive mutex.  It's useful to try
  * different types of locks (e.g. spinlocks) */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b1ef9a6a5992..527442ebfc2a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5740,7 +5740,7 @@  dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
     struct dp_netdev *dp = get_dp_netdev(dpif);
 
     if (tuple) {
-        return EOPNOTSUPP;
+        return conntrack_flush_tuple(&dp->conntrack, tuple, zone ? *zone : 0);
     }
     return conntrack_flush(&dp->conntrack, zone);
 }