[ovs-dev,1/2] ct-dpif, dpif-netlink: Support conntrack flush by ct 5-tuple

Message ID 1510603980-62407-1-git-send-email-yihung.wei@gmail.com
State New
Headers show
Series
  • [ovs-dev,1/2] ct-dpif, dpif-netlink: Support conntrack flush by ct 5-tuple
Related show

Commit Message

Yi-Hung Wei Nov. 13, 2017, 8:12 p.m.
This patch adds support of flushing a conntrack entry specified by the
conntrack 5-tuple, and provides the implementation in dpif-netlink.
The implementation of dpif-netlink in the Linux datapath utilizes the
NFNL_SUBSYS_CTNETLINK netlink subsystem to delete a conntrack entry in
nf_conntrack.

VMWare-BZ: #1983178
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
---
 lib/ct-dpif.c           |  23 ++++++++---
 lib/ct-dpif.h           |   3 +-
 lib/dpctl.c             |   2 +-
 lib/dpif-netdev.c       |   6 ++-
 lib/dpif-netlink.c      |   7 +++-
 lib/dpif-provider.h     |  13 ++++--
 lib/netlink-conntrack.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/netlink-conntrack.h |   1 +
 ofproto/ofproto-dpif.c  |   2 +-
 tests/ovs-ofctl.at      |   2 +-
 10 files changed, 148 insertions(+), 16 deletions(-)

Comments

Justin Pettit Nov. 16, 2017, 12:28 a.m. | #1
> On Nov 13, 2017, at 12:12 PM, Yi-Hung Wei <yihung.wei@gmail.com> wrote:
> 
> This patch adds support of flushing a conntrack entry specified by the
> conntrack 5-tuple, and provides the implementation in dpif-netlink.
> The implementation of dpif-netlink in the Linux datapath utilizes the
> NFNL_SUBSYS_CTNETLINK netlink subsystem to delete a conntrack entry in
> nf_conntrack.
> 
> VMWare-BZ: #1983178
> Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
> ---
> lib/ct-dpif.c           |  23 ++++++++---
> lib/ct-dpif.h           |   3 +-
> lib/dpctl.c             |   2 +-
> lib/dpif-netdev.c       |   6 ++-
> lib/dpif-netlink.c      |   7 +++-
> lib/dpif-provider.h     |  13 ++++--
> lib/netlink-conntrack.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++
> lib/netlink-conntrack.h |   1 +
> ofproto/ofproto-dpif.c  |   2 +-
> tests/ovs-ofctl.at      |   2 +-
> 10 files changed, 148 insertions(+), 16 deletions(-)
> 
> diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
> index c79e69e23970..a9f58f4eb449 100644
> --- a/lib/ct-dpif.c
> +++ b/lib/ct-dpif.c
> @@ -111,19 +111,30 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump)
> }
> 
> /* Flush the entries in the connection tracker used by 'dpif'.
> - *
> - * If 'zone' is not NULL, flush only the entries in '*zone'. */
> + * If both 'zone' and 'tuple' are NULL, flush all the conntrack entries.
> + * If 'zone' is not NULL, and 'tuple's is NULL, flush all the conntrack

I'd drop the "s" in "'tuple's".

> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 599308d257a8..0e1696a94e31 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5741,10 +5741,14 @@ dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
> }
> 
> static int
> -dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone)
> +dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
> +                     const struct ct_dpif_tuple *tuple)
> {
>     struct dp_netdev *dp = get_dp_netdev(dpif);
> 
> +    if (tuple) {
> +        return EOPNOTSUPP;
> +    }
>     return conntrack_flush(&dp->conntrack, zone);
> }

Do you plan on implementing this?

> diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
> index 1d82a0939aa1..083d67c8d5b0 100644
> --- a/lib/dpif-provider.h
> +++ b/lib/dpif-provider.h
> ...
> -    /* Flushes the connection tracking tables. If 'zone' is not NULL,
> -     * only deletes connections in '*zone'. */
> -    int (*ct_flush)(struct dpif *, const uint16_t *zone);
> +    /* Flushes the connection tracking tables.
> +     * If both 'zone' and 'tuple' are NULL, flush all the conntrack entries.
> +     * If 'zone' is not NULL, and 'tuple's is NULL, flush all the conntrack
> +     * entries in '*zone'.

Same comment as above about "'tuple's'".

> diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c
> index 1e1bb2f79d1d..9094779aaa55 100644
> --- a/lib/netlink-conntrack.c
> +++ b/lib/netlink-conntrack.c
> ....
> +int
> +nl_ct_delete(const struct ct_dpif_tuple *tuple, uint16_t zone)
> +{
> +    int err;
> +    struct ofpbuf buf;
> +
> +    ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
> +    nl_msg_put_nfgenmsg(&buf, 0, tuple->l3_type, NFNL_SUBSYS_CTNETLINK,
> +                        IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
> +
> +    nl_msg_put_be16(&buf, CTA_ZONE, htons(zone));
> +    if (!nl_ct_put_ct_tuple(&buf, tuple, CTA_TUPLE_ORIG)) {
> +        err = EOPNOTSUPP;
> +        goto out;
> +    }
> +    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> +out:
> +    ofpbuf_uninit(&buf);
> +    return err;
> +}

The Linux code for nl_ct_flush_zone() seems to go through some contortions due to there being a problem with flushing a specific zone.  I took a brief look at the kernel code, and it looks like it does properly handle zones if the tuple is provided, but not otherwise.  Is that a correct reading?

On a more cosmetic note, most of the external interface refers to deleting a specific conntrack entry as a "flush", but the internal code refers to it as a "delete". I think this is a reasonable distinction, but if it's not carried externally, then it doesn't seem necessary internally.  Also, this delete and flush code is nearly identical, so it seems like there's some opportunity to just call it "flush" and consolidate to a single implementation that handles a tuple if it's provided.

> +static bool
> +nl_ct_put_tuple(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple)
> +{
> +    if (!nl_ct_put_tuple_ip(buf, tuple)) {
> +        return false;
> +    }
> +    if (!nl_ct_put_tuple_proto(buf, tuple)) {
> +        return false;
> +    }
> +    return true;
> +}
> +
> +static bool
> +nl_ct_put_ct_tuple(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple,
> +                   enum ctattr_type type)
> +{
> +    if (type != CTA_TUPLE_ORIG && type != CTA_TUPLE_REPLY &&
> +        type != CTA_TUPLE_MASTER) {
> +        return false;
> +    }
> +
> +    size_t offset = nl_msg_start_nested(buf, type);
> +    if (!nl_ct_put_tuple(buf, tuple)) {
> +        return false;
> +    }
> +
> +    nl_msg_end_nested(buf, offset);
> +    return true;
> +}

The names for nl_ct_put_ct_tuple() and nl_ct_put_tuple() are very similar.  nl_ct_put_tuple() doesn't do much, what about just folding that into nl_ct_put_ct_tuple()?

Thanks,

--Justin

Patch

diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
index c79e69e23970..a9f58f4eb449 100644
--- a/lib/ct-dpif.c
+++ b/lib/ct-dpif.c
@@ -111,19 +111,30 @@  ct_dpif_dump_done(struct ct_dpif_dump_state *dump)
 }
 
 /* Flush the entries in the connection tracker used by 'dpif'.
- *
- * If 'zone' is not NULL, flush only the entries in '*zone'. */
+ * If both 'zone' and 'tuple' are NULL, flush all the conntrack entries.
+ * If 'zone' is not NULL, and 'tuple's is NULL, flush all the conntrack
+ * entries in '*zone'.
+ * If 'tuple' is not NULL, flush the conntrack entry specified by 'tuple'
+ * in '*zone'. In this case, we use default zone (zone 0) if 'zone' is
+ * NULL. */
 int
-ct_dpif_flush(struct dpif *dpif, const uint16_t *zone)
+ct_dpif_flush(struct dpif *dpif, const uint16_t *zone,
+              const struct ct_dpif_tuple *tuple)
 {
-    if (zone) {
-        VLOG_DBG("%s: ct_flush: %"PRIu16, dpif_name(dpif), *zone);
+    if (tuple) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+        ct_dpif_format_tuple(&ds, tuple);
+        VLOG_DBG("%s: ct_flush: %s in zone %d", dpif_name(dpif), ds_cstr(&ds),
+                                                zone ? *zone : 0);
+        ds_destroy(&ds);
+    } else if (zone) {
+        VLOG_DBG("%s: ct_flush: zone %"PRIu16, dpif_name(dpif), *zone);
     } else {
         VLOG_DBG("%s: ct_flush: <all>", dpif_name(dpif));
     }
 
     return (dpif->dpif_class->ct_flush
-            ? dpif->dpif_class->ct_flush(dpif, zone)
+            ? dpif->dpif_class->ct_flush(dpif, zone, tuple)
             : EOPNOTSUPP);
 }
 
diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h
index d5f966175bc0..ef019050c78e 100644
--- a/lib/ct-dpif.h
+++ b/lib/ct-dpif.h
@@ -195,7 +195,8 @@  int ct_dpif_dump_start(struct dpif *, struct ct_dpif_dump_state **,
                        const uint16_t *zone, int *);
 int ct_dpif_dump_next(struct ct_dpif_dump_state *, struct ct_dpif_entry *);
 int ct_dpif_dump_done(struct ct_dpif_dump_state *);
-int ct_dpif_flush(struct dpif *, const uint16_t *zone);
+int ct_dpif_flush(struct dpif *, const uint16_t *zone,
+                  const struct ct_dpif_tuple *);
 void ct_dpif_entry_uninit(struct ct_dpif_entry *);
 void ct_dpif_format_entry(const struct ct_dpif_entry *, struct ds *,
                           bool verbose, bool print_stats);
diff --git a/lib/dpctl.c b/lib/dpctl.c
index 8055cdb56202..7569189d8f22 100644
--- a/lib/dpctl.c
+++ b/lib/dpctl.c
@@ -1353,7 +1353,7 @@  dpctl_flush_conntrack(int argc, const char *argv[],
         return error;
     }
 
-    error = ct_dpif_flush(dpif, pzone);
+    error = ct_dpif_flush(dpif, pzone, NULL);
 
     dpif_close(dpif);
     return error;
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 599308d257a8..0e1696a94e31 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5741,10 +5741,14 @@  dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
 }
 
 static int
-dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone)
+dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
+                     const struct ct_dpif_tuple *tuple)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
 
+    if (tuple) {
+        return EOPNOTSUPP;
+    }
     return conntrack_flush(&dp->conntrack, zone);
 }
 
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index c265909f8587..57e4e26f42b7 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -2892,9 +2892,12 @@  dpif_netlink_ct_dump_done(struct dpif *dpif OVS_UNUSED,
 }
 
 static int
-dpif_netlink_ct_flush(struct dpif *dpif OVS_UNUSED, const uint16_t *zone)
+dpif_netlink_ct_flush(struct dpif *dpif OVS_UNUSED, const uint16_t *zone,
+                      const struct ct_dpif_tuple *tuple)
 {
-    if (zone) {
+    if (tuple) {
+        return nl_ct_delete(tuple, zone ? *zone : 0);
+    } else if (zone) {
         return nl_ct_flush_zone(*zone);
     } else {
         return nl_ct_flush();
diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
index 1d82a0939aa1..083d67c8d5b0 100644
--- a/lib/dpif-provider.h
+++ b/lib/dpif-provider.h
@@ -75,6 +75,7 @@  dpif_flow_dump_thread_init(struct dpif_flow_dump_thread *thread,
 
 struct ct_dpif_dump_state;
 struct ct_dpif_entry;
+struct ct_dpif_tuple;
 
 /* Datapath interface class structure, to be defined by each implementation of
  * a datapath interface.
@@ -424,9 +425,15 @@  struct dpif_class {
                         struct ct_dpif_entry *entry);
     int (*ct_dump_done)(struct dpif *, struct ct_dpif_dump_state *state);
 
-    /* Flushes the connection tracking tables. If 'zone' is not NULL,
-     * only deletes connections in '*zone'. */
-    int (*ct_flush)(struct dpif *, const uint16_t *zone);
+    /* Flushes the connection tracking tables.
+     * If both 'zone' and 'tuple' are NULL, flush all the conntrack entries.
+     * If 'zone' is not NULL, and 'tuple's is NULL, flush all the conntrack
+     * entries in '*zone'.
+     * If 'tuple' is not NULL, flush the conntrack entry specified by 'tuple'
+     * in '*zone'. In this case, we use default zone (zone 0) if 'zone' is
+     * NULL. */
+    int (*ct_flush)(struct dpif *, const uint16_t *zone,
+                    const struct ct_dpif_tuple *tuple);
 
     /* Meters */
 
diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c
index 1e1bb2f79d1d..9094779aaa55 100644
--- a/lib/netlink-conntrack.c
+++ b/lib/netlink-conntrack.c
@@ -18,6 +18,7 @@ 
 
 #include "netlink-conntrack.h"
 
+#include <errno.h>
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
 #include <linux/netfilter/nf_conntrack_common.h>
@@ -111,6 +112,8 @@  static bool nl_ct_parse_header_policy(struct ofpbuf *buf,
 static bool nl_ct_attrs_to_ct_dpif_entry(struct ct_dpif_entry *entry,
         struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)],
         uint8_t nfgen_family);
+static bool nl_ct_put_ct_tuple(struct ofpbuf *buf,
+        const struct ct_dpif_tuple *tuple, enum ctattr_type type);
 
 struct nl_ct_dump_state {
     struct nl_dump dump;
@@ -325,6 +328,27 @@  nl_ct_flush_zone(uint16_t flush_zone)
 }
 #endif
 
+int
+nl_ct_delete(const struct ct_dpif_tuple *tuple, uint16_t zone)
+{
+    int err;
+    struct ofpbuf buf;
+
+    ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
+    nl_msg_put_nfgenmsg(&buf, 0, tuple->l3_type, NFNL_SUBSYS_CTNETLINK,
+                        IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
+
+    nl_msg_put_be16(&buf, CTA_ZONE, htons(zone));
+    if (!nl_ct_put_ct_tuple(&buf, tuple, CTA_TUPLE_ORIG)) {
+        err = EOPNOTSUPP;
+        goto out;
+    }
+    err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
+out:
+    ofpbuf_uninit(&buf);
+    return err;
+}
+
 /* Conntrack netlink parsing. */
 
 static bool
@@ -517,6 +541,87 @@  nl_ct_parse_tuple(struct nlattr *nla, struct ct_dpif_tuple *tuple,
     return parsed;
 }
 
+static bool
+nl_ct_put_tuple_ip(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple)
+{
+    size_t offset = nl_msg_start_nested(buf, CTA_TUPLE_IP);
+
+    if (tuple->l3_type == AF_INET) {
+        nl_msg_put_be32(buf, CTA_IP_V4_SRC, tuple->src.ip);
+        nl_msg_put_be32(buf, CTA_IP_V4_DST, tuple->dst.ip);
+    } else if (tuple->l3_type == AF_INET6) {
+        nl_msg_put_in6_addr(buf, CTA_IP_V6_SRC, &tuple->src.in6);
+        nl_msg_put_in6_addr(buf, CTA_IP_V6_DST, &tuple->dst.in6);
+    } else {
+        VLOG_WARN_RL(&rl, "Unsupported IP protocol: %"PRIu16".",
+                     tuple->l3_type);
+        return false;
+    }
+
+    nl_msg_end_nested(buf, offset);
+    return true;
+}
+
+static bool
+nl_ct_put_tuple_proto(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple)
+{
+    size_t offset = nl_msg_start_nested(buf, CTA_TUPLE_PROTO);
+
+    nl_msg_put_u8(buf, CTA_PROTO_NUM, tuple->ip_proto);
+
+    if (tuple->l3_type == AF_INET && tuple->ip_proto == IPPROTO_ICMP) {
+        nl_msg_put_be16(buf, CTA_PROTO_ICMP_ID, tuple->icmp_id);
+        nl_msg_put_u8(buf, CTA_PROTO_ICMP_TYPE, tuple->icmp_type);
+        nl_msg_put_u8(buf, CTA_PROTO_ICMP_CODE, tuple->icmp_code);
+    } else if (tuple->l3_type == AF_INET6 &&
+               tuple->ip_proto == IPPROTO_ICMPV6) {
+        nl_msg_put_be16(buf, CTA_PROTO_ICMPV6_ID, tuple->icmp_id);
+        nl_msg_put_u8(buf, CTA_PROTO_ICMPV6_TYPE, tuple->icmp_type);
+        nl_msg_put_u8(buf, CTA_PROTO_ICMPV6_CODE, tuple->icmp_code);
+    } else if (tuple->ip_proto == IPPROTO_TCP ||
+               tuple->ip_proto == IPPROTO_UDP) {
+        nl_msg_put_be16(buf, CTA_PROTO_SRC_PORT, tuple->src_port);
+        nl_msg_put_be16(buf, CTA_PROTO_DST_PORT, tuple->dst_port);
+    } else {
+        VLOG_WARN_RL(&rl, "Unsupported L4 protocol: %"PRIu8".",
+                     tuple->ip_proto);
+        return false;
+    }
+
+    nl_msg_end_nested(buf, offset);
+    return true;
+}
+
+static bool
+nl_ct_put_tuple(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple)
+{
+    if (!nl_ct_put_tuple_ip(buf, tuple)) {
+        return false;
+    }
+    if (!nl_ct_put_tuple_proto(buf, tuple)) {
+        return false;
+    }
+    return true;
+}
+
+static bool
+nl_ct_put_ct_tuple(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple,
+                   enum ctattr_type type)
+{
+    if (type != CTA_TUPLE_ORIG && type != CTA_TUPLE_REPLY &&
+        type != CTA_TUPLE_MASTER) {
+        return false;
+    }
+
+    size_t offset = nl_msg_start_nested(buf, type);
+    if (!nl_ct_put_tuple(buf, tuple)) {
+        return false;
+    }
+
+    nl_msg_end_nested(buf, offset);
+    return true;
+}
+
 /* Translate netlink TCP state to CT_DPIF_TCP state. */
 static uint8_t
 nl_ct_tcp_state_to_dpif(uint8_t state)
diff --git a/lib/netlink-conntrack.h b/lib/netlink-conntrack.h
index a95aa69a4cde..ff2c4d725e49 100644
--- a/lib/netlink-conntrack.h
+++ b/lib/netlink-conntrack.h
@@ -42,6 +42,7 @@  int nl_ct_dump_done(struct nl_ct_dump_state *);
 
 int nl_ct_flush(void);
 int nl_ct_flush_zone(uint16_t zone);
+int nl_ct_delete(const struct ct_dpif_tuple *, uint16_t zone);
 
 bool nl_ct_parse_entry(struct ofpbuf *, struct ct_dpif_entry *,
                        enum nl_ct_event_type *);
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 0e86d041ef3e..06d6c62aec79 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -4842,7 +4842,7 @@  ct_flush(const struct ofproto *ofproto_, const uint16_t *zone)
 {
     struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
 
-    ct_dpif_flush(ofproto->backer->dpif, zone);
+    ct_dpif_flush(ofproto->backer->dpif, zone, NULL);
 }
 
 static bool
diff --git a/tests/ovs-ofctl.at b/tests/ovs-ofctl.at
index 28bca8b9b781..f8d43adccd33 100644
--- a/tests/ovs-ofctl.at
+++ b/tests/ovs-ofctl.at
@@ -3167,7 +3167,7 @@  AT_CHECK([ovs-appctl vlog/set ct_dpif:dbg])
 AT_CHECK([ovs-ofctl ct-flush-zone br0 123])
 
 OVS_WAIT_UNTIL([grep -q "|ct_dpif|DBG|.*ct_flush:" ovs-vswitchd.log])
-AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: 123" ovs-vswitchd.log])
+AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone 123" ovs-vswitchd.log])
 
 OVS_VSWITCHD_STOP
 AT_CLEANUP