diff mbox

[ovs-dev,7/7] ovn-northd: Logical flows for load balancers.

Message ID 1467188231-10194-8-git-send-email-guru@ovn.org
State Accepted
Headers show

Commit Message

Gurucharan Shetty June 29, 2016, 8:17 a.m. UTC
This commit adds a 'pre_lb' table that sits before 'pre_stateful' table.
For packets that need to be load balanced, this table sets reg0[0]
to act as a hint for the pre-stateful table to send the packet to
the conntrack table for defragmentation.

This commit also adds a 'lb' table that sits before 'stateful' table.
For packets from established connections, this table sets reg0[2] to
indicate to the 'stateful' table that the packet needs to be sent to
connection tracking table to just do NAT.

In stateful table, packet for a new connection that needs to be load balanced
is given a ct_lb("$IP_LIST") action.

Signed-off-by: Gurucharan Shetty <guru@ovn.org>
---
 ovn/northd/ovn-northd.c | 201 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 190 insertions(+), 11 deletions(-)

Comments

Zong Kai LI July 1, 2016, 10:13 a.m. UTC | #1
Hi, Guru.


> +        /* 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table
> send
> +         * packet to conntrack for defragmentation. */

+        const char *ip_address;
> +        SSET_FOR_EACH(ip_address, &all_ips) {
> +            char *match = xasprintf("ip && ip4.dst == %s", ip_address);
> +            ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
> +                          100, match, REGBIT_CONNTRACK_DEFRAG" = 1;
> next;");
> +            free(match);
> +        }
>

For ls_in_pre_lb table, you add a same action from ls_in_pre_acl table.
Indeed, there will be lflows, such as:
table=3(   ls_in_pre_acl), priority=  100, match=(ip), action=(reg0[0] = 1;
next;) [1]
table=4(    ls_in_pre_lb), priority=  100, match=(ip && ip4.dst ==
30.0.0.1), action=(reg0[0] = 1; next;) [2]

I know it will be flexible to have different tables for different purpose,
but they are too close enough.
And for me, the only different is "&& ip4.dst == ENDPOINT_IP".

What's more, it's easily to have a allow-related ACL rules for traffic
whose initiater is VM/container. By that, in the most common case, we will
have rule [1] in table ls_in_pre_acl. If so, rule [2] in table ls_in_pre_lb
seems duplicated, and rules in table ls_out_pre_lb are duplicated to ones
in table ls_out_pre_acl.

So the only case to make pre_lb tables are necessary is, logical switch
doesn't contain an "allow-related" action ACL rule. It seems possible, but
I cannot figure out why people choose to not using "allow-related" action,
that will make ACL table hard to maintain.

+
> +        sset_destroy(&all_ips);
> +
> +        if (vip_configured) {
> +            ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB,
> +                          100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1;
> next;");
> +        }
> +    }
> +}





> +    if (od->nbs->load_balancer) {
> +        struct nbrec_load_balancer *lb = od->nbs->load_balancer;
> +        struct smap *vips = &lb->vips;
> +        struct smap_node *node;
> +
> +        SMAP_FOR_EACH (node, vips) {
> +            uint16_t port = 0;
> +
> +            /* node->key contains IP:port or just IP. */
> +            char *ip_address = NULL;
> +            ip_address_and_port_from_lb_key(node->key, &ip_address,
> &port);
> +            if (!ip_address) {
> +                continue;
> +            }
> +
> +            /* New connections in Ingress table. */
> +            char *action = xasprintf("ct_lb(\"%s\");", node->value);
> +            struct ds match = DS_EMPTY_INITIALIZER;
> +            ds_put_format(&match, "ct.new && ip && ip4.dst == %s",
> ip_address);
> +            if (port) {
> +                if (lb->protocol && !strcmp(lb->protocol, "udp")) {
> +                    ds_put_format(&match, "&& udp && udp.dst == %d",
> port);
> +                } else {
> +                    ds_put_format(&match, "&& tcp && tcp.dst == %d",
> port);
> +                }
> +                ovn_lflow_add(lflows, od, S_SWITCH_IN_LB,
> +                              120, ds_cstr(&match), action);
> +            } else {
> +                ovn_lflow_add(lflows, od, S_SWITCH_IN_LB,
> +                              110, ds_cstr(&match), action);
> +            }
>

S_SWITCH_IN_LB, I think you missed to put them into method build_lb.


> +
> +            ds_destroy(&match);
> +            free(action);
> +       }
> +    }
>  }
>
>
Thanks.
Zong Kai, LI
Gurucharan Shetty July 1, 2016, 2:30 p.m. UTC | #2
>
>
> So the only case to make pre_lb tables are necessary is, logical switch
> doesn't contain an "allow-related" action ACL rule. It seems possible, but
> I cannot figure out why people choose to not using "allow-related" action,
> that will make ACL table hard to maintain.
>

That is a bad assumption. Firewall rules are not compulsory.


>
> +
> > +        sset_destroy(&all_ips);
> > +
> > +        if (vip_configured) {
> > +            ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB,
> > +                          100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1;
> > next;");
> > +        }
> > +    }
> > +}
>
>
>
>
>
> > +    if (od->nbs->load_balancer) {
> > +        struct nbrec_load_balancer *lb = od->nbs->load_balancer;
> > +        struct smap *vips = &lb->vips;
> > +        struct smap_node *node;
> > +
> > +        SMAP_FOR_EACH (node, vips) {
> > +            uint16_t port = 0;
> > +
> > +            /* node->key contains IP:port or just IP. */
> > +            char *ip_address = NULL;
> > +            ip_address_and_port_from_lb_key(node->key, &ip_address,
> > &port);
> > +            if (!ip_address) {
> > +                continue;
> > +            }
> > +
> > +            /* New connections in Ingress table. */
> > +            char *action = xasprintf("ct_lb(\"%s\");", node->value);
> > +            struct ds match = DS_EMPTY_INITIALIZER;
> > +            ds_put_format(&match, "ct.new && ip && ip4.dst == %s",
> > ip_address);
> > +            if (port) {
> > +                if (lb->protocol && !strcmp(lb->protocol, "udp")) {
> > +                    ds_put_format(&match, "&& udp && udp.dst == %d",
> > port);
> > +                } else {
> > +                    ds_put_format(&match, "&& tcp && tcp.dst == %d",
> > port);
> > +                }
> > +                ovn_lflow_add(lflows, od, S_SWITCH_IN_LB,
> > +                              120, ds_cstr(&match), action);
> > +            } else {
> > +                ovn_lflow_add(lflows, od, S_SWITCH_IN_LB,
> > +                              110, ds_cstr(&match), action);
> > +            }
> >
>
> S_SWITCH_IN_LB, I think you missed to put them into method build_lb.
>
Thank you for noticing. I should either move this part of the code to
build_lb() function or change it to S_SWITCH_IN_STATEFUL. I will do this as
part of v2.

>
>
> > +
> > +            ds_destroy(&match);
> > +            free(action);
> > +       }
> > +    }
> >  }
> >
> >
> Thanks.
> Zong Kai, LI
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
>
Ben Pfaff July 3, 2016, 5:24 p.m. UTC | #3
On Wed, Jun 29, 2016 at 01:17:11AM -0700, Gurucharan Shetty wrote:
> This commit adds a 'pre_lb' table that sits before 'pre_stateful' table.
> For packets that need to be load balanced, this table sets reg0[0]
> to act as a hint for the pre-stateful table to send the packet to
> the conntrack table for defragmentation.
> 
> This commit also adds a 'lb' table that sits before 'stateful' table.
> For packets from established connections, this table sets reg0[2] to
> indicate to the 'stateful' table that the packet needs to be sent to
> connection tracking table to just do NAT.
> 
> In stateful table, packet for a new connection that needs to be load balanced
> is given a ct_lb("$IP_LIST") action.
> 
> Signed-off-by: Gurucharan Shetty <guru@ovn.org>

This will require a change to the generated flow syntax if you accept my
suggestion for patch 6.

Is there a way to test this?

Acked-by: Ben Pfaff <blp@ovn.org>
diff mbox

Patch

diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
index ba80fbe..c3c737a 100644
--- a/ovn/northd/ovn-northd.c
+++ b/ovn/northd/ovn-northd.c
@@ -33,6 +33,7 @@ 
 #include "packets.h"
 #include "poll-loop.h"
 #include "smap.h"
+#include "sset.h"
 #include "stream.h"
 #include "stream-ssl.h"
 #include "unixctl.h"
@@ -92,19 +93,23 @@  enum ovn_stage {
     PIPELINE_STAGE(SWITCH, IN,  PORT_SEC_IP,    1, "ls_in_port_sec_ip")     \
     PIPELINE_STAGE(SWITCH, IN,  PORT_SEC_ND,    2, "ls_in_port_sec_nd")     \
     PIPELINE_STAGE(SWITCH, IN,  PRE_ACL,        3, "ls_in_pre_acl")      \
-    PIPELINE_STAGE(SWITCH, IN,  PRE_STATEFUL,   4, "ls_in_pre_stateful")    \
-    PIPELINE_STAGE(SWITCH, IN,  ACL,            5, "ls_in_acl")          \
-    PIPELINE_STAGE(SWITCH, IN,  STATEFUL,       6, "ls_in_stateful")     \
-    PIPELINE_STAGE(SWITCH, IN,  ARP_RSP,        7, "ls_in_arp_rsp")      \
-    PIPELINE_STAGE(SWITCH, IN,  L2_LKUP,        8, "ls_in_l2_lkup")      \
+    PIPELINE_STAGE(SWITCH, IN,  PRE_LB,         4, "ls_in_pre_lb")         \
+    PIPELINE_STAGE(SWITCH, IN,  PRE_STATEFUL,   5, "ls_in_pre_stateful")    \
+    PIPELINE_STAGE(SWITCH, IN,  ACL,            6, "ls_in_acl")          \
+    PIPELINE_STAGE(SWITCH, IN,  LB,             7, "ls_in_lb")           \
+    PIPELINE_STAGE(SWITCH, IN,  STATEFUL,       8, "ls_in_stateful")     \
+    PIPELINE_STAGE(SWITCH, IN,  ARP_RSP,        9, "ls_in_arp_rsp")      \
+    PIPELINE_STAGE(SWITCH, IN,  L2_LKUP,       10, "ls_in_l2_lkup")      \
                                                                       \
     /* Logical switch egress stages. */                               \
-    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      0, "ls_out_pre_acl")     \
-    PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 1, "ls_out_pre_stateful")  \
-    PIPELINE_STAGE(SWITCH, OUT, ACL,          2, "ls_out_acl")            \
-    PIPELINE_STAGE(SWITCH, OUT, STATEFUL,     3, "ls_out_stateful")       \
-    PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_IP,  4, "ls_out_port_sec_ip")    \
-    PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_L2,  5, "ls_out_port_sec_l2")    \
+    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       0, "ls_out_pre_lb")     \
+    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      1, "ls_out_pre_acl")     \
+    PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful")  \
+    PIPELINE_STAGE(SWITCH, OUT, LB,           3, "ls_out_lb")            \
+    PIPELINE_STAGE(SWITCH, OUT, ACL,          4, "ls_out_acl")            \
+    PIPELINE_STAGE(SWITCH, OUT, STATEFUL,     5, "ls_out_stateful")       \
+    PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_IP,  6, "ls_out_port_sec_ip")    \
+    PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_L2,  7, "ls_out_port_sec_l2")    \
                                                                       \
     /* Logical router ingress stages. */                              \
     PIPELINE_STAGE(ROUTER, IN,  ADMISSION,   0, "lr_in_admission")    \
@@ -134,6 +139,7 @@  enum ovn_stage {
 
 #define REGBIT_CONNTRACK_DEFRAG "reg0[0]"
 #define REGBIT_CONNTRACK_COMMIT "reg0[1]"
+#define REGBIT_CONNTRACK_NAT    "reg0[2]"
 
 /* Returns an "enum ovn_stage" built from the arguments. */
 static enum ovn_stage
@@ -1395,6 +1401,107 @@  build_pre_acls(struct ovn_datapath *od, struct hmap *lflows,
     }
 }
 
+/* For a 'key' of the form "IP:port" or just "IP", sets 'port' and
+ * 'ip_address'.  The caller must free() the memory allocated for
+ * 'ip_address'. */
+static void
+ip_address_and_port_from_lb_key(const char *key, char **ip_address,
+                                uint16_t *port)
+{
+    char *ip_str, *start, *next;
+    *ip_address = NULL;
+    *port = 0;
+
+    next = start = xstrdup(key);
+    ip_str = strsep(&next, ":");
+    if (!ip_str || !ip_str[0]) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+        VLOG_WARN_RL(&rl, "bad ip address for load balancer key %s", key);
+        free(start);
+        return;
+    }
+
+    ovs_be32 ip, mask;
+    char *error = ip_parse_masked(ip_str, &ip, &mask);
+    if (error || mask != OVS_BE32_MAX) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+        VLOG_WARN_RL(&rl, "bad ip address for load balancer key %s", key);
+        free(start);
+        free(error);
+        return;
+    }
+
+    int l4_port = 0;
+    if (next && next[0]) {
+        if (!str_to_int(next, 0, &l4_port) || l4_port < 0 || l4_port > 65535) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+            VLOG_WARN_RL(&rl, "bad ip port for load balancer key %s", key);
+            free(start);
+            return;
+        }
+    }
+
+    *port = l4_port;
+    *ip_address = strdup(ip_str);
+    free(start);
+}
+
+static void
+build_pre_lb(struct ovn_datapath *od, struct hmap *lflows)
+{
+    /* Allow all packets to go to next tables by default. */
+    ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 0, "1", "next;");
+    ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 0, "1", "next;");
+
+    struct sset all_ips = SSET_INITIALIZER(&all_ips);
+    if (od->nbs->load_balancer) {
+        struct nbrec_load_balancer *lb = od->nbs->load_balancer;
+        struct smap *vips = &lb->vips;
+        struct smap_node *node;
+        bool vip_configured = false;
+
+        SMAP_FOR_EACH (node, vips) {
+            vip_configured = true;
+
+            /* node->key contains IP:port or just IP. */
+            char *ip_address = NULL;
+            uint16_t port;
+            ip_address_and_port_from_lb_key(node->key, &ip_address, &port);
+            if (!ip_address) {
+                continue;
+            }
+
+            if (!sset_contains(&all_ips, ip_address)) {
+                sset_add(&all_ips, ip_address);
+            }
+
+            free(ip_address);
+
+            /* Ignore L4 port information in the key because fragmented packets
+             * may not have L4 information.  The pre-stateful table will send
+             * the packet through ct() action to de-fragment. In stateful
+             * table, we will eventually look at L4 information. */
+        }
+
+        /* 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send
+         * packet to conntrack for defragmentation. */
+        const char *ip_address;
+        SSET_FOR_EACH(ip_address, &all_ips) {
+            char *match = xasprintf("ip && ip4.dst == %s", ip_address);
+            ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
+                          100, match, REGBIT_CONNTRACK_DEFRAG" = 1; next;");
+            free(match);
+        }
+
+        sset_destroy(&all_ips);
+
+        if (vip_configured) {
+            ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB,
+                          100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;");
+        }
+    }
+}
+
 static void
 build_pre_stateful(struct ovn_datapath *od, struct hmap *lflows)
 {
@@ -1521,6 +1628,27 @@  build_acls(struct ovn_datapath *od, struct hmap *lflows)
 }
 
 static void
+build_lb(struct ovn_datapath *od, struct hmap *lflows)
+{
+    /* Ingress and Egress LB Table (Priority 0): Packets are allowed by
+     * default.  */
+    ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, 0, "1", "next;");
+    ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, 0, "1", "next;");
+
+    if (od->nbs->load_balancer) {
+        /* Ingress and Egress LB Table (Priority 65535).
+         *
+         * Send established traffic through conntrack for just NAT. */
+        ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, UINT16_MAX,
+                      "ct.est && !ct.rel && !ct.new && !ct.inv",
+                      REGBIT_CONNTRACK_NAT" = 1; next;");
+        ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, UINT16_MAX,
+                      "ct.est && !ct.rel && !ct.new && !ct.inv",
+                      REGBIT_CONNTRACK_NAT" = 1; next;");
+    }
+}
+
+static void
 build_stateful(struct ovn_datapath *od, struct hmap *lflows)
 {
     /* Ingress and Egress stateful Table (Priority 0): Packets are
@@ -1534,6 +1662,55 @@  build_stateful(struct ovn_datapath *od, struct hmap *lflows)
                   REGBIT_CONNTRACK_COMMIT" == 1", "ct_commit; next;");
     ovn_lflow_add(lflows, od, S_SWITCH_OUT_STATEFUL, 100,
                   REGBIT_CONNTRACK_COMMIT" == 1", "ct_commit; next;");
+
+    /* If REGBIT_CONNTRACK_NAT is set as 1, then packets should just be sent
+     * through nat (without committing).
+     *
+     * REGBIT_CONNTRACK_COMMIT is set for new connections and
+     * REGBIT_CONNTRACK_NAT is set for established connections. So they
+     * don't overlap.
+     */
+    ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL, 100,
+                  REGBIT_CONNTRACK_NAT" == 1", "ct_lb;");
+    ovn_lflow_add(lflows, od, S_SWITCH_OUT_STATEFUL, 100,
+                  REGBIT_CONNTRACK_NAT" == 1", "ct_lb;");
+
+    if (od->nbs->load_balancer) {
+        struct nbrec_load_balancer *lb = od->nbs->load_balancer;
+        struct smap *vips = &lb->vips;
+        struct smap_node *node;
+
+        SMAP_FOR_EACH (node, vips) {
+            uint16_t port = 0;
+
+            /* node->key contains IP:port or just IP. */
+            char *ip_address = NULL;
+            ip_address_and_port_from_lb_key(node->key, &ip_address, &port);
+            if (!ip_address) {
+                continue;
+            }
+
+            /* New connections in Ingress table. */
+            char *action = xasprintf("ct_lb(\"%s\");", node->value);
+            struct ds match = DS_EMPTY_INITIALIZER;
+            ds_put_format(&match, "ct.new && ip && ip4.dst == %s", ip_address);
+            if (port) {
+                if (lb->protocol && !strcmp(lb->protocol, "udp")) {
+                    ds_put_format(&match, "&& udp && udp.dst == %d", port);
+                } else {
+                    ds_put_format(&match, "&& tcp && tcp.dst == %d", port);
+                }
+                ovn_lflow_add(lflows, od, S_SWITCH_IN_LB,
+                              120, ds_cstr(&match), action);
+            } else {
+                ovn_lflow_add(lflows, od, S_SWITCH_IN_LB,
+                              110, ds_cstr(&match), action);
+            }
+
+            ds_destroy(&match);
+            free(action);
+       }
+    }
 }
 
 static void
@@ -1552,8 +1729,10 @@  build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
         }
 
         build_pre_acls(od, lflows, ports);
+        build_pre_lb(od, lflows);
         build_pre_stateful(od, lflows);
         build_acls(od, lflows);
+        build_lb(od, lflows);
         build_stateful(od, lflows);
     }