[ovs-dev,v6] conntrack: Optimize recirculations.
diff mbox series

Message ID 1566835544-72391-1-git-send-email-dlu998@gmail.com
State New
Headers show
Series
  • [ovs-dev,v6] conntrack: Optimize recirculations.
Related show

Commit Message

Darrell Ball Aug. 26, 2019, 4:05 p.m. UTC
Cache the 'conn' context and use it when it is valid.  The cached 'conn'
context will get reset if it is not expected to be valid; the cost to do
this is negligible.  Besides being most optimal, this also handles corner
cases, such as decapsulation leading to the same tuple, as in tunnel VPN
cases.  A negative test is added to check the resetting of the cached
'conn'.

Signed-off-by: Darrell Ball <dlu998@gmail.com>
---

v6: a/ Added 'conn' reset for mpls push case to force 'invalid', for
       consistency reasons.
    b/ Add missed lock around 'conn->mark' and 'conn->label' access in
       'process_one_fast()'.
    c/ Use opaque vs void 'conn' ptr (Ben).
    d/ Remove inline specifier on 2 static functions in source file. (Ben).
    e/ Remove 3 'continue' instructions in lieu of 'else' clause. (Ben).

v5: Check for alg ctl on recirculation to handle potential corner case.
    Remove unnecessary 'commit' filter.

v4: Reset 'conn' cache context automatically when tuple changes, rather than
    needing an explicit 'ct_clear' action.  Need to check if all cases are
    handled.
v3: Remove unneeded 'NAT' field added to 'struct conn_lookup_ctx'.

 lib/conntrack.c         | 60 ++++++++++++++++++++++++++++++++++++++++++----
 lib/netdev.c            |  1 +
 lib/packets.c           |  9 +++++++
 lib/packets.h           | 11 +++++++++
 tests/system-traffic.at | 63 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 139 insertions(+), 5 deletions(-)

Comments

Ben Pfaff Sept. 25, 2019, 3:58 p.m. UTC | #1
On Mon, Aug 26, 2019 at 09:05:44AM -0700, Darrell Ball wrote:
> Cache the 'conn' context and use it when it is valid.  The cached 'conn'
> context will get reset if it is not expected to be valid; the cost to do
> this is negligible.  Besides being most optimal, this also handles corner
> cases, such as decapsulation leading to the same tuple, as in tunnel VPN
> cases.  A negative test is added to check the resetting of the cached
> 'conn'.
> 
> Signed-off-by: Darrell Ball <dlu998@gmail.com>

Applied to master.  Thank you!
Darrell Ball Sept. 25, 2019, 8:22 p.m. UTC | #2
On Wed, Sep 25, 2019 at 10:51 AM Ben Pfaff <blp@ovn.org> wrote:

> On Mon, Aug 26, 2019 at 09:05:44AM -0700, Darrell Ball wrote:
> > Cache the 'conn' context and use it when it is valid.  The cached 'conn'
> > context will get reset if it is not expected to be valid; the cost to do
> > this is negligible.  Besides being most optimal, this also handles corner
> > cases, such as decapsulation leading to the same tuple, as in tunnel VPN
> > cases.  A negative test is added to check the resetting of the cached
> > 'conn'.
> >
> > Signed-off-by: Darrell Ball <dlu998@gmail.com>
>
> Applied to master.  Thank you!
>

cool, thank you

Patch
diff mbox series

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 0125bb4..ecf3bcc 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -1086,6 +1086,46 @@  conn_update_state_alg(struct conntrack *ct, struct dp_packet *pkt,
 }
 
 static void
+set_cached_conn(const struct nat_action_info_t *nat_action_info,
+                const struct conn_lookup_ctx *ctx, struct conn *conn,
+                struct dp_packet *pkt)
+{
+    if (OVS_LIKELY(!nat_action_info)) {
+        pkt->md.conn = conn;
+        pkt->md.reply = ctx->reply;
+        pkt->md.icmp_related = ctx->icmp_related;
+    } else {
+        pkt->md.conn = NULL;
+    }
+}
+
+static void
+process_one_fast(uint16_t zone, const uint32_t *setmark,
+                 const struct ovs_key_ct_labels *setlabel,
+                 const struct nat_action_info_t *nat_action_info,
+                 struct conn *conn, struct dp_packet *pkt)
+{
+    if (nat_action_info) {
+        handle_nat(pkt, conn, zone, pkt->md.reply, pkt->md.icmp_related);
+        pkt->md.conn = NULL;
+    }
+
+    pkt->md.ct_zone = zone;
+    ovs_mutex_lock(&conn->lock);
+    pkt->md.ct_mark = conn->mark;
+    pkt->md.ct_label = conn->label;
+    ovs_mutex_unlock(&conn->lock);
+
+    if (setmark) {
+        set_mark(pkt, conn, setmark[0], setmark[1]);
+    }
+
+    if (setlabel) {
+        set_label(pkt, conn, &setlabel[0], &setlabel[1]);
+    }
+}
+
+static void
 process_one(struct conntrack *ct, struct dp_packet *pkt,
             struct conn_lookup_ctx *ctx, uint16_t zone,
             bool force, bool commit, long long now, const uint32_t *setmark,
@@ -1185,6 +1225,8 @@  process_one(struct conntrack *ct, struct dp_packet *pkt,
     }
 
     handle_alg_ctl(ct, ctx, pkt, ct_alg_ctl, conn, now, !!nat_action_info);
+
+    set_cached_conn(nat_action_info, ctx, conn, pkt);
 }
 
 /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
@@ -1212,14 +1254,21 @@  conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
     struct conn_lookup_ctx ctx;
 
     DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
-        if (packet->md.ct_state == CS_INVALID
-            || !conn_key_extract(ct, packet, dl_type, &ctx, zone)) {
+        struct conn *conn = packet->md.conn;
+        if (OVS_UNLIKELY(packet->md.ct_state == CS_INVALID)) {
+            write_ct_md(packet, zone, NULL, NULL, NULL);
+        } else if (conn && conn->key.zone == zone && !force
+                   && !get_alg_ctl_type(packet, tp_src, tp_dst, helper)) {
+            process_one_fast(zone, setmark, setlabel, nat_action_info,
+                             conn, packet);
+        } else if (OVS_UNLIKELY(!conn_key_extract(ct, packet, dl_type, &ctx,
+                                zone))) {
             packet->md.ct_state = CS_INVALID;
             write_ct_md(packet, zone, NULL, NULL, NULL);
-            continue;
+        } else {
+            process_one(ct, packet, &ctx, zone, force, commit, now, setmark,
+                        setlabel, nat_action_info, tp_src, tp_dst, helper);
         }
-        process_one(ct, packet, &ctx, zone, force, commit, now, setmark,
-                    setlabel, nat_action_info, tp_src, tp_dst, helper);
     }
 
     ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);
@@ -1233,6 +1282,7 @@  conntrack_clear(struct dp_packet *packet)
     /* According to pkt_metadata_init(), ct_state == 0 is enough to make all of
      * the conntrack fields invalid. */
     packet->md.ct_state = 0;
+    pkt_metadata_init_conn(&packet->md);
 }
 
 static void
diff --git a/lib/netdev.c b/lib/netdev.c
index b1976d3..af8f856 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -837,6 +837,7 @@  netdev_pop_header(struct netdev *netdev, struct dp_packet_batch *batch)
              * interpretation in the further packet processing when
              * recirculated.*/
             dp_packet_reset_offload(packet);
+            pkt_metadata_init_conn(&packet->md);
             dp_packet_batch_refill(batch, packet, i);
         }
     }
diff --git a/lib/packets.c b/lib/packets.c
index ab0b1a3..f59ce39 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -391,6 +391,8 @@  push_mpls(struct dp_packet *packet, ovs_be16 ethtype, ovs_be32 lse)
     header = dp_packet_resize_l2_5(packet, MPLS_HLEN);
     memmove(header, header + MPLS_HLEN, len);
     memcpy(header + len, &lse, sizeof lse);
+
+    pkt_metadata_init_conn(&packet->md);
 }
 
 /* If 'packet' is an MPLS packet, removes its outermost MPLS label stack entry.
@@ -979,6 +981,8 @@  packet_set_ipv4_addr(struct dp_packet *packet,
     ovs_be32 old_addr = get_16aligned_be32(addr);
     size_t l4_size = dp_packet_l4_size(packet);
 
+    pkt_metadata_init_conn(&packet->md);
+
     if (nh->ip_proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) {
         struct tcp_header *th = dp_packet_l4(packet);
 
@@ -1118,6 +1122,7 @@  packet_set_ipv6_addr(struct dp_packet *packet, uint8_t proto,
         packet_update_csum128(packet, proto, addr, new_addr);
     }
     memcpy(addr, new_addr, sizeof(ovs_be32[4]));
+    pkt_metadata_init_conn(&packet->md);
 }
 
 static void
@@ -1219,6 +1224,7 @@  packet_set_tcp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst)
 
     packet_set_port(&th->tcp_src, src, &th->tcp_csum);
     packet_set_port(&th->tcp_dst, dst, &th->tcp_csum);
+    pkt_metadata_init_conn(&packet->md);
 }
 
 /* Sets the UDP source and destination port ('src' and 'dst' respectively) of
@@ -1240,6 +1246,7 @@  packet_set_udp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst)
         uh->udp_src = src;
         uh->udp_dst = dst;
     }
+    pkt_metadata_init_conn(&packet->md);
 }
 
 /* Sets the SCTP source and destination port ('src' and 'dst' respectively) of
@@ -1261,6 +1268,7 @@  packet_set_sctp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst)
 
     new_csum = crc32c((void *)sh, tp_len);
     put_16aligned_be32(&sh->sctp_csum, old_csum ^ old_correct_csum ^ new_csum);
+    pkt_metadata_init_conn(&packet->md);
 }
 
 /* Sets the ICMP type and code of the ICMP header contained in 'packet'.
@@ -1279,6 +1287,7 @@  packet_set_icmp(struct dp_packet *packet, uint8_t type, uint8_t code)
 
         ih->icmp_csum = recalc_csum16(ih->icmp_csum, orig_tc, new_tc);
     }
+    pkt_metadata_init_conn(&packet->md);
 }
 
 /* Sets the IGMP type to IGMP_HOST_MEMBERSHIP_QUERY and populates the
diff --git a/lib/packets.h b/lib/packets.h
index a4bee38..0b3e3a2 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -35,6 +35,7 @@ 
 #include "timeval.h"
 
 struct dp_packet;
+struct conn;
 struct ds;
 
 /* Purely internal to OVS userspace. These flags should never be exposed to
@@ -108,6 +109,9 @@  PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
     uint32_t ct_mark;           /* Connection mark. */
     ovs_u128 ct_label;          /* Connection label. */
     union flow_in_port in_port; /* Input port. */
+    struct conn *conn;          /* Cached conntrack connection. */
+    bool reply;                 /* True if reply direction. */
+    bool icmp_related;          /* True if ICMP related. */
 );
 
 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
@@ -140,6 +144,12 @@  pkt_metadata_init_tnl(struct pkt_metadata *md)
 }
 
 static inline void
+pkt_metadata_init_conn(struct pkt_metadata *md)
+{
+    md->conn = NULL;
+}
+
+static inline void
 pkt_metadata_init(struct pkt_metadata *md, odp_port_t port)
 {
     /* This is called for every packet in userspace datapath and affects
@@ -157,6 +167,7 @@  pkt_metadata_init(struct pkt_metadata *md, odp_port_t port)
     md->tunnel.ip_dst = 0;
     md->tunnel.ipv6_dst = in6addr_any;
     md->in_port.odp_port = port;
+    md->conn = NULL;
 }
 
 /* This function prefetches the cachelines touched by pkt_metadata_init()
diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 1a04199..c2276f7 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -5618,6 +5618,69 @@  grep "tcp,orig=(src=10.254.254.2,dst=10.1.1.1,sport=<cleared>,dport=<cleared>),r
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([conntrack - negative test for recirculation optimization])
+dnl This test will fail if 'conn' caching is being used, because the tuple
+dnl has been changed outside of conntrack.
+AT_SKIP_IF([test $HAVE_NC = no])
+CHECK_CONNTRACK()
+OVS_TRAFFIC_VSWITCHD_START()
+OVS_CHECK_CT_CLEAR()
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", "f0:00:00:01:01:01") dnl FIP 10.254.254.1
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", "f0:00:00:01:01:02") dnl FIP 10.254.254.2
+
+dnl Static ARPs
+NS_CHECK_EXEC([at_ns0], [ip neigh add 10.1.1.2 lladdr f0:00:00:01:01:02 dev p0])
+NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 lladdr f0:00:00:01:01:01 dev p1])
+
+dnl Static ARP and route entries for the FIP "gateway"
+NS_CHECK_EXEC([at_ns0], [ip neigh add 10.1.1.254 lladdr f0:00:00:01:01:FE dev p0])
+NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.254 lladdr f0:00:00:01:01:FE dev p1])
+NS_CHECK_EXEC([at_ns0], [ip route add default nexthop via 10.1.1.254])
+NS_CHECK_EXEC([at_ns1], [ip route add default nexthop via 10.1.1.254])
+
+NETNS_DAEMONIZE([at_ns0], [nc -l -k 1234 > /dev/null], [nc0.pid])
+
+AT_DATA([flows.txt], [dnl
+table=0,priority=10  ip action=ct(table=1)
+dnl dst FIP
+table=1,priority=20  ip,ct_state=+trk+est,nw_dst=10.254.254.0/24 action=goto_table:2
+table=1,priority=20  ip,ct_state=+trk+new,nw_dst=10.254.254.0/24 action=ct(commit,exec(set_field:1->ct_mark),table=2)
+dnl
+dnl FIP translation (dst FIP, src local) --> (dst local, src FIP)
+table=2             ip,nw_dst=10.254.254.1 action=set_field:10.1.1.1->nw_dst,goto_table:3
+table=2             ip,nw_dst=10.254.254.2 action=set_field:10.1.1.2->nw_dst,goto_table:3
+table=3             ip,nw_src=10.1.1.1 action=set_field:10.254.254.1->nw_src,goto_table:4
+table=3             ip,nw_src=10.1.1.2 action=set_field:10.254.254.2->nw_src,goto_table:4
+table=4             ip,nw_dst=10.1.1.1 action=set_field:f0:00:00:01:01:01->eth_dst,goto_table:5
+table=4             ip,nw_dst=10.1.1.2 action=set_field:f0:00:00:01:01:02->eth_dst,goto_table:5
+table=5             ip,nw_src=10.254.254.0/24 action=set_field:f0:00:00:01:01:FE->eth_src,goto_table:6
+dnl
+dnl Tuple has been changed outside of conntrack
+table=6,priority=10 ip action=ct(table=7)
+dnl
+table=7             ip,ct_state=+trk+est action=goto_table:8
+table=7             ip,ct_mark=0x0,ct_state=+trk+new action=ct(commit,exec(set_field:2->ct_mark),table=8)
+dnl
+table=8             ip,nw_dst=10.1.1.1 action=output:ovs-p0
+table=8             ip,nw_dst=10.1.1.2 action=output:ovs-p1
+])
+
+AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
+
+NS_CHECK_EXEC([at_ns1], [echo "foobar" |nc $NC_EOF_OPT 10.254.254.1 1234])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.254.254)], [0], [dnl
+tcp,orig=(src=10.1.1.2,dst=10.254.254.1,sport=<cleared>,dport=<cleared>),reply=(src=10.254.254.1,dst=10.1.1.2,sport=<cleared>,dport=<cleared>),mark=1,protoinfo=(state=<cleared>)
+tcp,orig=(src=10.254.254.2,dst=10.1.1.1,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.1,dst=10.254.254.2,sport=<cleared>,dport=<cleared>),mark=2,protoinfo=(state=<cleared>)
+])
+
+ovs-appctl dpif/dump-flows br0
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_BANNER([802.1ad])
 
 AT_SETUP([802.1ad - vlan_limit])