@@ -25,6 +25,7 @@ License, version 2.
The following files are licensed under the 2-clause BSD license.
include/windows/getopt.h
lib/getopt_long.c
+ lib/conntrack-tcp.c
The following files are licensed under the 3-clause BSD-license
include/windows/netinet/icmp6.h
@@ -21,6 +21,9 @@ Upstream Copyright Holders:
Copyright (c) 2014 Michael Chapman
Copyright (c) 2014 WindRiver, Inc.
Copyright (c) 2014 Avaya, Inc.
+ Copyright (c) 2001 Daniel Hartmeier
+ Copyright (c) 2002 - 2008 Henning Brauer
+ Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
License:
@@ -90,6 +93,7 @@ License:
lib/getopt_long.c
include/windows/getopt.h
datapath-windows/ovsext/Conntrack-tcp.c
+ lib/conntrack-tcp.c
* The following files are licensed under the 3-clause BSD-license
@@ -108,6 +108,10 @@ static const ovs_u128 OVS_U128_MAX = { { UINT32_MAX, UINT32_MAX,
UINT32_MAX, UINT32_MAX } };
static const ovs_be128 OVS_BE128_MAX OVS_UNUSED = { { OVS_BE32_MAX, OVS_BE32_MAX,
OVS_BE32_MAX, OVS_BE32_MAX } };
+static const ovs_u128 OVS_U128_MIN OVS_UNUSED = { {0, 0, 0, 0} };
+static const ovs_be128 OVS_BE128_MIN OVS_UNUSED = { {0, 0, 0, 0} };
+
+#define OVS_U128_ZERO OVS_U128_MIN
/* A 64-bit value, in network byte order, that is only aligned on a 32-bit
* boundary. */
@@ -49,6 +49,11 @@ lib_libopenvswitch_la_SOURCES = \
lib/compiler.h \
lib/connectivity.c \
lib/connectivity.h \
+ lib/conntrack-private.h \
+ lib/conntrack-tcp.c \
+ lib/conntrack-other.c \
+ lib/conntrack.c \
+ lib/conntrack.h \
lib/coverage.c \
lib/coverage.h \
lib/crc32c.c \
new file mode 100644
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "conntrack-private.h"
+#include "dp-packet.h"
+
+enum other_state {
+ OTHERS_FIRST,
+ OTHERS_MULTIPLE,
+ OTHERS_BIDIR,
+};
+
+struct conn_other {
+ struct conn up;
+ enum other_state state;
+};
+
+static const enum ct_timeout other_timeouts[] = {
+ [OTHERS_FIRST] = CT_TM_OTHER_FIRST,
+ [OTHERS_MULTIPLE] = CT_TM_OTHER_MULTIPLE,
+ [OTHERS_BIDIR] = CT_TM_OTHER_BIDIR,
+};
+
+static struct conn_other *
+conn_other_cast(const struct conn *conn)
+{
+ return CONTAINER_OF(conn, struct conn_other, up);
+}
+
+static enum ct_update_res
+other_conn_update(struct conn *conn_, struct dp_packet *pkt OVS_UNUSED,
+ bool reply, long long now)
+{
+ struct conn_other *conn = conn_other_cast(conn_);
+
+ if (reply && conn->state != OTHERS_BIDIR) {
+ conn->state = OTHERS_BIDIR;
+ } else if (conn->state == OTHERS_FIRST) {
+ conn->state = OTHERS_MULTIPLE;
+ }
+
+ update_expiration(conn_, other_timeouts[conn->state], now);
+
+ return CT_UPDATE_VALID;
+}
+
+static bool
+other_valid_new(struct dp_packet *pkt OVS_UNUSED)
+{
+ return true;
+}
+
+static struct conn *
+other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)
+{
+ struct conn_other *conn;
+
+ conn = xzalloc(sizeof *conn);
+ conn->state = OTHERS_FIRST;
+
+ update_expiration(&conn->up, other_timeouts[conn->state], now);
+
+ return &conn->up;
+}
+
+struct ct_l4_proto ct_proto_other = {
+ .new_conn = other_new_conn,
+ .valid_new = other_valid_new,
+ .conn_update = other_conn_update,
+};
new file mode 100644
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CONNTRACK_PRIVATE_H
+#define CONNTRACK_PRIVATE_H 1
+
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+
+#include "conntrack.h"
+#include "openvswitch/hmap.h"
+#include "openvswitch/list.h"
+#include "openvswitch/types.h"
+#include "packets.h"
+#include "unaligned.h"
+
+struct ct_addr {
+ union {
+ ovs_16aligned_be32 ipv4;
+ union ovs_16aligned_in6_addr ipv6;
+ ovs_be32 ipv4_aligned;
+ struct in6_addr ipv6_aligned;
+ };
+};
+
+struct ct_endpoint {
+ struct ct_addr addr;
+ ovs_be16 port;
+};
+
+/* Changes to this structure need to be reflected in conn_key_hash() */
+struct conn_key {
+ struct ct_endpoint src;
+ struct ct_endpoint dst;
+
+ ovs_be16 dl_type;
+ uint8_t nw_proto;
+ uint16_t zone;
+};
+
+struct conn {
+ struct conn_key key;
+ struct conn_key rev_key;
+ long long expiration;
+ struct ovs_list exp_node;
+ struct hmap_node node;
+ uint32_t mark;
+ ovs_u128 label;
+};
+
+enum ct_update_res {
+ CT_UPDATE_INVALID,
+ CT_UPDATE_VALID,
+ CT_UPDATE_NEW,
+};
+
+struct ct_l4_proto {
+ struct conn *(*new_conn)(struct dp_packet *pkt, long long now);
+ bool (*valid_new)(struct dp_packet *pkt);
+ enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet *pkt,
+ bool reply, long long now);
+};
+
+extern struct ct_l4_proto ct_proto_tcp;
+extern struct ct_l4_proto ct_proto_other;
+
+extern long long ct_timeout_val[];
+
+static inline void
+update_expiration(struct conn *conn, enum ct_timeout tm, long long now)
+{
+ conn->expiration = now + ct_timeout_val[tm];
+}
+
+#endif /* conntrack-private.h */
new file mode 100644
@@ -0,0 +1,462 @@
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
+ */
+
+#include <config.h>
+
+#include "conntrack-private.h"
+#include "ct-dpif.h"
+#include "dp-packet.h"
+#include "util.h"
+
+struct tcp_peer {
+ enum ct_dpif_tcp_state state;
+ uint32_t seqlo; /* Max sequence number sent */
+ uint32_t seqhi; /* Max the other end ACKd + win */
+ uint16_t max_win; /* largest window (pre scaling) */
+ uint8_t wscale; /* window scaling factor */
+};
+
+struct conn_tcp {
+ struct conn up;
+ struct tcp_peer peer[2];
+};
+
+enum {
+ TCPOPT_EOL,
+ TCPOPT_NOP,
+ TCPOPT_WINDOW = 3,
+};
+
+/* TCP sequence numbers are 32 bit integers operated
+ * on with modular arithmetic. These macros can be
+ * used to compare such integers. */
+#define SEQ_LT(a,b) INT_MOD_LT(a, b)
+#define SEQ_LEQ(a,b) INT_MOD_LEQ(a, b)
+#define SEQ_GT(a,b) INT_MOD_GT(a, b)
+#define SEQ_GEQ(a,b) INT_MOD_GEQ(a, b)
+
+#define SEQ_MIN(a, b) INT_MOD_MIN(a, b)
+#define SEQ_MAX(a, b) INT_MOD_MAX(a, b)
+
+static struct conn_tcp*
+conn_tcp_cast(const struct conn* conn)
+{
+ return CONTAINER_OF(conn, struct conn_tcp, up);
+}
+
+/* pf does this in pf_normalize_tcp(), and it is called only if scrub
+ * is enabled. We're not scrubbing, but this check seems reasonable. */
+static bool
+tcp_invalid_flags(uint16_t flags)
+{
+
+ if (flags & TCP_SYN) {
+ if (flags & TCP_RST || flags & TCP_FIN) {
+ return true;
+ }
+ } else {
+ /* Illegal packet */
+ if (!(flags & (TCP_ACK|TCP_RST))) {
+ return true;
+ }
+ }
+
+ if (!(flags & TCP_ACK)) {
+ /* These flags are only valid if ACK is set */
+ if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+#define TCP_MAX_WSCALE 14
+#define CT_WSCALE_FLAG 0x80
+#define CT_WSCALE_UNKNOWN 0x40
+#define CT_WSCALE_MASK 0xf
+
+static uint8_t
+tcp_get_wscale(const struct tcp_header *tcp)
+{
+ int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
+ const uint8_t *opt = (const uint8_t *)(tcp + 1);
+ uint8_t wscale = 0;
+ uint8_t optlen;
+
+ while (len >= 3) {
+ switch (*opt) {
+ case TCPOPT_EOL:
+ return wscale;
+ case TCPOPT_NOP:
+ opt++;
+ len--;
+ break;
+ case TCPOPT_WINDOW:
+ wscale = MIN(opt[2], TCP_MAX_WSCALE);
+ wscale |= CT_WSCALE_FLAG;
+ /* fall through */
+ default:
+ optlen = opt[1];
+ if (optlen < 2) {
+ optlen = 2;
+ }
+ len -= optlen;
+ opt += optlen;
+ }
+ }
+
+ return wscale;
+}
+
+static uint32_t
+tcp_payload_length(struct dp_packet *pkt)
+{
+ return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)
+ - (char *) dp_packet_get_tcp_payload(pkt);
+}
+
+static enum ct_update_res
+tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply,
+ long long now)
+{
+ struct conn_tcp *conn = conn_tcp_cast(conn_);
+ struct tcp_header *tcp = dp_packet_l4(pkt);
+ /* The peer that sent 'pkt' */
+ struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
+ /* The peer that should receive 'pkt' */
+ struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
+ uint8_t sws = 0, dws = 0;
+ uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+ uint16_t win = ntohs(tcp->tcp_winsz);
+ uint32_t ack, end, seq, orig_seq;
+ uint32_t p_len = tcp_payload_length(pkt);
+ int ackskew;
+
+ if (tcp_invalid_flags(tcp_flags)) {
+ return CT_UPDATE_INVALID;
+ }
+
+ if (((tcp_flags & (TCP_SYN | TCP_ACK)) == TCP_SYN)
+ && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
+ && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+ src->state = dst->state = CT_DPIF_TCPS_CLOSED;
+ return CT_UPDATE_NEW;
+ }
+
+ if (src->wscale & CT_WSCALE_FLAG
+ && dst->wscale & CT_WSCALE_FLAG
+ && !(tcp_flags & TCP_SYN)) {
+
+ sws = src->wscale & CT_WSCALE_MASK;
+ dws = dst->wscale & CT_WSCALE_MASK;
+
+ } else if (src->wscale & CT_WSCALE_UNKNOWN
+ && dst->wscale & CT_WSCALE_UNKNOWN
+ && !(tcp_flags & TCP_SYN)) {
+
+ sws = TCP_MAX_WSCALE;
+ dws = TCP_MAX_WSCALE;
+ }
+
+ /*
+ * Sequence tracking algorithm from Guido van Rooij's paper:
+ * http://www.madison-gurkha.com/publications/tcp_filtering/
+ * tcp_filtering.ps
+ */
+
+ orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+ if (src->state < CT_DPIF_TCPS_SYN_SENT) {
+ /* First packet from this end. Set its state */
+
+ ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
+
+ end = seq + p_len;
+ if (tcp_flags & TCP_SYN) {
+ end++;
+ if (dst->wscale & CT_WSCALE_FLAG) {
+ src->wscale = tcp_get_wscale(tcp);
+ if (src->wscale & CT_WSCALE_FLAG) {
+ /* Remove scale factor from initial window */
+ sws = src->wscale & CT_WSCALE_MASK;
+ win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
+ dws = dst->wscale & CT_WSCALE_MASK;
+ } else {
+ /* fixup other window */
+ dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
+ /* in case of a retrans SYN|ACK */
+ dst->wscale = 0;
+ }
+ }
+ }
+ if (tcp_flags & TCP_FIN) {
+ end++;
+ }
+
+ src->seqlo = seq;
+ src->state = CT_DPIF_TCPS_SYN_SENT;
+ /*
+ * May need to slide the window (seqhi may have been set by
+ * the crappy stack check or if we picked up the connection
+ * after establishment)
+ */
+ if (src->seqhi == 1
+ || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
+ src->seqhi = end + MAX(1, dst->max_win << dws);
+ }
+ if (win > src->max_win) {
+ src->max_win = win;
+ }
+
+ } else {
+ ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
+ end = seq + p_len;
+ if (tcp_flags & TCP_SYN) {
+ end++;
+ }
+ if (tcp_flags & TCP_FIN) {
+ end++;
+ }
+ }
+
+ if ((tcp_flags & TCP_ACK) == 0) {
+ /* Let it pass through the ack skew check */
+ ack = dst->seqlo;
+ } else if ((ack == 0
+ && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
+ /* broken tcp stacks do not set ack */) {
+ /* Many stacks (ours included) will set the ACK number in an
+ * FIN|ACK if the SYN times out -- no sequence to ACK. */
+ ack = dst->seqlo;
+ }
+
+ if (seq == end) {
+ /* Ease sequencing restrictions on no data packets */
+ seq = src->seqlo;
+ end = seq;
+ }
+
+ ackskew = dst->seqlo - ack;
+#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
+ if (SEQ_GEQ(src->seqhi, end)
+ /* Last octet inside other's window space */
+ && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
+ /* Retrans: not more than one window back */
+ && (ackskew >= -MAXACKWINDOW)
+ /* Acking not more than one reassembled fragment backwards */
+ && (ackskew <= (MAXACKWINDOW << sws))
+ /* Acking not more than one window forward */
+ && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
+ || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {
+ /* Require an exact/+1 sequence match on resets when possible */
+
+ /* update max window */
+ if (src->max_win < win) {
+ src->max_win = win;
+ }
+ /* synchronize sequencing */
+ if (SEQ_GT(end, src->seqlo)) {
+ src->seqlo = end;
+ }
+ /* slide the window of what the other end can send */
+ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+ dst->seqhi = ack + MAX((win << sws), 1);
+ }
+
+ /* update states */
+ if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
+ src->state = CT_DPIF_TCPS_SYN_SENT;
+ }
+ if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
+ src->state = CT_DPIF_TCPS_CLOSING;
+ }
+ if (tcp_flags & TCP_ACK) {
+ if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
+ dst->state = CT_DPIF_TCPS_ESTABLISHED;
+ } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
+ dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
+ }
+ }
+ if (tcp_flags & TCP_RST) {
+ src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+ }
+
+ if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
+ && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+ update_expiration(conn_, CT_TM_TCP_CLOSED, now);
+ } else if (src->state >= CT_DPIF_TCPS_CLOSING
+ && dst->state >= CT_DPIF_TCPS_CLOSING) {
+ update_expiration(conn_, CT_TM_TCP_FIN_WAIT, now);
+ } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
+ || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
+            update_expiration(conn_, CT_TM_TCP_OPENING, now);
+ } else if (src->state >= CT_DPIF_TCPS_CLOSING
+ || dst->state >= CT_DPIF_TCPS_CLOSING) {
+            update_expiration(conn_, CT_TM_TCP_CLOSING, now);
+ } else {
+            update_expiration(conn_, CT_TM_TCP_ESTABLISHED, now);
+ }
+ } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
+ || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
+ || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
+ && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
+ /* Within a window forward of the originating packet */
+ && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
+ /* Within a window backward of the originating packet */
+
+ /*
+ * This currently handles three situations:
+ * 1) Stupid stacks will shotgun SYNs before their peer
+ * replies.
+ * 2) When PF catches an already established stream (the
+ * firewall rebooted, the state table was flushed, routes
+ * changed...)
+ * 3) Packets get funky immediately after the connection
+ * closes (this should catch Solaris spurious ACK|FINs
+ * that web servers like to spew after a close)
+ *
+ * This must be a little more careful than the above code
+ * since packet floods will also be caught here. We don't
+ * update the TTL here to mitigate the damage of a packet
+ * flood and so the same code can handle awkward establishment
+ * and a loosened connection close.
+ * In the establishment case, a correct peer response will
+ * validate the connection, go through the normal state code
+ * and keep updating the state TTL.
+ */
+
+ /* update max window */
+ if (src->max_win < win) {
+ src->max_win = win;
+ }
+ /* synchronize sequencing */
+ if (SEQ_GT(end, src->seqlo)) {
+ src->seqlo = end;
+ }
+ /* slide the window of what the other end can send */
+ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+ dst->seqhi = ack + MAX((win << sws), 1);
+ }
+
+ /*
+ * Cannot set dst->seqhi here since this could be a shotgunned
+ * SYN and not an already established connection.
+ */
+
+ if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
+ src->state = CT_DPIF_TCPS_CLOSING;
+ }
+
+ if (tcp_flags & TCP_RST) {
+ src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+ }
+ } else {
+ return CT_UPDATE_INVALID;
+ }
+
+ return CT_UPDATE_VALID;
+}
+
+static bool
+tcp_valid_new(struct dp_packet *pkt)
+{
+ struct tcp_header *tcp = dp_packet_l4(pkt);
+ uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+ if (tcp_invalid_flags(tcp_flags)) {
+ return false;
+ }
+
+ /* A syn+ack is not allowed to create a connection. We want to allow
+ * totally new connections (syn) or already established, not partially
+ * open (syn+ack). */
+ if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
+ return false;
+ }
+
+ return true;
+}
+
+static struct conn *
+tcp_new_conn(struct dp_packet *pkt, long long now)
+{
+ struct conn_tcp* newconn = NULL;
+ struct tcp_header *tcp = dp_packet_l4(pkt);
+ struct tcp_peer *src, *dst;
+ uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+ newconn = xzalloc(sizeof *newconn);
+
+ src = &newconn->peer[0];
+ dst = &newconn->peer[1];
+
+ src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+ src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;
+
+ if (tcp_flags & TCP_SYN) {
+ src->seqhi++;
+ src->wscale = tcp_get_wscale(tcp);
+ } else {
+ src->wscale = CT_WSCALE_UNKNOWN;
+ dst->wscale = CT_WSCALE_UNKNOWN;
+ }
+ src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
+ if (src->wscale & CT_WSCALE_MASK) {
+ /* Remove scale factor from initial window */
+ uint8_t sws = src->wscale & CT_WSCALE_MASK;
+ src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
+ }
+ if (tcp_flags & TCP_FIN) {
+ src->seqhi++;
+ }
+ dst->seqhi = 1;
+ dst->max_win = 1;
+ src->state = CT_DPIF_TCPS_SYN_SENT;
+ dst->state = CT_DPIF_TCPS_CLOSED;
+
+    update_expiration(&newconn->up, CT_TM_TCP_FIRST_PACKET, now);
+
+ return &newconn->up;
+}
+
+struct ct_l4_proto ct_proto_tcp = {
+ .new_conn = tcp_new_conn,
+ .valid_new = tcp_valid_new,
+ .conn_update = tcp_conn_update,
+};
new file mode 100644
@@ -0,0 +1,890 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "conntrack.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/icmp6.h>
+
+#include "bitmap.h"
+#include "conntrack-private.h"
+#include "coverage.h"
+#include "csum.h"
+#include "dp-packet.h"
+#include "flow.h"
+#include "netdev.h"
+#include "odp-netlink.h"
+#include "openvswitch/hmap.h"
+#include "openvswitch/vlog.h"
+#include "ovs-rcu.h"
+#include "random.h"
+#include "timeval.h"
+
+VLOG_DEFINE_THIS_MODULE(conntrack);
+
+COVERAGE_DEFINE(conntrack_full);
+
+struct conn_lookup_ctx {
+ struct conn_key key;
+ struct conn *conn;
+ uint32_t hash;
+ bool reply;
+ bool related;
+};
+
+static bool conn_key_extract(struct conntrack *, struct dp_packet *,
+ struct conn_lookup_ctx *, uint16_t zone);
+static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
+static void conn_key_reverse(struct conn_key *);
+static void conn_key_lookup(struct conntrack_bucket *ctb,
+ struct conn_lookup_ctx *ctx,
+ long long now);
+static bool valid_new(struct dp_packet *pkt, struct conn_key *);
+static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *,
+ long long now);
+static void delete_conn(struct conn *);
+static enum ct_update_res conn_update(struct conn *, struct dp_packet*,
+ bool reply, long long now);
+static bool conn_expired(struct conn *, long long now);
+static void set_mark(struct dp_packet *, struct conn *,
+ uint32_t val, uint32_t mask);
+static void set_label(struct dp_packet *, struct conn *,
+ const struct ovs_key_ct_labels *val,
+ const struct ovs_key_ct_labels *mask);
+
+static struct ct_l4_proto *l4_protos[] = {
+ [IPPROTO_TCP] = &ct_proto_tcp,
+ [IPPROTO_UDP] = &ct_proto_other,
+ [IPPROTO_ICMP] = &ct_proto_other,
+ [IPPROTO_ICMPV6] = &ct_proto_other,
+};
+
+long long ct_timeout_val[] = {
+#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
+ CT_TIMEOUTS
+#undef CT_TIMEOUT
+};
+
+/* If the total number of connections goes above this value, no new connections
+ * are accepted */
+#define DEFAULT_N_CONN_LIMIT 3000000
+
+/* Initializes the connection tracker 'ct'. The caller is responsible for
+ * calling 'conntrack_destroy()', when the instance is not needed anymore */
+void
+conntrack_init(struct conntrack *ct)
+{
+ unsigned i;
+
+ for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+ struct conntrack_bucket *ctb = &ct->buckets[i];
+
+ ct_lock_init(&ctb->lock);
+ ct_lock_lock(&ctb->lock);
+ hmap_init(&ctb->connections);
+ ct_lock_unlock(&ctb->lock);
+ }
+ ct->hash_basis = random_uint32();
+ atomic_count_init(&ct->n_conn, 0);
+ atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
+}
+
+/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
+void
+conntrack_destroy(struct conntrack *ct)
+{
+ unsigned i;
+
+ for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+ struct conntrack_bucket *ctb = &ct->buckets[i];
+ struct conn *conn;
+
+ ct_lock_lock(&ctb->lock);
+ HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
+ atomic_count_dec(&ct->n_conn);
+ delete_conn(conn);
+ }
+ hmap_destroy(&ctb->connections);
+ ct_lock_unlock(&ctb->lock);
+ ct_lock_destroy(&ctb->lock);
+ }
+}
+
+static unsigned hash_to_bucket(uint32_t hash)
+{
+ /* Extracts the most significant bits in hash. The least significant bits
+ * are already used internally by the hmap implementation. */
+ BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
+
+ return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
+}
+
+static void
+write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
+ uint32_t mark, ovs_u128 label)
+{
+ pkt->md.ct_state = state | CS_TRACKED;
+ pkt->md.ct_zone = zone;
+ pkt->md.ct_mark = mark;
+ pkt->md.ct_label = label;
+}
+
+static struct conn *
+conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
+ struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
+ long long now)
+{
+ unsigned bucket = hash_to_bucket(ctx->hash);
+ struct conn *nc = NULL;
+
+ if (!valid_new(pkt, &ctx->key)) {
+ *state |= CS_INVALID;
+ return nc;
+ }
+
+ *state |= CS_NEW;
+
+ if (commit) {
+ unsigned int n_conn_limit;
+
+ atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
+
+ if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
+ COVERAGE_INC(conntrack_full);
+ return nc;
+ }
+
+ nc = new_conn(pkt, &ctx->key, now);
+
+ memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
+
+ conn_key_reverse(&nc->rev_key);
+ hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
+ atomic_count_inc(&ct->n_conn);
+ }
+
+ return nc;
+}
+
+static struct conn *
+process_one(struct conntrack *ct, struct dp_packet *pkt,
+ struct conn_lookup_ctx *ctx, uint16_t zone,
+ bool commit, long long now)
+{
+ unsigned bucket = hash_to_bucket(ctx->hash);
+ struct conn *conn = ctx->conn;
+ uint16_t state = 0;
+
+ if (conn) {
+ if (ctx->related) {
+ state |= CS_RELATED;
+ if (ctx->reply) {
+ state |= CS_REPLY_DIR;
+ }
+ } else {
+ enum ct_update_res res;
+
+ res = conn_update(conn, pkt, ctx->reply, now);
+
+ switch (res) {
+ case CT_UPDATE_VALID:
+ state |= CS_ESTABLISHED;
+ if (ctx->reply) {
+ state |= CS_REPLY_DIR;
+ }
+ break;
+ case CT_UPDATE_INVALID:
+ state |= CS_INVALID;
+ break;
+ case CT_UPDATE_NEW:
+ hmap_remove(&ct->buckets[bucket].connections, &conn->node);
+ atomic_count_dec(&ct->n_conn);
+ delete_conn(conn);
+ conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
+ break;
+ default:
+ OVS_NOT_REACHED();
+ }
+ }
+ } else {
+ conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
+ }
+
+ write_ct_md(pkt, state, zone, conn ? conn->mark : 0,
+ conn ? conn->label : OVS_U128_ZERO);
+
+ return conn;
+}
+
+/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
+ * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
+ * the l3 and l4 offset properly set.
+ *
+ * If 'commit' is true, the packets are allowed to create new entries in the
+ * connection tables. 'setmark', if not NULL, should point to a two
+ * elements array containing a value and a mask to set the connection mark.
+ * 'setlabel' behaves similarly for the connection label.*/
+int
+conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
+ bool commit, uint16_t zone, const uint32_t *setmark,
+ const struct ovs_key_ct_labels *setlabel,
+ const char *helper)
+{
+ struct dp_packet **pkts = pkt_batch->packets;
+ size_t cnt = pkt_batch->count;
+#if !defined(__CHECKER__) && !defined(_WIN32)
+ const size_t KEY_ARRAY_SIZE = cnt;
+#else
+ enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
+#endif
+ struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
+ int8_t bucket_list[CONNTRACK_BUCKETS];
+ struct {
+ unsigned bucket;
+ unsigned long maps;
+ } arr[KEY_ARRAY_SIZE];
+ long long now = time_msec();
+ size_t i = 0;
+ uint8_t arrcnt = 0;
+
+ BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);
+
+ if (helper) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+ VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
+ /* Continue without the helper */
+ }
+
+ memset(bucket_list, INT8_C(-1), sizeof bucket_list);
+ for (i = 0; i < cnt; i++) {
+ unsigned bucket;
+
+ if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {
+ write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
+ continue;
+ }
+
+ bucket = hash_to_bucket(ctxs[i].hash);
+ if (bucket_list[bucket] == INT8_C(-1)) {
+ bucket_list[bucket] = arrcnt;
+
+ arr[arrcnt].maps = 0;
+ ULLONG_SET1(arr[arrcnt].maps, i);
+ arr[arrcnt++].bucket = bucket;
+ } else {
+            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
+ }
+ }
+
+ for (i = 0; i < arrcnt; i++) {
+ struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
+ size_t j;
+
+ ct_lock_lock(&ctb->lock);
+
+ ULLONG_FOR_EACH_1(j, arr[i].maps) {
+ struct conn *conn;
+
+ conn_key_lookup(ctb, &ctxs[j], now);
+
+ conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);
+
+ if (conn && setmark) {
+ set_mark(pkts[j], conn, setmark[0], setmark[1]);
+ }
+
+ if (conn && setlabel) {
+ set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
+ }
+ }
+ ct_lock_unlock(&ctb->lock);
+ }
+
+ return 0;
+}
+
+static void
+set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
+{
+ pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
+ conn->mark = pkt->md.ct_mark;
+}
+
+static void
+set_label(struct dp_packet *pkt, struct conn *conn,
+ const struct ovs_key_ct_labels *val,
+ const struct ovs_key_ct_labels *mask)
+{
+ ovs_u128 v, m;
+
+ memcpy(&v, val, sizeof v);
+ memcpy(&m, mask, sizeof m);
+
+ pkt->md.ct_label.u64.lo = v.u64.lo
+ | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
+ pkt->md.ct_label.u64.hi = v.u64.hi
+ | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
+ conn->label = pkt->md.ct_label;
+}
+
+/* Key extraction */
+
+/* The function stores a pointer to the first byte after the header in
+ * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
+ * not interested in the header's tail, meaning that the header has
+ * already been parsed (e.g. by flow_extract): we take this as a hint to
+ * save a few checks. If 'validate_checksum' is true, the function returns
+ * false if the IPv4 checksum is invalid. */
+static inline bool
+extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
+ const char **new_data, bool validate_checksum)
+{
+ const struct ip_header *ip = data;
+ size_t ip_len;
+
+ if (new_data) {
+ if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
+ return false;
+ }
+ }
+
+ ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
+
+ if (new_data) {
+ if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
+ return false;
+ }
+ if (OVS_UNLIKELY(size < ip_len)) {
+ return false;
+ }
+
+ *new_data = (char *) data + ip_len;
+ }
+
+ if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
+ return false;
+ }
+
+ if (validate_checksum && csum(data, ip_len) != 0) {
+ return false;
+ }
+
+ key->src.addr.ipv4 = ip->ip_src;
+ key->dst.addr.ipv4 = ip->ip_dst;
+ key->nw_proto = ip->ip_proto;
+
+ return true;
+}
+
+/* The function stores a pointer to the first byte after the header in
+ * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
+ * not interested in the header's tail, meaning that the header has
+ * already been parsed (e.g. by flow_extract): we take this as a hint to
+ * save a few checks. */
+static inline bool
+extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
+ const char **new_data)
+{
+ const struct ovs_16aligned_ip6_hdr *ip6 = data;
+ uint8_t nw_proto = ip6->ip6_nxt;
+ uint8_t nw_frag = 0;
+
+ if (new_data) {
+ if (OVS_UNLIKELY(size < sizeof *ip6)) {
+ return false;
+ }
+ }
+
+ data = ip6 + 1;
+ size -= sizeof *ip6;
+
+ if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
+ return false;
+ }
+
+ if (new_data) {
+ *new_data = data;
+ }
+
+ if (nw_frag) {
+ return false;
+ }
+
+ key->src.addr.ipv6 = ip6->ip6_src;
+ key->dst.addr.ipv6 = ip6->ip6_dst;
+ key->nw_proto = nw_proto;
+
+ return true;
+}
+
+static inline bool
+checksum_valid(const struct conn_key *key, const void *data, size_t size,
+ const void *l3)
+{
+ uint32_t csum = 0;
+
+ if (key->dl_type == htons(ETH_TYPE_IP)) {
+ csum = packet_csum_pseudoheader(l3);
+ } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
+ csum = packet_csum_pseudoheader6(l3);
+ } else {
+ return false;
+ }
+
+ csum = csum_continue(csum, data, size);
+
+ return csum_finish(csum) == 0;
+}
+
+static inline bool
+check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
+ const void *l3)
+{
+ const struct tcp_header *tcp = data;
+ size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
+
+ if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
+ return false;
+ }
+
+ return checksum_valid(key, data, size, l3);
+}
+
+/* Validates the UDP length field and checksum of the l4 data at 'data'
+ * ('size' bytes), using the l3 header at 'l3' for the pseudo-header. */
+static inline bool
+check_l4_udp(const struct conn_key *key, const void *data, size_t size,
+ const void *l3)
+{
+ const struct udp_header *udp = data;
+ size_t udp_len;
+
+ /* Make sure the fixed header is present before reading 'udp_len'. */
+ if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
+ return false;
+ }
+
+ udp_len = ntohs(udp->udp_len);
+ if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
+ return false;
+ }
+
+ /* Validation must be skipped if checksum is 0 on IPv4 packets */
+ return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
+ || checksum_valid(key, data, size, l3);
+}
+
+/* Validates the ICMPv4 checksum: a plain one's-complement sum over the
+ * whole message, with no pseudo-header (unlike TCP/UDP/ICMPv6). */
+static inline bool
+check_l4_icmp(const void *data, size_t size)
+{
+ return csum(data, size) == 0;
+}
+
+/* Validates the ICMPv6 checksum, which (unlike ICMPv4) covers a
+ * pseudo-header derived from the IPv6 header at 'l3'. */
+static inline bool
+check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
+ const void *l3)
+{
+ return checksum_valid(key, data, size, l3);
+}
+
+/* Extracts the TCP source and destination ports from 'data' into 'key'.
+ * Rejects truncated headers and either port being 0. */
+static inline bool
+extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
+{
+ const struct tcp_header *tcp = data;
+
+ if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
+ return false;
+ }
+
+ key->src.port = tcp->tcp_src;
+ key->dst.port = tcp->tcp_dst;
+
+ /* Port 0 is invalid */
+ return key->src.port && key->dst.port;
+}
+
+/* Extracts the UDP source and destination ports from 'data' into 'key'.
+ * Rejects truncated headers and either port being 0. */
+static inline bool
+extract_l4_udp(struct conn_key *key, const void *data, size_t size)
+{
+ const struct udp_header *udp = data;
+
+ if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
+ return false;
+ }
+
+ key->src.port = udp->udp_src;
+ key->dst.port = udp->udp_dst;
+
+ /* Port 0 is invalid */
+ return key->src.port && key->dst.port;
+}
+
+static inline bool extract_l4(struct conn_key *key, const void *data,
+ size_t size, bool *related, const void *l3);
+
+/* If 'related' is not NULL and the function is processing an ICMP
+ * error packet, extract the l3 and l4 fields from the nested header
+ * instead and set *related to true. If 'related' is NULL we're
+ * already processing a nested header and no such recursion is
+ * possible */
+static inline bool
+extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
+ bool *related)
+{
+ const struct icmp_header *icmp = data;
+
+ if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
+ return false;
+ }
+
+ switch (icmp->icmp_type) {
+ case ICMP4_ECHO_REQUEST:
+ case ICMP4_ECHO_REPLY:
+ case ICMP4_TIMESTAMP:
+ case ICMP4_TIMESTAMPREPLY:
+ case ICMP4_INFOREQUEST:
+ case ICMP4_INFOREPLY:
+ /* Separate ICMP connection: identified using id */
+ key->src.port = key->dst.port = icmp->icmp_fields.echo.id;
+ break;
+ case ICMP4_DST_UNREACH:
+ case ICMP4_TIME_EXCEEDED:
+ case ICMP4_PARAM_PROB:
+ case ICMP4_SOURCEQUENCH:
+ case ICMP4_REDIRECT: {
+ /* ICMP packet part of another connection. We should
+ * extract the key from embedded packet header */
+ struct conn_key inner_key;
+ const char *l3 = (const char *) (icmp + 1);
+ const char *tail = (const char *) data + size;
+ const char *l4;
+ bool ok;
+
+ if (!related) {
+ return false;
+ }
+
+ memset(&inner_key, 0, sizeof inner_key);
+ inner_key.dl_type = htons(ETH_TYPE_IP);
+ ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
+ if (!ok) {
+ return false;
+ }
+
+ /* pf doesn't do this, but it seems a good idea */
+ if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
+ || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
+ return false;
+ }
+
+ key->src = inner_key.src;
+ key->dst = inner_key.dst;
+ key->nw_proto = inner_key.nw_proto;
+
+ ok = extract_l4(key, l4, tail - l4, NULL, l3);
+ if (ok) {
+ /* The embedded header describes the original direction, so
+ * reverse it to match the direction of the error packet. */
+ conn_key_reverse(key);
+ *related = true;
+ }
+ return ok;
+ }
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+/* If 'related' is not NULL and the function is processing an ICMP
+ * error packet, extract the l3 and l4 fields from the nested header
+ * instead and set *related to true. If 'related' is NULL we're
+ * already processing a nested header and no such recursion is
+ * possible */
+static inline bool
+extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
+ bool *related)
+{
+ const struct icmp6_header *icmp6 = data;
+
+ /* All the messages that we support need at least 4 bytes after
+ * the header */
+ if (size < sizeof *icmp6 + 4) {
+ return false;
+ }
+
+ switch (icmp6->icmp6_type) {
+ case ICMP6_ECHO_REQUEST:
+ case ICMP6_ECHO_REPLY:
+ /* Separate ICMP connection: identified using id */
+ /* NOTE(review): this reads an ovs_be16 right past the 4-byte
+ * header -- confirm the l4 data is at least 2-byte aligned. */
+ key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);
+ break;
+ case ICMP6_DST_UNREACH:
+ case ICMP6_PACKET_TOO_BIG:
+ case ICMP6_TIME_EXCEEDED:
+ case ICMP6_PARAM_PROB: {
+ /* ICMP packet part of another connection. We should
+ * extract the key from embedded packet header */
+ struct conn_key inner_key;
+ /* The embedded packet starts 8 bytes in: the 4-byte ICMPv6
+ * header plus the 4-byte type-specific field. */
+ const char *l3 = (const char *) icmp6 + 8;
+ const char *tail = (const char *) data + size;
+ const char *l4 = NULL;
+ bool ok;
+
+ if (!related) {
+ return false;
+ }
+
+ memset(&inner_key, 0, sizeof inner_key);
+ inner_key.dl_type = htons(ETH_TYPE_IPV6);
+ ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
+ if (!ok) {
+ return false;
+ }
+
+ /* pf doesn't do this, but it seems a good idea */
+ if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
+ &key->dst.addr.ipv6_aligned)
+ || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
+ &key->src.addr.ipv6_aligned)) {
+ return false;
+ }
+
+ key->src = inner_key.src;
+ key->dst = inner_key.dst;
+ key->nw_proto = inner_key.nw_proto;
+
+ ok = extract_l4(key, l4, tail - l4, NULL, l3);
+ if (ok) {
+ /* The embedded header is in the original direction; flip it
+ * to match the direction of the error packet itself. */
+ conn_key_reverse(key);
+ *related = true;
+ }
+ return ok;
+ }
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+/* Extract l4 fields into 'key', which must already contain valid l3
+ * members.
+ *
+ * If 'related' is not NULL and an ICMP error packet is being
+ * processed, the function will extract the key from the packet nested
+ * in the ICMP payload and set '*related' to true.
+ *
+ * If 'related' is NULL, it means that we're already parsing a header nested
+ * in an ICMP error. In this case, we skip checksum and length validation. */
+static inline bool
+extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
+ const void *l3)
+{
+ if (key->nw_proto == IPPROTO_TCP) {
+ return (!related || check_l4_tcp(key, data, size, l3))
+ && extract_l4_tcp(key, data, size);
+ } else if (key->nw_proto == IPPROTO_UDP) {
+ return (!related || check_l4_udp(key, data, size, l3))
+ && extract_l4_udp(key, data, size);
+ } else if (key->dl_type == htons(ETH_TYPE_IP)
+ && key->nw_proto == IPPROTO_ICMP) {
+ return (!related || check_l4_icmp(data, size))
+ && extract_l4_icmp(key, data, size, related);
+ } else if (key->dl_type == htons(ETH_TYPE_IPV6)
+ && key->nw_proto == IPPROTO_ICMPV6) {
+ return (!related || check_l4_icmp6(key, data, size, l3))
+ && extract_l4_icmp6(key, data, size, related);
+ } else {
+ /* Unsupported l4 protocol: the packet cannot be tracked. */
+ return false;
+ }
+}
+
+/* Fills 'ctx' from 'pkt': extracts the connection key (l3 addresses,
+ * l4 ports, l3/l4 types, 'zone') into 'ctx->key', computes 'ctx->hash'
+ * and sets 'ctx->related' for ICMP error packets. '*ctx' is zeroed
+ * first. Returns false if the packet cannot be tracked. */
+static bool
+conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,
+ struct conn_lookup_ctx *ctx, uint16_t zone)
+{
+ const struct eth_header *l2 = dp_packet_l2(pkt);
+ const struct ip_header *l3 = dp_packet_l3(pkt);
+ const char *l4 = dp_packet_l4(pkt);
+ const char *tail = dp_packet_tail(pkt);
+ bool ok;
+
+ memset(ctx, 0, sizeof *ctx);
+
+ if (!l2 || !l3 || !l4) {
+ return false;
+ }
+
+ ctx->key.zone = zone;
+
+ /* XXX In this function we parse the packet (again, it has already
+ * gone through miniflow_extract()) for two reasons:
+ *
+ * 1) To extract the l3 addresses and l4 ports.
+ * We already have the l3 and l4 headers' pointers. Extracting
+ * the l3 addresses and the l4 ports is really cheap, since they
+ * can be found at fixed locations.
+ * 2) To extract the l3 and l4 types.
+ * Extracting the l3 and l4 types (especially the l3[1]) on the
+ * other hand is quite expensive, because they're not at a
+ * fixed location.
+ *
+ * Here's a way to avoid (2) with the help of the datapath.
+ * The datapath doesn't keep the packet's extracted flow[2], so
+ * using that is not an option. We could use the packet's matching
+ * megaflow for l3 type (it's always unwildcarded), and for l4 type
+ * (we have to unwildcard it first). This means either:
+ *
+ * a) dpif-netdev passes the matching megaflow to dp_execute_cb(), which
+ * is used to extract the l3 type. Unfortunately, dp_execute_cb() is
+ * used also in dpif_netdev_execute(), which doesn't have a matching
+ * megaflow.
+ *
+ * b) We define an alternative OVS_ACTION_ATTR_CT, used only by the
+ * userspace datapath, which includes l3 (and l4) type. The
+ * alternative action could be generated by ofproto-dpif specifically
+ * for the userspace datapath. Having a different interface for
+ * userspace and kernel doesn't seem very clean, though.
+ *
+ * ---
+ * [1] A simple benchmark (running only the connection tracker
+ * over and over on the same packets) shows that if the
+ * l3 type is already provided we are 15% faster (running the
+ * connection tracker over a couple of DPDK devices with a
+ * stream of UDP 64-bytes packets shows that we are 4% faster).
+ *
+ * [2] The reasons for this are that keeping the flow increases
+ * (slightly) the cache footprint and increases computation
+ * time as we move the packet around. Most importantly, the flow
+ * should be updated by the actions and this can be slow, as
+ * we use a sparse representation (miniflow).
+ *
+ */
+ ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);
+ if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
+ ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
+ } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
+ ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
+ } else {
+ ok = false;
+ }
+
+ if (ok) {
+ if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
+ ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/* Symmetric: a key and its reverse (src/dst swapped) hash to the same
+ * value, so both directions of a connection land in the same bucket.
+ * NOTE(review): the key is walked as an array of uint32_t, which
+ * assumes 'struct conn_key' has no padding and that 'src' and 'dst'
+ * come first -- confirm against conntrack-private.h. */
+static uint32_t
+conn_key_hash(const struct conn_key *key, uint32_t basis)
+{
+ uint32_t hsrc, hdst, hash;
+ int i;
+
+ hsrc = hdst = basis;
+
+ /* Hash the source and destination tuple */
+ for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
+ hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
+ hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
+ }
+
+ /* Even if source and destination are swapped the hash will be the same. */
+ hash = hsrc ^ hdst;
+
+ /* Hash the rest of the key(L3 and L4 types and zone). */
+ hash = hash_words((uint32_t *) &key->dst + 1,
+ (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
+ hash);
+
+ return hash;
+}
+
+/* Swaps the source and destination endpoints of 'key' in place, turning
+ * a key into the key of the opposite direction. */
+static void
+conn_key_reverse(struct conn_key *key)
+{
+ struct ct_endpoint tmp;
+ tmp = key->src;
+ key->src = key->dst;
+ key->dst = tmp;
+}
+
+/* Searches bucket 'ctb' for a non-expired connection matching
+ * 'ctx->key' in either direction. On a hit, points 'ctx->conn' at the
+ * connection and sets 'ctx->reply' to true if the key matched the
+ * reverse direction; on a miss 'ctx->conn' is left NULL. */
+static void
+conn_key_lookup(struct conntrack_bucket *ctb,
+ struct conn_lookup_ctx *ctx,
+ long long now)
+{
+ uint32_t hash = ctx->hash;
+ struct conn *conn;
+
+ ctx->conn = NULL;
+
+ HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
+ if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))
+ && !conn_expired(conn, now)) {
+ ctx->conn = conn;
+ ctx->reply = false;
+ break;
+ }
+ if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))
+ && !conn_expired(conn, now)) {
+ ctx->conn = conn;
+ ctx->reply = true;
+ break;
+ }
+ }
+}
+
+/* Updates 'conn' with 'pkt' by dispatching to the per-L4-protocol
+ * handler selected by the connection's nw_proto. */
+static enum ct_update_res
+conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,
+ long long now)
+{
+ return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply, now);
+}
+
+/* Returns true if 'conn' has reached its expiration time at 'now'. */
+static bool
+conn_expired(struct conn *conn, long long now)
+{
+ return now >= conn->expiration;
+}
+
+/* Asks the L4 protocol module whether 'pkt' may legitimately start a
+ * new connection with 'key'. */
+static bool
+valid_new(struct dp_packet *pkt, struct conn_key *key)
+{
+ return l4_protos[key->nw_proto]->valid_new(pkt);
+}
+
+/* Creates a new connection for 'pkt' via the L4 protocol module and
+ * stores 'key' in it. Returns NULL if the module refuses to create a
+ * connection for this packet. The caller owns the returned connection
+ * and must eventually release it with delete_conn(). */
+static struct conn *
+new_conn(struct dp_packet *pkt, struct conn_key *key, long long now)
+{
+ struct conn *newconn;
+
+ newconn = l4_protos[key->nw_proto]->new_conn(pkt, now);
+
+ if (newconn) {
+ newconn->key = *key;
+ }
+
+ return newconn;
+}
+
+/* Frees a connection allocated by new_conn(). */
+static void
+delete_conn(struct conn *conn)
+{
+ free(conn);
+}
new file mode 100644
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CONNTRACK_H
+#define CONNTRACK_H 1
+
+#include <stdbool.h>
+
+#include "odp-netlink.h"
+#include "openvswitch/hmap.h"
+#include "openvswitch/thread.h"
+#include "openvswitch/types.h"
+#include "ovs-atomic.h"
+
+/* Userspace connection tracker
+ * ============================
+ *
+ * This is a connection tracking module that keeps all the state in userspace.
+ *
+ * Usage
+ * =====
+ *
+ * struct conntrack ct;
+ *
+ * Initialization:
+ *
+ * conntrack_init(&ct);
+ *
+ * It is necessary to periodically issue a call to
+ *
+ * conntrack_run(&ct);
+ *
+ * to allow the module to clean up expired connections.
+ *
+ * To send a group of packets through the connection tracker:
+ *
+ * conntrack_execute(&ct, pkts, n_pkts, ...);
+ *
+ * Thread-safety
+ * =============
+ *
+ * conntrack_execute() can be called by multiple threads simultaneously.
+ */
+
+struct dp_packet_batch;
+
+struct conntrack;
+
+void conntrack_init(struct conntrack *);
+void conntrack_run(struct conntrack *);
+void conntrack_destroy(struct conntrack *);
+
+int conntrack_execute(struct conntrack *, struct dp_packet_batch *,
+ bool commit, uint16_t zone, const uint32_t *setmark,
+ const struct ovs_key_ct_labels *setlabel,
+ const char *helper);
+
+/* 'struct ct_lock' is a wrapper for an adaptive mutex. It's useful to try
+ * different types of locks (e.g. spinlocks) */
+
+struct OVS_LOCKABLE ct_lock {
+ struct ovs_mutex lock;
+};
+
+/* Initializes 'lock' as an adaptive mutex. */
+static inline void ct_lock_init(struct ct_lock *lock)
+{
+ ovs_mutex_init_adaptive(&lock->lock);
+}
+
+/* Acquires 'lock'. Thread-safety analysis is disabled because the
+ * wrapped ovs_mutex carries its own annotations. */
+static inline void ct_lock_lock(struct ct_lock *lock)
+ OVS_ACQUIRES(lock)
+ OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+ ovs_mutex_lock(&lock->lock);
+}
+
+/* Releases 'lock'. */
+static inline void ct_lock_unlock(struct ct_lock *lock)
+ OVS_RELEASES(lock)
+ OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+ ovs_mutex_unlock(&lock->lock);
+}
+
+/* Destroys 'lock', which must be unlocked. */
+static inline void ct_lock_destroy(struct ct_lock *lock)
+{
+ ovs_mutex_destroy(&lock->lock);
+}
+
+/* Timeouts: all the possible timeout states passed to update_expiration()
+ * are listed here. Each name will be prefixed with CT_TM_ and each value
+ * is in milliseconds */
+#define CT_TIMEOUTS \
+ CT_TIMEOUT(TCP_FIRST_PACKET, 30 * 1000) \
+ CT_TIMEOUT(TCP_OPENING, 30 * 1000) \
+ CT_TIMEOUT(TCP_ESTABLISHED, 24 * 60 * 60 * 1000) \
+ CT_TIMEOUT(TCP_CLOSING, 15 * 60 * 1000) \
+ CT_TIMEOUT(TCP_FIN_WAIT, 45 * 1000) \
+ CT_TIMEOUT(TCP_CLOSED, 30 * 1000) \
+ CT_TIMEOUT(OTHER_FIRST, 60 * 1000) \
+ CT_TIMEOUT(OTHER_MULTIPLE, 60 * 1000) \
+ CT_TIMEOUT(OTHER_BIDIR, 30 * 1000) \
+
+enum ct_timeout {
+#define CT_TIMEOUT(NAME, VALUE) CT_TM_##NAME,
+ CT_TIMEOUTS
+#undef CT_TIMEOUT
+ /* Number of timeout classes. */
+ N_CT_TM
+};
+
+/* Locking:
+ *
+ * The connections are kept in different buckets, which are completely
+ * independent. The connection bucket is determined by the hash of its key.
+ */
+struct conntrack_bucket {
+ /* Protects 'connections'. */
+ struct ct_lock lock;
+ /* Contains 'struct conn's, hashed by connection key. */
+ struct hmap connections OVS_GUARDED;
+};
+
+#define CONNTRACK_BUCKETS_SHIFT 8
+/* Number of independent buckets (a power of two). */
+#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
+
+struct conntrack {
+ /* Independent buckets containing the connections */
+ struct conntrack_bucket buckets[CONNTRACK_BUCKETS];
+
+ /* Salt for hashing a connection key. */
+ uint32_t hash_basis;
+
+ /* Number of connections currently in the connection tracker.
+ * Updated atomically, so it can be read without the bucket locks. */
+ atomic_count n_conn;
+ /* Connections limit. When this limit is reached, no new connection
+ * will be accepted. */
+ atomic_uint n_conn_limit;
+};
+
+#endif /* conntrack.h */
@@ -70,6 +70,15 @@ ovs_prefetch_range(const void *start, size_t size)
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif
+/* Comparisons for ints with modular arithmetic: the difference is taken
+ * modulo the integer width, so the ordering stays meaningful across a
+ * counter wraparound (serial-number arithmetic, cf. RFC 1982).
+ * NOTE(review): converting the out-of-range unsigned difference to int
+ * is implementation-defined before C23 -- confirm all supported
+ * compilers wrap as two's complement. */
+#define INT_MOD_LT(a,b) ((int) ((a)-(b)) < 0)
+#define INT_MOD_LEQ(a,b) ((int) ((a)-(b)) <= 0)
+#define INT_MOD_GT(a,b) ((int) ((a)-(b)) > 0)
+#define INT_MOD_GEQ(a,b) ((int) ((a)-(b)) >= 0)
+
+#define INT_MOD_MIN(a, b) ((INT_MOD_LT(a, b)) ? (a) : (b))
+#define INT_MOD_MAX(a, b) ((INT_MOD_GT(a, b)) ? (a) : (b))
+
#define OVS_NOT_REACHED() abort()
/* Use "%"PRIuSIZE to format size_t with printf(). */