@@ -25,6 +25,7 @@ License, version 2.
The following files are licensed under the 2-clause BSD license.
include/windows/getopt.h
lib/getopt_long.c
+ lib/conntrack-tcp.c
The following files are licensed under the 3-clause BSD-license
include/windows/netinet/icmp6.h
@@ -21,6 +21,9 @@ Upstream Copyright Holders:
Copyright (c) 2014 Michael Chapman
Copyright (c) 2014 WindRiver, Inc.
Copyright (c) 2014 Avaya, Inc.
+ Copyright (c) 2001 Daniel Hartmeier
+ Copyright (c) 2002 - 2008 Henning Brauer
+ Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
License:
@@ -90,6 +93,7 @@ License:
lib/getopt_long.c
include/windows/getopt.h
datapath-windows/ovsext/Conntrack-tcp.c
+ lib/conntrack-tcp.c
* The following files are licensed under the 3-clause BSD-license
@@ -108,6 +108,10 @@ static const ovs_u128 OVS_U128_MAX = { { UINT32_MAX, UINT32_MAX,
UINT32_MAX, UINT32_MAX } };
static const ovs_be128 OVS_BE128_MAX OVS_UNUSED = { { OVS_BE32_MAX, OVS_BE32_MAX,
OVS_BE32_MAX, OVS_BE32_MAX } };
+static const ovs_u128 OVS_U128_MIN OVS_UNUSED = { {0, 0, 0, 0} };
+static const ovs_be128 OVS_BE128_MIN OVS_UNUSED = { {0, 0, 0, 0} };
+
+#define OVS_U128_ZERO OVS_U128_MIN
/* A 64-bit value, in network byte order, that is only aligned on a 32-bit
* boundary. */
@@ -49,6 +49,11 @@ lib_libopenvswitch_la_SOURCES = \
lib/compiler.h \
lib/connectivity.c \
lib/connectivity.h \
+ lib/conntrack-private.h \
+ lib/conntrack-tcp.c \
+ lib/conntrack-other.c \
+ lib/conntrack.c \
+ lib/conntrack.h \
lib/coverage.c \
lib/coverage.h \
lib/crc32c.c \
new file mode 100644
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "conntrack-private.h"
+#include "dp-packet.h"
+
+enum other_state {
+ OTHERS_FIRST,
+ OTHERS_MULTIPLE,
+ OTHERS_BIDIR,
+};
+
+struct conn_other {
+ struct conn up;
+ enum other_state state;
+};
+
+static const enum ct_timeout other_timeouts[] = {
+ [OTHERS_FIRST] = CT_TM_OTHER_FIRST,
+ [OTHERS_MULTIPLE] = CT_TM_OTHER_MULTIPLE,
+ [OTHERS_BIDIR] = CT_TM_OTHER_BIDIR,
+};
+
+static struct conn_other *
+conn_other_cast(const struct conn *conn)
+{
+ return CONTAINER_OF(conn, struct conn_other, up);
+}
+
+static enum ct_update_res
+other_conn_update(struct conn *conn_, struct dp_packet *pkt OVS_UNUSED,
+ bool reply, long long now)
+{
+ struct conn_other *conn = conn_other_cast(conn_);
+
+ if (reply && conn->state != OTHERS_BIDIR) {
+ conn->state = OTHERS_BIDIR;
+ } else if (conn->state == OTHERS_FIRST) {
+ conn->state = OTHERS_MULTIPLE;
+ }
+
+ update_expiration(conn_, other_timeouts[conn->state], now);
+
+ return CT_UPDATE_VALID;
+}
+
+static bool
+other_valid_new(struct dp_packet *pkt OVS_UNUSED)
+{
+ return true;
+}
+
+static struct conn *
+other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)
+{
+ struct conn_other *conn;
+
+ conn = xzalloc(sizeof *conn);
+ conn->state = OTHERS_FIRST;
+
+ update_expiration(&conn->up, other_timeouts[conn->state], now);
+
+ return &conn->up;
+}
+
+struct ct_l4_proto ct_proto_other = {
+ .new_conn = other_new_conn,
+ .valid_new = other_valid_new,
+ .conn_update = other_conn_update,
+};
new file mode 100644
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CONNTRACK_PRIVATE_H
+#define CONNTRACK_PRIVATE_H 1
+
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+
+#include "conntrack.h"
+#include "openvswitch/hmap.h"
+#include "openvswitch/list.h"
+#include "openvswitch/types.h"
+#include "packets.h"
+#include "unaligned.h"
+
+struct ct_addr {
+ union {
+ ovs_16aligned_be32 ipv4;
+ union ovs_16aligned_in6_addr ipv6;
+ ovs_be32 ipv4_aligned;
+ struct in6_addr ipv6_aligned;
+ };
+};
+
+struct ct_endpoint {
+ struct ct_addr addr;
+ ovs_be16 port;
+};
+
+/* Changes to this structure need to be reflected in conn_key_hash() */
+struct conn_key {
+ struct ct_endpoint src;
+ struct ct_endpoint dst;
+
+ ovs_be16 dl_type;
+ uint8_t nw_proto;
+ uint16_t zone;
+};
+
+struct conn {
+ struct conn_key key;
+ struct conn_key rev_key;
+ long long expiration;
+ struct ovs_list exp_node;
+ struct hmap_node node;
+ uint32_t mark;
+ ovs_u128 label;
+};
+
+enum ct_update_res {
+ CT_UPDATE_INVALID,
+ CT_UPDATE_VALID,
+ CT_UPDATE_NEW,
+};
+
+struct ct_l4_proto {
+ struct conn *(*new_conn)(struct dp_packet *pkt, long long now);
+ bool (*valid_new)(struct dp_packet *pkt);
+ enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet *pkt,
+ bool reply, long long now);
+};
+
+extern struct ct_l4_proto ct_proto_tcp;
+extern struct ct_l4_proto ct_proto_other;
+
+extern long long ct_timeout_val[];
+
+static inline void
+update_expiration(struct conn *conn, enum ct_timeout tm, long long now)
+{
+ conn->expiration = now + ct_timeout_val[tm];
+}
+
+#endif /* conntrack-private.h */
new file mode 100644
@@ -0,0 +1,462 @@
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
+ */
+
+#include <config.h>
+
+#include "conntrack-private.h"
+#include "ct-dpif.h"
+#include "dp-packet.h"
+#include "util.h"
+
+struct tcp_peer {
+ enum ct_dpif_tcp_state state;
+ uint32_t seqlo; /* Max sequence number sent */
+ uint32_t seqhi; /* Max the other end ACKd + win */
+ uint16_t max_win; /* largest window (pre scaling) */
+ uint8_t wscale; /* window scaling factor */
+};
+
+struct conn_tcp {
+ struct conn up;
+ struct tcp_peer peer[2];
+};
+
+enum {
+ TCPOPT_EOL,
+ TCPOPT_NOP,
+ TCPOPT_WINDOW = 3,
+};
+
+/* TCP sequence numbers are 32 bit integers operated
+ * on with modular arithmetic. These macros can be
+ * used to compare such integers. */
+#define SEQ_LT(a,b) INT_MOD_LT(a, b)
+#define SEQ_LEQ(a,b) INT_MOD_LEQ(a, b)
+#define SEQ_GT(a,b) INT_MOD_GT(a, b)
+#define SEQ_GEQ(a,b) INT_MOD_GEQ(a, b)
+
+#define SEQ_MIN(a, b) INT_MOD_MIN(a, b)
+#define SEQ_MAX(a, b) INT_MOD_MAX(a, b)
+
+static struct conn_tcp*
+conn_tcp_cast(const struct conn* conn)
+{
+ return CONTAINER_OF(conn, struct conn_tcp, up);
+}
+
+/* pf does this in pf_normalize_tcp(), and it is called only if scrub
+ * is enabled. We're not scrubbing, but this check seems reasonable. */
+static bool
+tcp_invalid_flags(uint16_t flags)
+{
+
+ if (flags & TCP_SYN) {
+ if (flags & TCP_RST || flags & TCP_FIN) {
+ return true;
+ }
+ } else {
+ /* Illegal packet */
+ if (!(flags & (TCP_ACK|TCP_RST))) {
+ return true;
+ }
+ }
+
+ if (!(flags & TCP_ACK)) {
+ /* These flags are only valid if ACK is set */
+ if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+#define TCP_MAX_WSCALE 14
+#define CT_WSCALE_FLAG 0x80
+#define CT_WSCALE_UNKNOWN 0x40
+#define CT_WSCALE_MASK 0xf
+
+static uint8_t
+tcp_get_wscale(const struct tcp_header *tcp)
+{
+ int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
+ const uint8_t *opt = (const uint8_t *)(tcp + 1);
+ uint8_t wscale = 0;
+ uint8_t optlen;
+
+ while (len >= 3) {
+ switch (*opt) {
+ case TCPOPT_EOL:
+ return wscale;
+ case TCPOPT_NOP:
+ opt++;
+ len--;
+ break;
+ case TCPOPT_WINDOW:
+ wscale = MIN(opt[2], TCP_MAX_WSCALE);
+ wscale |= CT_WSCALE_FLAG;
+ /* fall through */
+ default:
+ optlen = opt[1];
+ if (optlen < 2) {
+ optlen = 2;
+ }
+ len -= optlen;
+ opt += optlen;
+ }
+ }
+
+ return wscale;
+}
+
+static uint32_t
+tcp_payload_length(struct dp_packet *pkt)
+{
+ return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)
+ - (char *) dp_packet_get_tcp_payload(pkt);
+}
+
+static enum ct_update_res
+tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply,
+ long long now)
+{
+ struct conn_tcp *conn = conn_tcp_cast(conn_);
+ struct tcp_header *tcp = dp_packet_l4(pkt);
+ /* The peer that sent 'pkt' */
+ struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
+ /* The peer that should receive 'pkt' */
+ struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
+ uint8_t sws = 0, dws = 0;
+ uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+ uint16_t win = ntohs(tcp->tcp_winsz);
+ uint32_t ack, end, seq, orig_seq;
+ uint32_t p_len = tcp_payload_length(pkt);
+ int ackskew;
+
+ if (tcp_invalid_flags(tcp_flags)) {
+ return CT_UPDATE_INVALID;
+ }
+
+ if (((tcp_flags & (TCP_SYN | TCP_ACK)) == TCP_SYN)
+ && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
+ && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+ src->state = dst->state = CT_DPIF_TCPS_CLOSED;
+ return CT_UPDATE_NEW;
+ }
+
+ if (src->wscale & CT_WSCALE_FLAG
+ && dst->wscale & CT_WSCALE_FLAG
+ && !(tcp_flags & TCP_SYN)) {
+
+ sws = src->wscale & CT_WSCALE_MASK;
+ dws = dst->wscale & CT_WSCALE_MASK;
+
+ } else if (src->wscale & CT_WSCALE_UNKNOWN
+ && dst->wscale & CT_WSCALE_UNKNOWN
+ && !(tcp_flags & TCP_SYN)) {
+
+ sws = TCP_MAX_WSCALE;
+ dws = TCP_MAX_WSCALE;
+ }
+
+ /*
+ * Sequence tracking algorithm from Guido van Rooij's paper:
+ * http://www.madison-gurkha.com/publications/tcp_filtering/
+ * tcp_filtering.ps
+ */
+
+ orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+ if (src->state < CT_DPIF_TCPS_SYN_SENT) {
+ /* First packet from this end. Set its state */
+
+ ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
+
+ end = seq + p_len;
+ if (tcp_flags & TCP_SYN) {
+ end++;
+ if (dst->wscale & CT_WSCALE_FLAG) {
+ src->wscale = tcp_get_wscale(tcp);
+ if (src->wscale & CT_WSCALE_FLAG) {
+ /* Remove scale factor from initial window */
+ sws = src->wscale & CT_WSCALE_MASK;
+ win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
+ dws = dst->wscale & CT_WSCALE_MASK;
+ } else {
+ /* fixup other window */
+ dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
+ /* in case of a retrans SYN|ACK */
+ dst->wscale = 0;
+ }
+ }
+ }
+ if (tcp_flags & TCP_FIN) {
+ end++;
+ }
+
+ src->seqlo = seq;
+ src->state = CT_DPIF_TCPS_SYN_SENT;
+ /*
+ * May need to slide the window (seqhi may have been set by
+ * the crappy stack check or if we picked up the connection
+ * after establishment)
+ */
+ if (src->seqhi == 1
+ || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
+ src->seqhi = end + MAX(1, dst->max_win << dws);
+ }
+ if (win > src->max_win) {
+ src->max_win = win;
+ }
+
+ } else {
+ ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
+ end = seq + p_len;
+ if (tcp_flags & TCP_SYN) {
+ end++;
+ }
+ if (tcp_flags & TCP_FIN) {
+ end++;
+ }
+ }
+
+ if ((tcp_flags & TCP_ACK) == 0) {
+ /* Let it pass through the ack skew check */
+ ack = dst->seqlo;
+ } else if ((ack == 0
+ && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
+ /* broken tcp stacks do not set ack */) {
+ /* Many stacks (ours included) will set the ACK number in an
+ * FIN|ACK if the SYN times out -- no sequence to ACK. */
+ ack = dst->seqlo;
+ }
+
+ if (seq == end) {
+ /* Ease sequencing restrictions on no data packets */
+ seq = src->seqlo;
+ end = seq;
+ }
+
+ ackskew = dst->seqlo - ack;
+#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
+ if (SEQ_GEQ(src->seqhi, end)
+ /* Last octet inside other's window space */
+ && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
+ /* Retrans: not more than one window back */
+ && (ackskew >= -MAXACKWINDOW)
+ /* Acking not more than one reassembled fragment backwards */
+ && (ackskew <= (MAXACKWINDOW << sws))
+ /* Acking not more than one window forward */
+ && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
+ || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {
+ /* Require an exact/+1 sequence match on resets when possible */
+
+ /* update max window */
+ if (src->max_win < win) {
+ src->max_win = win;
+ }
+ /* synchronize sequencing */
+ if (SEQ_GT(end, src->seqlo)) {
+ src->seqlo = end;
+ }
+ /* slide the window of what the other end can send */
+ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+ dst->seqhi = ack + MAX((win << sws), 1);
+ }
+
+ /* update states */
+ if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
+ src->state = CT_DPIF_TCPS_SYN_SENT;
+ }
+ if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
+ src->state = CT_DPIF_TCPS_CLOSING;
+ }
+ if (tcp_flags & TCP_ACK) {
+ if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
+ dst->state = CT_DPIF_TCPS_ESTABLISHED;
+ } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
+ dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
+ }
+ }
+ if (tcp_flags & TCP_RST) {
+ src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+ }
+
+ if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
+ && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+ update_expiration(conn_, CT_TM_TCP_CLOSED, now);
+ } else if (src->state >= CT_DPIF_TCPS_CLOSING
+ && dst->state >= CT_DPIF_TCPS_CLOSING) {
+ update_expiration(conn_, CT_TM_TCP_FIN_WAIT, now);
+ } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
+ || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
+            update_expiration(conn_, CT_TM_TCP_OPENING, now);
+ } else if (src->state >= CT_DPIF_TCPS_CLOSING
+ || dst->state >= CT_DPIF_TCPS_CLOSING) {
+            update_expiration(conn_, CT_TM_TCP_CLOSING, now);
+ } else {
+            update_expiration(conn_, CT_TM_TCP_ESTABLISHED, now);
+ }
+ } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
+ || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
+ || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
+ && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
+ /* Within a window forward of the originating packet */
+ && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
+ /* Within a window backward of the originating packet */
+
+ /*
+ * This currently handles three situations:
+ * 1) Stupid stacks will shotgun SYNs before their peer
+ * replies.
+ * 2) When PF catches an already established stream (the
+ * firewall rebooted, the state table was flushed, routes
+ * changed...)
+ * 3) Packets get funky immediately after the connection
+ * closes (this should catch Solaris spurious ACK|FINs
+ * that web servers like to spew after a close)
+ *
+ * This must be a little more careful than the above code
+ * since packet floods will also be caught here. We don't
+ * update the TTL here to mitigate the damage of a packet
+ * flood and so the same code can handle awkward establishment
+ * and a loosened connection close.
+ * In the establishment case, a correct peer response will
+ * validate the connection, go through the normal state code
+ * and keep updating the state TTL.
+ */
+
+ /* update max window */
+ if (src->max_win < win) {
+ src->max_win = win;
+ }
+ /* synchronize sequencing */
+ if (SEQ_GT(end, src->seqlo)) {
+ src->seqlo = end;
+ }
+ /* slide the window of what the other end can send */
+ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+ dst->seqhi = ack + MAX((win << sws), 1);
+ }
+
+ /*
+ * Cannot set dst->seqhi here since this could be a shotgunned
+ * SYN and not an already established connection.
+ */
+
+ if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
+ src->state = CT_DPIF_TCPS_CLOSING;
+ }
+
+ if (tcp_flags & TCP_RST) {
+ src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+ }
+ } else {
+ return CT_UPDATE_INVALID;
+ }
+
+ return CT_UPDATE_VALID;
+}
+
+static bool
+tcp_valid_new(struct dp_packet *pkt)
+{
+ struct tcp_header *tcp = dp_packet_l4(pkt);
+ uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+ if (tcp_invalid_flags(tcp_flags)) {
+ return false;
+ }
+
+ /* A syn+ack is not allowed to create a connection. We want to allow
+ * totally new connections (syn) or already established, not partially
+ * open (syn+ack). */
+ if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
+ return false;
+ }
+
+ return true;
+}
+
+static struct conn *
+tcp_new_conn(struct dp_packet *pkt, long long now)
+{
+ struct conn_tcp* newconn = NULL;
+ struct tcp_header *tcp = dp_packet_l4(pkt);
+ struct tcp_peer *src, *dst;
+ uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+ newconn = xzalloc(sizeof *newconn);
+
+ src = &newconn->peer[0];
+ dst = &newconn->peer[1];
+
+ src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+ src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;
+
+ if (tcp_flags & TCP_SYN) {
+ src->seqhi++;
+ src->wscale = tcp_get_wscale(tcp);
+ } else {
+ src->wscale = CT_WSCALE_UNKNOWN;
+ dst->wscale = CT_WSCALE_UNKNOWN;
+ }
+ src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
+ if (src->wscale & CT_WSCALE_MASK) {
+ /* Remove scale factor from initial window */
+ uint8_t sws = src->wscale & CT_WSCALE_MASK;
+ src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
+ }
+ if (tcp_flags & TCP_FIN) {
+ src->seqhi++;
+ }
+ dst->seqhi = 1;
+ dst->max_win = 1;
+ src->state = CT_DPIF_TCPS_SYN_SENT;
+ dst->state = CT_DPIF_TCPS_CLOSED;
+
+    update_expiration(&newconn->up, CT_TM_TCP_FIRST_PACKET, now);
+
+ return &newconn->up;
+}
+
+struct ct_l4_proto ct_proto_tcp = {
+ .new_conn = tcp_new_conn,
+ .valid_new = tcp_valid_new,
+ .conn_update = tcp_conn_update,
+};
new file mode 100644
@@ -0,0 +1,890 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "conntrack.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/icmp6.h>
+
+#include "bitmap.h"
+#include "conntrack-private.h"
+#include "coverage.h"
+#include "csum.h"
+#include "dp-packet.h"
+#include "flow.h"
+#include "netdev.h"
+#include "odp-netlink.h"
+#include "openvswitch/hmap.h"
+#include "openvswitch/vlog.h"
+#include "ovs-rcu.h"
+#include "random.h"
+#include "timeval.h"
+
+VLOG_DEFINE_THIS_MODULE(conntrack);
+
+COVERAGE_DEFINE(conntrack_full);
+
+struct conn_lookup_ctx {
+ struct conn_key key;
+ struct conn *conn;
+ uint32_t hash;
+ bool reply;
+ bool related;
+};
+
+static bool conn_key_extract(struct conntrack *, struct dp_packet *,
+ struct conn_lookup_ctx *, uint16_t zone);
+static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
+static void conn_key_reverse(struct conn_key *);
+static void conn_key_lookup(struct conntrack_bucket *ctb,
+ struct conn_lookup_ctx *ctx,
+ long long now);
+static bool valid_new(struct dp_packet *pkt, struct conn_key *);
+static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *,
+ long long now);
+static void delete_conn(struct conn *);
+static enum ct_update_res conn_update(struct conn *, struct dp_packet*,
+ bool reply, long long now);
+static bool conn_expired(struct conn *, long long now);
+static void set_mark(struct dp_packet *, struct conn *,
+ uint32_t val, uint32_t mask);
+static void set_label(struct dp_packet *, struct conn *,
+ const struct ovs_key_ct_labels *val,
+ const struct ovs_key_ct_labels *mask);
+
+static struct ct_l4_proto *l4_protos[] = {
+ [IPPROTO_TCP] = &ct_proto_tcp,
+ [IPPROTO_UDP] = &ct_proto_other,
+ [IPPROTO_ICMP] = &ct_proto_other,
+ [IPPROTO_ICMPV6] = &ct_proto_other,
+};
+
+long long ct_timeout_val[] = {
+#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
+ CT_TIMEOUTS
+#undef CT_TIMEOUT
+};
+
+/* If the total number of connections goes above this value, no new connections
+ * are accepted */
+#define DEFAULT_N_CONN_LIMIT 3000000
+
+/* Initializes the connection tracker 'ct'. The caller is responsible for
+ * calling 'conntrack_destroy()', when the instance is not needed anymore */
+void
+conntrack_init(struct conntrack *ct)
+{
+ unsigned i;
+
+ for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+ struct conntrack_bucket *ctb = &ct->buckets[i];
+
+ ct_lock_init(&ctb->lock);
+ ct_lock_lock(&ctb->lock);
+ hmap_init(&ctb->connections);
+ ct_lock_unlock(&ctb->lock);
+ }
+ ct->hash_basis = random_uint32();
+ atomic_count_init(&ct->n_conn, 0);
+ atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
+}
+
+/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
+void
+conntrack_destroy(struct conntrack *ct)
+{
+ unsigned i;
+
+ for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+ struct conntrack_bucket *ctb = &ct->buckets[i];
+ struct conn *conn;
+
+ ct_lock_lock(&ctb->lock);
+ HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
+ atomic_count_dec(&ct->n_conn);
+ delete_conn(conn);
+ }
+ hmap_destroy(&ctb->connections);
+ ct_lock_unlock(&ctb->lock);
+ ct_lock_destroy(&ctb->lock);
+ }
+}
+
+static unsigned hash_to_bucket(uint32_t hash)
+{
+ /* Extracts the most significant bits in hash. The least significant bits
+ * are already used internally by the hmap implementation. */
+ BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
+
+ return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
+}
+
+static void
+write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
+ uint32_t mark, ovs_u128 label)
+{
+ pkt->md.ct_state = state | CS_TRACKED;
+ pkt->md.ct_zone = zone;
+ pkt->md.ct_mark = mark;
+ pkt->md.ct_label = label;
+}
+
+static struct conn *
+conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
+ struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
+ long long now)
+{
+ unsigned bucket = hash_to_bucket(ctx->hash);
+ struct conn *nc = NULL;
+
+ if (!valid_new(pkt, &ctx->key)) {
+ *state |= CS_INVALID;
+ return nc;
+ }
+
+ *state |= CS_NEW;
+
+ if (commit) {
+ unsigned int n_conn_limit;
+
+ atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
+
+ if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
+ COVERAGE_INC(conntrack_full);
+ return nc;
+ }
+
+ nc = new_conn(pkt, &ctx->key, now);
+
+ memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
+
+ conn_key_reverse(&nc->rev_key);
+ hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
+ atomic_count_inc(&ct->n_conn);
+ }
+
+ return nc;
+}
+
+static struct conn *
+process_one(struct conntrack *ct, struct dp_packet *pkt,
+ struct conn_lookup_ctx *ctx, uint16_t zone,
+ bool commit, long long now)
+{
+ unsigned bucket = hash_to_bucket(ctx->hash);
+ struct conn *conn = ctx->conn;
+ uint16_t state = 0;
+
+ if (conn) {
+ if (ctx->related) {
+ state |= CS_RELATED;
+ if (ctx->reply) {
+ state |= CS_REPLY_DIR;
+ }
+ } else {
+ enum ct_update_res res;
+
+ res = conn_update(conn, pkt, ctx->reply, now);
+
+ switch (res) {
+ case CT_UPDATE_VALID:
+ state |= CS_ESTABLISHED;
+ if (ctx->reply) {
+ state |= CS_REPLY_DIR;
+ }
+ break;
+ case CT_UPDATE_INVALID:
+ state |= CS_INVALID;
+ break;
+ case CT_UPDATE_NEW:
+ hmap_remove(&ct->buckets[bucket].connections, &conn->node);
+ atomic_count_dec(&ct->n_conn);
+ delete_conn(conn);
+ conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
+ break;
+ default:
+ OVS_NOT_REACHED();
+ }
+ }
+ } else {
+ conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
+ }
+
+ write_ct_md(pkt, state, zone, conn ? conn->mark : 0,
+ conn ? conn->label : OVS_U128_ZERO);
+
+ return conn;
+}
+
+/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
+ * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
+ * the l3 and l4 offset properly set.
+ *
+ * If 'commit' is true, the packets are allowed to create new entries in the
+ * connection tables. 'setmark', if not NULL, should point to a two
+ * elements array containing a value and a mask to set the connection mark.
+ * 'setlabel' behaves similarly for the connection label.*/
+int
+conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
+ bool commit, uint16_t zone, const uint32_t *setmark,
+ const struct ovs_key_ct_labels *setlabel,
+ const char *helper)
+{
+ struct dp_packet **pkts = pkt_batch->packets;
+ size_t cnt = pkt_batch->count;
+#if !defined(__CHECKER__) && !defined(_WIN32)
+ const size_t KEY_ARRAY_SIZE = cnt;
+#else
+ enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
+#endif
+ struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
+ int8_t bucket_list[CONNTRACK_BUCKETS];
+ struct {
+ unsigned bucket;
+ unsigned long maps;
+ } arr[KEY_ARRAY_SIZE];
+ long long now = time_msec();
+ size_t i = 0;
+ uint8_t arrcnt = 0;
+
+ BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);
+
+ if (helper) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+ VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
+ /* Continue without the helper */
+ }
+
+ memset(bucket_list, INT8_C(-1), sizeof bucket_list);
+ for (i = 0; i < cnt; i++) {
+ unsigned bucket;
+
+ if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {
+ write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
+ continue;
+ }
+
+ bucket = hash_to_bucket(ctxs[i].hash);
+ if (bucket_list[bucket] == INT8_C(-1)) {
+ bucket_list[bucket] = arrcnt;
+
+ arr[arrcnt].maps = 0;
+ ULLONG_SET1(arr[arrcnt].maps, i);
+ arr[arrcnt++].bucket = bucket;
+ } else {
+            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
+ }
+ }
+
+ for (i = 0; i < arrcnt; i++) {
+ struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
+ size_t j;
+
+ ct_lock_lock(&ctb->lock);
+
+ ULLONG_FOR_EACH_1(j, arr[i].maps) {
+ struct conn *conn;
+
+ conn_key_lookup(ctb, &ctxs[j], now);
+
+ conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);
+
+ if (conn && setmark) {
+ set_mark(pkts[j], conn, setmark[0], setmark[1]);
+ }
+
+ if (conn && setlabel) {
+ set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
+ }
+ }
+ ct_lock_unlock(&ctb->lock);
+ }
+
+ return 0;
+}
+
+static void
+set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
+{
+ pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
+ conn->mark = pkt->md.ct_mark;
+}
+
+static void
+set_label(struct dp_packet *pkt, struct conn *conn,
+ const struct ovs_key_ct_labels *val,
+ const struct ovs_key_ct_labels *mask)
+{
+ ovs_u128 v, m;
+
+ memcpy(&v, val, sizeof v);
+ memcpy(&m, mask, sizeof m);
+
+ pkt->md.ct_label.u64.lo = v.u64.lo
+ | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
+ pkt->md.ct_label.u64.hi = v.u64.hi
+ | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
+ conn->label = pkt->md.ct_label;
+}
+
+/* Key extraction */
+
+/* The function stores a pointer to the first byte after the header in
+ * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
+ * not interested in the header's tail, meaning that the header has
+ * already been parsed (e.g. by flow_extract): we take this as a hint to
+ * save a few checks. If 'validate_checksum' is true, the function returns
+ * false if the IPv4 checksum is invalid. */
+static inline bool
+extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
+ const char **new_data, bool validate_checksum)
+{
+ const struct ip_header *ip = data;
+ size_t ip_len;
+
+ if (new_data) {
+ if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
+ return false;
+ }
+ }
+
+ ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
+
+ if (new_data) {
+ if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
+ return false;
+ }
+ if (OVS_UNLIKELY(size < ip_len)) {
+ return false;
+ }
+
+ *new_data = (char *) data + ip_len;
+ }
+
+ if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
+ return false;
+ }
+
+ if (validate_checksum && csum(data, ip_len) != 0) {
+ return false;
+ }
+
+ key->src.addr.ipv4 = ip->ip_src;
+ key->dst.addr.ipv4 = ip->ip_dst;
+ key->nw_proto = ip->ip_proto;
+
+ return true;
+}
+
+/* The function stores a pointer to the first byte after the header in
+ * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
+ * not interested in the header's tail, meaning that the header has
+ * already been parsed (e.g. by flow_extract): we take this as a hint to
+ * save a few checks. */
+static inline bool
+extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
+ const char **new_data)
+{
+ const struct ovs_16aligned_ip6_hdr *ip6 = data;
+ uint8_t nw_proto = ip6->ip6_nxt;
+ uint8_t nw_frag = 0;
+
+ if (new_data) {
+ if (OVS_UNLIKELY(size < sizeof *ip6)) {
+ return false;
+ }
+ }
+
+ data = ip6 + 1;
+ size -= sizeof *ip6;
+
+ if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
+ return false;
+ }
+
+ if (new_data) {
+ *new_data = data;
+ }
+
+ if (nw_frag) {
+ return false;
+ }
+
+ key->src.addr.ipv6 = ip6->ip6_src;
+ key->dst.addr.ipv6 = ip6->ip6_dst;
+ key->nw_proto = nw_proto;
+
+ return true;
+}
+
+static inline bool
+checksum_valid(const struct conn_key *key, const void *data, size_t size,
+ const void *l3)
+{
+ uint32_t csum = 0;
+
+ if (key->dl_type == htons(ETH_TYPE_IP)) {
+ csum = packet_csum_pseudoheader(l3);
+ } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
+ csum = packet_csum_pseudoheader6(l3);
+ } else {
+ return false;
+ }
+
+ csum = csum_continue(csum, data, size);
+
+ return csum_finish(csum) == 0;
+}
+
+static inline bool
+check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
+ const void *l3)
+{
+ const struct tcp_header *tcp = data;
+ size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
+
+ if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
+ return false;
+ }
+
+ return checksum_valid(key, data, size, l3);
+}
+
+/* Validates the UDP length field and checksum of the l4 data at 'data'
+ * ('size' bytes), using the l3 header at 'l3' for the pseudo-header. */
+static inline bool
+check_l4_udp(const struct conn_key *key, const void *data, size_t size,
+ const void *l3)
+{
+ const struct udp_header *udp = data;
+ size_t udp_len;
+
+ /* Make sure the fixed header is present before reading 'udp_len'. */
+ if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
+ return false;
+ }
+
+ udp_len = ntohs(udp->udp_len);
+ if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
+ return false;
+ }
+
+ /* Validation must be skipped if checksum is 0 on IPv4 packets */
+ return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
+ || checksum_valid(key, data, size, l3);
+}
+
+/* Validates the ICMPv4 checksum: a plain one's-complement sum over the
+ * whole message, with no pseudo-header (unlike TCP/UDP/ICMPv6). */
+static inline bool
+check_l4_icmp(const void *data, size_t size)
+{
+ return csum(data, size) == 0;
+}
+
+/* Validates the ICMPv6 checksum, which (unlike ICMPv4) covers a
+ * pseudo-header derived from the IPv6 header at 'l3'. */
+static inline bool
+check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
+ const void *l3)
+{
+ return checksum_valid(key, data, size, l3);
+}
+
+/* Extracts the TCP source and destination ports from 'data' into 'key'.
+ * Rejects truncated headers and either port being 0. */
+static inline bool
+extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
+{
+ const struct tcp_header *tcp = data;
+
+ if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
+ return false;
+ }
+
+ key->src.port = tcp->tcp_src;
+ key->dst.port = tcp->tcp_dst;
+
+ /* Port 0 is invalid */
+ return key->src.port && key->dst.port;
+}
+
+/* Extracts the UDP source and destination ports from 'data' into 'key'.
+ * Rejects truncated headers and either port being 0. */
+static inline bool
+extract_l4_udp(struct conn_key *key, const void *data, size_t size)
+{
+ const struct udp_header *udp = data;
+
+ if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
+ return false;
+ }
+
+ key->src.port = udp->udp_src;
+ key->dst.port = udp->udp_dst;
+
+ /* Port 0 is invalid */
+ return key->src.port && key->dst.port;
+}
+
+static inline bool extract_l4(struct conn_key *key, const void *data,
+ size_t size, bool *related, const void *l3);
+
+/* If 'related' is not NULL and the function is processing an ICMP
+ * error packet, extract the l3 and l4 fields from the nested header
+ * instead and set *related to true. If 'related' is NULL we're
+ * already processing a nested header and no such recursion is
+ * possible */
+static inline bool
+extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
+ bool *related)
+{
+ const struct icmp_header *icmp = data;
+
+ if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
+ return false;
+ }
+
+ switch (icmp->icmp_type) {
+ case ICMP4_ECHO_REQUEST:
+ case ICMP4_ECHO_REPLY:
+ case ICMP4_TIMESTAMP:
+ case ICMP4_TIMESTAMPREPLY:
+ case ICMP4_INFOREQUEST:
+ case ICMP4_INFOREPLY:
+ /* Separate ICMP connection: identified using id */
+ key->src.port = key->dst.port = icmp->icmp_fields.echo.id;
+ break;
+ case ICMP4_DST_UNREACH:
+ case ICMP4_TIME_EXCEEDED:
+ case ICMP4_PARAM_PROB:
+ case ICMP4_SOURCEQUENCH:
+ case ICMP4_REDIRECT: {
+ /* ICMP packet part of another connection. We should
+ * extract the key from embedded packet header */
+ struct conn_key inner_key;
+ const char *l3 = (const char *) (icmp + 1);
+ const char *tail = (const char *) data + size;
+ const char *l4;
+ bool ok;
+
+ if (!related) {
+ return false;
+ }
+
+ memset(&inner_key, 0, sizeof inner_key);
+ inner_key.dl_type = htons(ETH_TYPE_IP);
+ ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
+ if (!ok) {
+ return false;
+ }
+
+ /* pf doesn't do this, but it seems a good idea */
+ if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
+ || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
+ return false;
+ }
+
+ key->src = inner_key.src;
+ key->dst = inner_key.dst;
+ key->nw_proto = inner_key.nw_proto;
+
+ ok = extract_l4(key, l4, tail - l4, NULL, l3);
+ if (ok) {
+ /* The embedded header describes the original direction, so
+ * reverse it to match the direction of the error packet. */
+ conn_key_reverse(key);
+ *related = true;
+ }
+ return ok;
+ }
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+/* If 'related' is not NULL and the function is processing an ICMP
+ * error packet, extract the l3 and l4 fields from the nested header
+ * instead and set *related to true. If 'related' is NULL we're
+ * already processing a nested header and no such recursion is
+ * possible */
+static inline bool
+extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
+ bool *related)
+{
+ const struct icmp6_header *icmp6 = data;
+
+ /* All the messages that we support need at least 4 bytes after
+ * the header */
+ if (size < sizeof *icmp6 + 4) {
+ return false;
+ }
+
+ switch (icmp6->icmp6_type) {
+ case ICMP6_ECHO_REQUEST:
+ case ICMP6_ECHO_REPLY:
+ /* Separate ICMP connection: identified using id */
+ /* NOTE(review): this reads an ovs_be16 right past the 4-byte
+ * header -- confirm the l4 data is at least 2-byte aligned. */
+ key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);
+ break;
+ case ICMP6_DST_UNREACH:
+ case ICMP6_PACKET_TOO_BIG:
+ case ICMP6_TIME_EXCEEDED:
+ case ICMP6_PARAM_PROB: {
+ /* ICMP packet part of another connection. We should
+ * extract the key from embedded packet header */
+ struct conn_key inner_key;
+ /* The embedded packet starts 8 bytes in: the 4-byte ICMPv6
+ * header plus the 4-byte type-specific field. */
+ const char *l3 = (const char *) icmp6 + 8;
+ const char *tail = (const char *) data + size;
+ const char *l4 = NULL;
+ bool ok;
+
+ if (!related) {
+ return false;
+ }
+
+ memset(&inner_key, 0, sizeof inner_key);
+ inner_key.dl_type = htons(ETH_TYPE_IPV6);
+ ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
+ if (!ok) {
+ return false;
+ }
+
+ /* pf doesn't do this, but it seems a good idea */
+ if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
+ &key->dst.addr.ipv6_aligned)
+ || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
+ &key->src.addr.ipv6_aligned)) {
+ return false;
+ }
+
+ key->src = inner_key.src;
+ key->dst = inner_key.dst;
+ key->nw_proto = inner_key.nw_proto;
+
+ ok = extract_l4(key, l4, tail - l4, NULL, l3);
+ if (ok) {
+ /* The embedded header is in the original direction; flip it
+ * to match the direction of the error packet itself. */
+ conn_key_reverse(key);
+ *related = true;
+ }
+ return ok;
+ }
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+/* Extract l4 fields into 'key', which must already contain valid l3
+ * members.
+ *
+ * If 'related' is not NULL and an ICMP error packet is being
+ * processed, the function will extract the key from the packet nested
+ * in the ICMP payload and set '*related' to true.
+ *
+ * If 'related' is NULL, it means that we're already parsing a header nested
+ * in an ICMP error. In this case, we skip checksum and length validation. */
+static inline bool
+extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
+ const void *l3)
+{
+ if (key->nw_proto == IPPROTO_TCP) {
+ return (!related || check_l4_tcp(key, data, size, l3))
+ && extract_l4_tcp(key, data, size);
+ } else if (key->nw_proto == IPPROTO_UDP) {
+ return (!related || check_l4_udp(key, data, size, l3))
+ && extract_l4_udp(key, data, size);
+ } else if (key->dl_type == htons(ETH_TYPE_IP)
+ && key->nw_proto == IPPROTO_ICMP) {
+ return (!related || check_l4_icmp(data, size))
+ && extract_l4_icmp(key, data, size, related);
+ } else if (key->dl_type == htons(ETH_TYPE_IPV6)
+ && key->nw_proto == IPPROTO_ICMPV6) {
+ return (!related || check_l4_icmp6(key, data, size, l3))
+ && extract_l4_icmp6(key, data, size, related);
+ } else {
+ /* Unsupported l4 protocol: the packet cannot be tracked. */
+ return false;
+ }
+}
+
+/* Fills 'ctx' from 'pkt': extracts the connection key (l3 addresses,
+ * l4 ports, l3/l4 types, 'zone') into 'ctx->key', computes 'ctx->hash'
+ * and sets 'ctx->related' for ICMP error packets. '*ctx' is zeroed
+ * first. Returns false if the packet cannot be tracked. */
+static bool
+conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,
+ struct conn_lookup_ctx *ctx, uint16_t zone)
+{
+ const struct eth_header *l2 = dp_packet_l2(pkt);
+ const struct ip_header *l3 = dp_packet_l3(pkt);
+ const char *l4 = dp_packet_l4(pkt);
+ const char *tail = dp_packet_tail(pkt);
+ bool ok;
+
+ memset(ctx, 0, sizeof *ctx);
+
+ if (!l2 || !l3 || !l4) {
+ return false;
+ }
+
+ ctx->key.zone = zone;
+
+ /* XXX In this function we parse the packet (again, it has already
+ * gone through miniflow_extract()) for two reasons:
+ *
+ * 1) To extract the l3 addresses and l4 ports.
+ * We already have the l3 and l4 headers' pointers. Extracting
+ * the l3 addresses and the l4 ports is really cheap, since they
+ * can be found at fixed locations.
+ * 2) To extract the l3 and l4 types.
+ * Extracting the l3 and l4 types (especially the l3[1]) on the
+ * other hand is quite expensive, because they're not at a
+ * fixed location.
+ *
+ * Here's a way to avoid (2) with the help of the datapath.
+ * The datapath doesn't keep the packet's extracted flow[2], so
+ * using that is not an option. We could use the packet's matching
+ * megaflow for l3 type (it's always unwildcarded), and for l4 type
+ * (we have to unwildcard it first). This means either:
+ *
+ * a) dpif-netdev passes the matching megaflow to dp_execute_cb(), which
+ * is used to extract the l3 type. Unfortunately, dp_execute_cb() is
+ * used also in dpif_netdev_execute(), which doesn't have a matching
+ * megaflow.
+ *
+ * b) We define an alternative OVS_ACTION_ATTR_CT, used only by the
+ * userspace datapath, which includes l3 (and l4) type. The
+ * alternative action could be generated by ofproto-dpif specifically
+ * for the userspace datapath. Having a different interface for
+ * userspace and kernel doesn't seem very clean, though.
+ *
+ * ---
+ * [1] A simple benchmark (running only the connection tracker
+ * over and over on the same packets) shows that if the
+ * l3 type is already provided we are 15% faster (running the
+ * connection tracker over a couple of DPDK devices with a
+ * stream of UDP 64-bytes packets shows that we are 4% faster).
+ *
+ * [2] The reasons for this are that keeping the flow increases
+ * (slightly) the cache footprint and increases computation
+ * time as we move the packet around. Most importantly, the flow
+ * should be updated by the actions and this can be slow, as
+ * we use a sparse representation (miniflow).
+ *
+ */
+ ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);
+ if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
+ ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
+ } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
+ ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
+ } else {
+ ok = false;
+ }
+
+ if (ok) {
+ if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
+ ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/* Symmetric: a key and its reverse (src/dst swapped) hash to the same
+ * value, so both directions of a connection land in the same bucket.
+ * NOTE(review): the key is walked as an array of uint32_t, which
+ * assumes 'struct conn_key' has no padding and that 'src' and 'dst'
+ * come first -- confirm against conntrack-private.h. */
+static uint32_t
+conn_key_hash(const struct conn_key *key, uint32_t basis)
+{
+ uint32_t hsrc, hdst, hash;
+ int i;
+
+ hsrc = hdst = basis;
+
+ /* Hash the source and destination tuple */
+ for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
+ hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
+ hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
+ }
+
+ /* Even if source and destination are swapped the hash will be the same. */
+ hash = hsrc ^ hdst;
+
+ /* Hash the rest of the key(L3 and L4 types and zone). */
+ hash = hash_words((uint32_t *) &key->dst + 1,
+ (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
+ hash);
+
+ return hash;
+}
+
+/* Swaps the source and destination endpoints of 'key' in place, turning
+ * a key into the key of the opposite direction. */
+static void
+conn_key_reverse(struct conn_key *key)
+{
+ struct ct_endpoint tmp;
+ tmp = key->src;
+ key->src = key->dst;
+ key->dst = tmp;
+}
+
+/* Searches bucket 'ctb' for a non-expired connection matching
+ * 'ctx->key' in either direction. On a hit, points 'ctx->conn' at the
+ * connection and sets 'ctx->reply' to true if the key matched the
+ * reverse direction; on a miss 'ctx->conn' is left NULL. */
+static void
+conn_key_lookup(struct conntrack_bucket *ctb,
+ struct conn_lookup_ctx *ctx,
+ long long now)
+{
+ uint32_t hash = ctx->hash;
+ struct conn *conn;
+
+ ctx->conn = NULL;
+
+ HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
+ if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))
+ && !conn_expired(conn, now)) {
+ ctx->conn = conn;
+ ctx->reply = false;
+ break;
+ }
+ if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))
+ && !conn_expired(conn, now)) {
+ ctx->conn = conn;
+ ctx->reply = true;
+ break;
+ }
+ }
+}
+
+/* Updates 'conn' with 'pkt' by dispatching to the per-L4-protocol
+ * handler selected by the connection's nw_proto. */
+static enum ct_update_res
+conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,
+ long long now)
+{
+ return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply, now);
+}
+
+/* Returns true if 'conn' has reached its expiration time at 'now'. */
+static bool
+conn_expired(struct conn *conn, long long now)
+{
+ return now >= conn->expiration;
+}
+
+/* Asks the L4 protocol module whether 'pkt' may legitimately start a
+ * new connection with 'key'. */
+static bool
+valid_new(struct dp_packet *pkt, struct conn_key *key)
+{
+ return l4_protos[key->nw_proto]->valid_new(pkt);
+}
+
+/* Creates a new connection for 'pkt' via the L4 protocol module and
+ * stores 'key' in it. Returns NULL if the module refuses to create a
+ * connection for this packet. The caller owns the returned connection
+ * and must eventually release it with delete_conn(). */
+static struct conn *
+new_conn(struct dp_packet *pkt, struct conn_key *key, long long now)
+{
+ struct conn *newconn;
+
+ newconn = l4_protos[key->nw_proto]->new_conn(pkt, now);
+
+ if (newconn) {
+ newconn->key = *key;
+ }
+
+ return newconn;
+}
+
+/* Frees a connection allocated by new_conn(). */
+static void
+delete_conn(struct conn *conn)
+{
+ free(conn);
+}
new file mode 100644
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CONNTRACK_H
+#define CONNTRACK_H 1
+
+#include <stdbool.h>
+
+#include "odp-netlink.h"
+#include "openvswitch/hmap.h"
+#include "openvswitch/thread.h"
+#include "openvswitch/types.h"
+#include "ovs-atomic.h"
+
+/* Userspace connection tracker
+ * ============================
+ *
+ * This is a connection tracking module that keeps all the state in userspace.
+ *
+ * Usage
+ * =====
+ *
+ * struct conntrack ct;
+ *
+ * Initialization:
+ *
+ * conntrack_init(&ct);
+ *
+ * It is necessary to periodically issue a call to
+ *
+ * conntrack_run(&ct);
+ *
+ * to allow the module to clean up expired connections.
+ *
+ * To send a group of packets through the connection tracker:
+ *
+ * conntrack_execute(&ct, pkts, n_pkts, ...);
+ *
+ * Thread-safety
+ * =============
+ *
+ * conntrack_execute() can be called by multiple threads simultaneously.
+ */
+
+struct dp_packet_batch;
+
+struct conntrack;
+
+void conntrack_init(struct conntrack *);
+void conntrack_run(struct conntrack *);
+void conntrack_destroy(struct conntrack *);
+
+int conntrack_execute(struct conntrack *, struct dp_packet_batch *,
+ bool commit, uint16_t zone, const uint32_t *setmark,
+ const struct ovs_key_ct_labels *setlabel,
+ const char *helper);
+
+/* 'struct ct_lock' is a wrapper for an adaptive mutex. It's useful to try
+ * different types of locks (e.g. spinlocks) */
+
+struct OVS_LOCKABLE ct_lock {
+ struct ovs_mutex lock;
+};
+
+/* Initializes 'lock' as an adaptive mutex. */
+static inline void ct_lock_init(struct ct_lock *lock)
+{
+ ovs_mutex_init_adaptive(&lock->lock);
+}
+
+/* Acquires 'lock'. Thread-safety analysis is disabled because the
+ * wrapped ovs_mutex carries its own annotations. */
+static inline void ct_lock_lock(struct ct_lock *lock)
+ OVS_ACQUIRES(lock)
+ OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+ ovs_mutex_lock(&lock->lock);
+}
+
+/* Releases 'lock'. */
+static inline void ct_lock_unlock(struct ct_lock *lock)
+ OVS_RELEASES(lock)
+ OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+ ovs_mutex_unlock(&lock->lock);
+}
+
+/* Destroys 'lock', which must be unlocked. */
+static inline void ct_lock_destroy(struct ct_lock *lock)
+{
+ ovs_mutex_destroy(&lock->lock);
+}
+
+/* Timeouts: all the possible timeout states passed to update_expiration()
+ * are listed here. Each name will be prefixed with CT_TM_ and each value
+ * is in milliseconds */
+#define CT_TIMEOUTS \
+ CT_TIMEOUT(TCP_FIRST_PACKET, 30 * 1000) \
+ CT_TIMEOUT(TCP_OPENING, 30 * 1000) \
+ CT_TIMEOUT(TCP_ESTABLISHED, 24 * 60 * 60 * 1000) \
+ CT_TIMEOUT(TCP_CLOSING, 15 * 60 * 1000) \
+ CT_TIMEOUT(TCP_FIN_WAIT, 45 * 1000) \
+ CT_TIMEOUT(TCP_CLOSED, 30 * 1000) \
+ CT_TIMEOUT(OTHER_FIRST, 60 * 1000) \
+ CT_TIMEOUT(OTHER_MULTIPLE, 60 * 1000) \
+ CT_TIMEOUT(OTHER_BIDIR, 30 * 1000) \
+
+enum ct_timeout {
+#define CT_TIMEOUT(NAME, VALUE) CT_TM_##NAME,
+ CT_TIMEOUTS
+#undef CT_TIMEOUT
+ /* Number of timeout classes. */
+ N_CT_TM
+};
+
+/* Locking:
+ *
+ * The connections are kept in different buckets, which are completely
+ * independent. The connection bucket is determined by the hash of its key.
+ */
+struct conntrack_bucket {
+ /* Protects 'connections'. */
+ struct ct_lock lock;
+ /* Contains 'struct conn's, hashed by connection key. */
+ struct hmap connections OVS_GUARDED;
+};
+
+#define CONNTRACK_BUCKETS_SHIFT 8
+/* Number of independent buckets (a power of two). */
+#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
+
+struct conntrack {
+ /* Independent buckets containing the connections */
+ struct conntrack_bucket buckets[CONNTRACK_BUCKETS];
+
+ /* Salt for hashing a connection key. */
+ uint32_t hash_basis;
+
+ /* Number of connections currently in the connection tracker.
+ * Updated atomically, so it can be read without the bucket locks. */
+ atomic_count n_conn;
+ /* Connections limit. When this limit is reached, no new connection
+ * will be accepted. */
+ atomic_uint n_conn_limit;
+};
+
+#endif /* conntrack.h */
@@ -70,6 +70,15 @@ ovs_prefetch_range(const void *start, size_t size)
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif
+/* Comparisons for ints with modular arithmetic: the difference is taken
+ * modulo the integer width, so the ordering stays meaningful across a
+ * counter wraparound (serial-number arithmetic, cf. RFC 1982).
+ * NOTE(review): converting the out-of-range unsigned difference to int
+ * is implementation-defined before C23 -- confirm all supported
+ * compilers wrap as two's complement. */
+#define INT_MOD_LT(a,b) ((int) ((a)-(b)) < 0)
+#define INT_MOD_LEQ(a,b) ((int) ((a)-(b)) <= 0)
+#define INT_MOD_GT(a,b) ((int) ((a)-(b)) > 0)
+#define INT_MOD_GEQ(a,b) ((int) ((a)-(b)) >= 0)
+
+#define INT_MOD_MIN(a, b) ((INT_MOD_LT(a, b)) ? (a) : (b))
+#define INT_MOD_MAX(a, b) ((INT_MOD_GT(a, b)) ? (a) : (b))
+
#define OVS_NOT_REACHED() abort()
/* Use "%"PRIuSIZE to format size_t with printf(). */